From 44c11faabfe2468eeb5acad6f02cf9bb1766d9bd Mon Sep 17 00:00:00 2001 From: Ma0 Date: Fri, 24 Nov 2017 17:23:59 +0100 Subject: [PATCH 01/51] nits: remove unnecessary parentheses --- source/common/lowres.h | 2 +- source/encoder/sao.h | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/source/common/lowres.h b/source/common/lowres.h index 4cb4d00fcb..072739e80e 100644 --- a/source/common/lowres.h +++ b/source/common/lowres.h @@ -129,7 +129,7 @@ struct Lowres : public ReferencePlanes uint8_t* intraMode; int64_t satdCost; uint16_t* lowresCostForRc; - uint16_t(*lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]); + uint16_t* lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; int32_t* lowresMvCosts[2][X265_BFRAME_MAX + 1]; MV* lowresMvs[2][X265_BFRAME_MAX + 1]; uint32_t maxBlocksInRow; diff --git a/source/encoder/sao.h b/source/encoder/sao.h index a62305ec35..258b4dd035 100644 --- a/source/encoder/sao.h +++ b/source/encoder/sao.h @@ -55,12 +55,9 @@ class SAO enum { NUM_EDGETYPE = 5 }; enum { NUM_PLANE = 3 }; enum { SAO_DEPTHRATE_SIZE = 4 }; - static const uint32_t s_eoTable[NUM_EDGETYPE]; - - typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]); - typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]); - + typedef int32_t PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]; + typedef int32_t PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]; protected: /* allocated per part */ From d42a261238880fdd1c2f3ddb9b8333ad8900b6c6 Mon Sep 17 00:00:00 2001 From: Vignesh Vijayakumar Date: Thu, 2 Nov 2017 09:39:58 +0530 Subject: [PATCH 02/51] x86: Modify asm codes for NASM compatibility --- source/common/x86/blockcopy8.asm | 8 +-- source/common/x86/intrapred8.asm | 2 +- source/common/x86/ipfilter16.asm | 30 +++++------ source/common/x86/ipfilter8.asm | 86 +++++++++++++++---------------- source/common/x86/loopfilter.asm | 48 ++++++++--------- source/common/x86/mc-a.asm | 34 ++++++------ source/common/x86/pixel-util8.asm | 
8 +-- source/common/x86/sad-a.asm | 18 +++---- source/common/x86/seaintegral.asm | 24 ++++----- 9 files changed, 129 insertions(+), 129 deletions(-) diff --git a/source/common/x86/blockcopy8.asm b/source/common/x86/blockcopy8.asm index 8c77f293ee..cb30484a27 100644 --- a/source/common/x86/blockcopy8.asm +++ b/source/common/x86/blockcopy8.asm @@ -3850,7 +3850,7 @@ cglobal blockcopy_ss_%1x%2, 4, 5, 6 mov r4d, %2/4 add r1, r1 add r3, r3 -.loop +.loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] @@ -3905,7 +3905,7 @@ cglobal blockcopy_ss_%1x%2, 4, 7, 2 lea r5, [3 * r3] lea r6, [3 * r1] -.loop +.loop: movu m0, [r2] movu xm1, [r2 + 32] movu [r0], m0 @@ -5085,7 +5085,7 @@ cglobal copy_cnt_16, 3,4,6 pxor m4, m4 pxor m5, m5 -.loop +.loop: ; row 0 movu m0, [r1] movu m1, [r1 + 16] @@ -5196,7 +5196,7 @@ cglobal copy_cnt_32, 3,4,6 pxor m4, m4 pxor m5, m5 -.loop +.loop: ; row 0 movu m0, [r1] movu m1, [r1 + 16] diff --git a/source/common/x86/intrapred8.asm b/source/common/x86/intrapred8.asm index 34db950575..61a856190d 100644 --- a/source/common/x86/intrapred8.asm +++ b/source/common/x86/intrapred8.asm @@ -2148,7 +2148,7 @@ cglobal intra_pred_ang4_26, 3,4,4 paddw m0, m1 packuswb m0, m0 - movd r2, m0 + movd r2d, m0 mov [r0], r2b shr r2, 8 mov [r0 + r1], r2b diff --git a/source/common/x86/ipfilter16.asm b/source/common/x86/ipfilter16.asm index f9e415d820..d44cfd9edb 100644 --- a/source/common/x86/ipfilter16.asm +++ b/source/common/x86/ipfilter16.asm @@ -9103,7 +9103,7 @@ cglobal filterPixelToShort_6x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -9156,7 +9156,7 @@ cglobal filterPixelToShort_8x%1, 3, 7, 2 ; load constant mova m1, [pw_2000] -.loop +.loop: movu m0, [r0] psllw m0, (14 - BIT_DEPTH) psubw m0, m1 @@ -9277,7 +9277,7 @@ cglobal filterPixelToShort_16x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -9351,7 +9351,7 
@@ cglobal filterPixelToShort_16x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -9405,7 +9405,7 @@ cglobal filterPixelToShort_32x%1, 3, 7, 5 ; load constant mova m4, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] @@ -9510,7 +9510,7 @@ cglobal filterPixelToShort_32x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -9583,7 +9583,7 @@ cglobal filterPixelToShort_64x%1, 3, 7, 5 ; load constant mova m4, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] @@ -9758,7 +9758,7 @@ cglobal filterPixelToShort_64x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -9869,7 +9869,7 @@ cglobal filterPixelToShort_24x%1, 3, 7, 5 ; load constant mova m4, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] @@ -9952,7 +9952,7 @@ cglobal filterPixelToShort_24x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + 32] psllw m0, (14 - BIT_DEPTH) @@ -10017,7 +10017,7 @@ cglobal filterPixelToShort_12x%1, 3, 7, 3 ; load constant mova m2, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] psllw m0, (14 - BIT_DEPTH) @@ -10081,7 +10081,7 @@ cglobal filterPixelToShort_48x64, 3, 7, 5 ; load constant mova m4, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + r1] movu m2, [r0 + r1 * 2] @@ -10214,7 +10214,7 @@ cglobal filterPixelToShort_48x64, 3, 7, 4 ; load constant mova m3, [pw_2000] -.loop +.loop: movu m0, [r0] movu m1, [r0 + 32] movu m2, [r0 + 64] @@ -10314,7 +10314,7 @@ cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 .preloop: lea r6, [r3 * 3] -.loop +.loop: ; Row 0 movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] movu xm4, [r0 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] @@ -10381,7 +10381,7 @@ cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 packssdw 
xm4, xm4 movq [r2], xm3 ;row 0 -.end +.end: RET %endif %endmacro diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index e4f224f9be..36d8986b5a 100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -324,7 +324,7 @@ cextern pw_8192 paddw m0, m5 psraw m0, 6 packuswb m0, m0 - movd r4, m0 + movd r4d, m0 mov [dstq], r4w shr r4, 16 mov [dstq + dststrideq], r4w @@ -3471,7 +3471,7 @@ RET phaddw %2, %2 pmulhrsw %2, %3 packuswb %2, %2 - movd r4, %2 + movd r4d, %2 mov [dstq], r4w shr r4, 16 mov [dstq + dststrideq], r4w @@ -5336,7 +5336,7 @@ cglobal interp_4tap_horiz_ps_64x%1, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 @@ -5441,7 +5441,7 @@ cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 .preloop: lea r6, [r3 * 3] -.loop +.loop: ; Row 0-1 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 ; shuffled based on the col order tab_Lm @@ -5502,7 +5502,7 @@ cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 movq [r2], xm3 movhps [r2 + r3], xm3 movq [r2 + r3 * 2], xm4 -.end +.end: RET %endif %endmacro @@ -5592,7 +5592,7 @@ cglobal interp_8tap_horiz_ps_8x%1, 4,7,6 paddw xm1, xm2 psubw xm1, xm0 movu [r2], xm1 ;row 0 -.end +.end: RET %endif %endmacro ; IPFILTER_LUMA_PS_8xN_AVX2 @@ -5634,7 +5634,7 @@ cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7 sub r0, r8 ; r0(src)-r8 add r9, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) -.label +.label: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) @@ -12374,7 +12374,7 @@ cglobal filterPixelToShort_8x%1, 3, 7, 6 mova m4, [pb_128] mova m5, [tab_c_64_n64] -.loop +.loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 @@ -25491,7 +25491,7 @@ cglobal interp_4tap_horiz_ps_32x32, 4,6,8 sub r0, r1 add r4d, 3 -.loop +.loop: ; Row 0 movu m2, [r0] movu m3, [r0 + 1] @@ -25553,7 +25553,7 @@ cglobal interp_4tap_horiz_ps_16x16, 4,7,6 
sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 @@ -25607,7 +25607,7 @@ cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 @@ -25670,7 +25670,7 @@ cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 @@ -25743,7 +25743,7 @@ cglobal interp_4tap_horiz_ps_4x4, 4,7,5 je .label sub r0 , r1 -.label +.label: ; Row 0-1 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 @@ -25795,7 +25795,7 @@ cglobal interp_4tap_horiz_ps_4x4, 4,7,5 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 -.end +.end: RET cglobal interp_4tap_horiz_ps_4x2, 4,7,5 @@ -25823,7 +25823,7 @@ cglobal interp_4tap_horiz_ps_4x2, 4,7,5 je .label sub r0 , r1 -.label +.label: ; Row 0-1 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 @@ -25864,7 +25864,7 @@ cglobal interp_4tap_horiz_ps_4x2, 4,7,5 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 -.end +.end: RET ;----------------------------------------------------------------------------------------------------------------------------- @@ -25899,7 +25899,7 @@ cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 sub r0 , r1 -.loop +.loop: sub r4d, 4 ; Row 0-1 movu xm3, [r0] @@ -25955,7 +25955,7 @@ cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 -.end +.end: RET %endmacro @@ -25993,7 +25993,7 @@ cglobal interp_4tap_horiz_ps_8x8, 4,7,6 sub r0 , r1 add r6d , 1 -.loop +.loop: dec r6d ; Row 0 vbroadcasti128 m3, [r0] @@ -26032,7 +26032,7 @@ cglobal interp_4tap_horiz_ps_8x8, 4,7,6 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], xm3 -.end +.end: RET INIT_YMM avx2 @@ -26237,7 +26237,7 @@ cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 dec r0 -.loop +.loop: sub r4d, 4 ; Row 0-1 movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] @@ -26306,9 +26306,9 @@ cglobal interp_8tap_horiz_ps_%1x%2, 4, 7, 8 sub r0, r6 
add r4d, 7 -.label +.label: lea r6, [pw_2000] -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) @@ -26405,9 +26405,9 @@ cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8 sub r0, r6 ; r0(src)-r6 add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) -.label +.label: lea r6, [interp8_hps_shuf] -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) @@ -26736,9 +26736,9 @@ cglobal interp_8tap_horiz_ps_64x%1, 4, 7, 8 sub r0, r6 ; r0(src)-r6 add r4d, 7 ; blkheight += N - 1 -.label +.label: lea r6, [pw_2000] -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) @@ -26880,7 +26880,7 @@ cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 sub r0 , r1 inc r6d -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 @@ -26915,7 +26915,7 @@ cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], xm3 -.end +.end: RET %endmacro @@ -26945,7 +26945,7 @@ cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 jz .label sub r0, r1 -.label +.label: lea r6, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] @@ -26985,7 +26985,7 @@ cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 -.end +.end: RET INIT_YMM avx2 @@ -27005,7 +27005,7 @@ cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 jz .label sub r0, r1 -.label +.label: mova m4, [interp4_hpp_shuf] mova m5, [pw_1] dec r0 @@ -27062,7 +27062,7 @@ cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 -.end +.end: RET INIT_YMM avx2 @@ -27217,7 +27217,7 @@ cglobal interp_4tap_horiz_ps_6x8, 4,7,6 sub r0 , r1 inc r6d -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 @@ -27254,7 +27254,7 @@ cglobal interp_4tap_horiz_ps_6x8, 4,7,6 vextracti128 xm4, m3, 1 movq [r2], xm3 movd [r2+8], xm4 -.end +.end: 
RET INIT_YMM avx2 @@ -27285,7 +27285,7 @@ cglobal interp_8tap_horiz_ps_12x16, 6, 7, 8 lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride sub r0, r6 ; r0(src)-r6 add r4d, 7 -.loop +.loop: ; Row 0 @@ -27350,9 +27350,9 @@ cglobal interp_8tap_horiz_ps_24x32, 4, 7, 8 sub r0, r6 ; r0(src)-r6 add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) -.label +.label: lea r6, [interp8_hps_shuf] -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) @@ -27430,7 +27430,7 @@ cglobal interp_4tap_horiz_ps_24x32, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 @@ -27988,7 +27988,7 @@ cglobal interp_4tap_horiz_ps_48x64, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 @@ -28067,7 +28067,7 @@ cglobal interp_4tap_horiz_ps_24x64, 4,7,6 sub r0 , r1 add r6d , 3 -.loop +.loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 @@ -28114,7 +28114,7 @@ cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7 jz .label sub r0, r1 -.label +.label: mova m4, [interp4_hps_shuf] mova m5, [pw_1] dec r0 @@ -28209,7 +28209,7 @@ cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 -.end +.end: RET INIT_YMM avx2 diff --git a/source/common/x86/loopfilter.asm b/source/common/x86/loopfilter.asm index 7e1ed065e8..590652d130 100644 --- a/source/common/x86/loopfilter.asm +++ b/source/common/x86/loopfilter.asm @@ -374,7 +374,7 @@ cglobal saoCuOrgE1, 4,5,8 pxor m0, m0 ; m0 = 0 mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] shr r4d, 4 -.loop +.loop: movu m7, [r0] movu m5, [r0 + 16] movu m3, [r0 + r3] @@ -430,7 +430,7 @@ cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] mova 
m7, [pb_128] shr r4d, 4 -.loop +.loop: movu m1, [r0] ; m1 = pRec[x] movu m2, [r0 + r3] ; m2 = pRec[x + iStride] @@ -478,7 +478,7 @@ cglobal saoCuOrgE1, 4,5,6 mova m4, [pb_2] shr r4d, 4 mova m0, [pw_pixel_max] -.loop +.loop: movu m5, [r0] movu m3, [r0 + r3] @@ -523,7 +523,7 @@ cglobal saoCuOrgE1, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth mova xm6, [pb_2] ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] mova xm7, [pb_128] shr r4d, 4 -.loop +.loop: movu xm1, [r0] ; xm1 = pRec[x] movu xm2, [r0 + r3] ; xm2 = pRec[x + iStride] @@ -572,7 +572,7 @@ cglobal saoCuOrgE1_2Rows, 4,7,8 mov r5d, r4d shr r4d, 4 mov r6, r0 -.loop +.loop: movu m7, [r0] movu m5, [r0 + 16] movu m3, [r0 + r3] @@ -674,7 +674,7 @@ cglobal saoCuOrgE1_2Rows, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuW pxor m0, m0 ; m0 = 0 mova m7, [pb_128] shr r4d, 4 -.loop +.loop: movu m1, [r0] ; m1 = pRec[x] movu m2, [r0 + r3] ; m2 = pRec[x + iStride] @@ -748,7 +748,7 @@ cglobal saoCuOrgE1_2Rows, 4,5,8 mova m4, [pw_pixel_max] vbroadcasti128 m6, [r2] ; m6 = m_iOffsetEo shr r4d, 4 -.loop +.loop: movu m7, [r0] movu m5, [r0 + r3] movu m1, [r0 + r3 * 2] @@ -804,7 +804,7 @@ cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuW vbroadcasti128 m5, [pb_128] vbroadcasti128 m6, [r2] ; m6 = m_iOffsetEo shr r4d, 4 -.loop +.loop: movu xm1, [r0] ; m1 = pRec[x] movu xm2, [r0 + r3] ; m2 = pRec[x + iStride] vinserti128 m1, m1, xm2, 1 @@ -859,7 +859,7 @@ cglobal saoCuOrgE2, 6,6,8 movh m6, [r0 + r4 * 2] movhps m6, [r1 + r4] -.loop +.loop: movu m7, [r0] movu m5, [r0 + 16] movu m3, [r0 + r5 + 2] @@ -918,7 +918,7 @@ cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth movh m5, [r0 + r4] movhps m5, [r1 + r4] -.loop +.loop: movu m1, [r0] ; m1 = rec[x] movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1] pxor m3, m1, m7 @@ -970,7 +970,7 @@ cglobal saoCuOrgE2, 6,6,7 movhps xm4, [r1 + r4] vbroadcasti128 m5, [r3] mova m6, [pw_pixel_max] -.loop +.loop: movu m1, [r0] movu m3, 
[r0 + r5 + 2] @@ -1061,7 +1061,7 @@ cglobal saoCuOrgE2_32, 6,6,8 movhps xm4, [r1 + r4] vbroadcasti128 m5, [r3] -.loop +.loop: movu m1, [r0] movu m7, [r0 + 32] movu m3, [r0 + r5 + 2] @@ -1567,11 +1567,11 @@ cglobal saoCuOrgB0, 5,7,8 movu m4, [r1 + 16] ; offset[16-31] pxor m7, m7 -.loopH +.loopH: mov r5d, r2d xor r6, r6 -.loopW +.loopW: movu m2, [r0 + r6] movu m5, [r0 + r6 + 16] psrlw m0, m2, (BIT_DEPTH - 5) @@ -1617,11 +1617,11 @@ cglobal saoCuOrgB0, 4, 7, 8 movu m3, [r1 + 0] ; offset[0-15] movu m4, [r1 + 16] ; offset[16-31] pxor m7, m7 ; m7 =[0] -.loopH +.loopH: mov r5d, r2d xor r6, r6 -.loopW +.loopW: movu m2, [r0 + r6] ; m0 = [rec] psrlw m1, m2, 3 pand m1, [pb_31] ; m1 = [index] @@ -1670,9 +1670,9 @@ cglobal saoCuOrgB0, 5,7,8 mov r6d, r3d shr r3d, 1 -.loopH +.loopH: mov r5d, r2d -.loopW +.loopW: movu m2, [r0] movu m5, [r0 + r4] psrlw m0, m2, (BIT_DEPTH - 5) @@ -1751,9 +1751,9 @@ cglobal saoCuOrgB0, 4, 7, 8 shr r2d, 4 mov r1d, r3d shr r3d, 1 -.loopH +.loopH: mov r5d, r2d -.loopW +.loopW: movu xm2, [r0] ; m2 = [rec] vinserti128 m2, m2, [r0 + r4], 1 psrlw m1, m2, 3 @@ -1789,7 +1789,7 @@ cglobal saoCuOrgB0, 4, 7, 8 test r1b, 1 jz .end mov r5d, r2d -.loopW1 +.loopW1: movu xm2, [r0] ; m2 = [rec] psrlw xm1, xm2, 3 pand xm1, xm7 ; m1 = [index] @@ -1811,7 +1811,7 @@ cglobal saoCuOrgB0, 4, 7, 8 add r0, 16 dec r5d jnz .loopW1 -.end +.end: RET %endif @@ -1827,7 +1827,7 @@ cglobal calSign, 4, 7, 5 add r3d, 1 mov r5, r0 movu m4, [r0 + r4] -.loop +.loop: movu m1, [r1] ; m2 = pRec[x] movu m2, [r2] ; m3 = pTmpU[x] @@ -1921,7 +1921,7 @@ cglobal calSign, 4, 7, 5 mov r5, r0 movu m4, [r0 + r4] -.loop +.loop: movu m1, [r1] ; m2 = pRec[x] movu m2, [r2] ; m3 = pTmpU[x] diff --git a/source/common/x86/mc-a.asm b/source/common/x86/mc-a.asm index fa3c8acd6a..55531e770d 100644 --- a/source/common/x86/mc-a.asm +++ b/source/common/x86/mc-a.asm @@ -4115,7 +4115,7 @@ cglobal pixel_avg_8x16, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_W8 dec r9d jnz .loop @@ 
-4129,7 +4129,7 @@ cglobal pixel_avg_8x32, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 -.loop +.loop: pixel_avg_W8 dec r9d jnz .loop @@ -4697,7 +4697,7 @@ cglobal pixel_avg_12x16, 6,10,4 lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 @@ -4834,7 +4834,7 @@ cglobal pixel_avg_16x16, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_H16 dec r9d jnz .loop @@ -4848,7 +4848,7 @@ cglobal pixel_avg_16x32, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_H16 pixel_avg_H16 dec r9d @@ -4863,7 +4863,7 @@ cglobal pixel_avg_16x64, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_H16 pixel_avg_H16 pixel_avg_H16 @@ -4887,7 +4887,7 @@ cglobal pixel_avg_24x32, 6,10,4 lea r8, [r1 * 3] mov r9d, 8 -.loop +.loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 @@ -4987,7 +4987,7 @@ cglobal pixel_avg_32x8, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 2 -.loop +.loop: pixel_avg_W32 dec r9d jnz .loop @@ -5001,7 +5001,7 @@ cglobal pixel_avg_32x16, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_W32 dec r9d jnz .loop @@ -5015,7 +5015,7 @@ cglobal pixel_avg_32x24, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 6 -.loop +.loop: pixel_avg_W32 dec r9d jnz .loop @@ -5029,7 +5029,7 @@ cglobal pixel_avg_32x32, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 -.loop +.loop: pixel_avg_W32 dec r9d jnz .loop @@ -5043,7 +5043,7 @@ cglobal pixel_avg_32x64, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 16 -.loop +.loop: pixel_avg_W32 dec r9d jnz .loop @@ -5141,7 +5141,7 @@ cglobal pixel_avg_64x16, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 4 -.loop +.loop: pixel_avg_W64 dec r9d jnz .loop @@ -5155,7 +5155,7 @@ cglobal pixel_avg_64x32, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 8 -.loop +.loop: pixel_avg_W64 dec r9d jnz .loop @@ -5169,7 +5169,7 @@ cglobal pixel_avg_64x48, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 12 -.loop 
+.loop: pixel_avg_W64 dec r9d jnz .loop @@ -5183,7 +5183,7 @@ cglobal pixel_avg_64x64, 6,10,4 lea r7, [r5 * 3] lea r8, [r1 * 3] mov r9d, 16 -.loop +.loop: pixel_avg_W64 dec r9d jnz .loop @@ -5204,7 +5204,7 @@ cglobal pixel_avg_48x64, 6,10,4 lea r8, [r1 * 3] mov r9d, 16 -.loop +.loop: movu m0, [r2] movu m1, [r4] pavgw m0, m1 diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm index 11c2500abf..50264c45c0 100644 --- a/source/common/x86/pixel-util8.asm +++ b/source/common/x86/pixel-util8.asm @@ -1785,7 +1785,7 @@ cglobal weight_sp, 6,7,9 movu [r1], xm7 je .nextH -.width6 +.width6: cmp r6d, 6 jl .width4 movq [r1], xm7 @@ -4937,7 +4937,7 @@ cglobal pixel_sub_ps_16x%2, 6, 10, 4, dest, deststride, src0, src1, srcstride0, lea r9, [r4 * 3] lea r8, [r5 * 3] -.loop +.loop: pmovzxbw m0, [r2] pmovzxbw m1, [r3] pmovzxbw m2, [r2 + r4] @@ -5150,7 +5150,7 @@ cglobal pixel_sub_ps_32x%1, 6, 10, 4, dest, deststride, src0, src1, srcstride0, lea r7, [r4 * 3] lea r8, [r5 * 3] -.loop +.loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r3] @@ -5557,7 +5557,7 @@ cglobal pixel_sub_ps_64x64, 6, 10, 8, dest, deststride, src0, src1, srcstride0, lea r7, [r4 * 3] lea r8, [r5 * 3] -.loop +.loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] diff --git a/source/common/x86/sad-a.asm b/source/common/x86/sad-a.asm index 5bd80487aa..62ea494647 100644 --- a/source/common/x86/sad-a.asm +++ b/source/common/x86/sad-a.asm @@ -5631,7 +5631,7 @@ cglobal pixel_sad_32x16, 4,5,6 xorps m5, m5 mov r4d, 4 -.loop +.loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 @@ -5676,7 +5676,7 @@ cglobal pixel_sad_32x24, 4,7,6 mov r4d, 6 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 @@ -5718,7 +5718,7 @@ cglobal pixel_sad_32x32, 4,7,5 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 
movu m3, [r0 + r1] ; row 1 of pix0 @@ -5759,7 +5759,7 @@ cglobal pixel_sad_32x64, 4,7,5 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 @@ -5822,7 +5822,7 @@ cglobal pixel_sad_48x64, 4,7,7 mov r4d, 64/4 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; row 0 of pix0 movu m2, [r2] ; row 0 of pix1 movu m3, [r0 + r1] ; row 1 of pix0 @@ -5873,7 +5873,7 @@ cglobal pixel_sad_64x16, 4,5,6 xorps m0, m0 xorps m5, m5 mov r4d, 4 -.loop +.loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 @@ -5936,7 +5936,7 @@ cglobal pixel_sad_64x32, 4,5,6 xorps m0, m0 xorps m5, m5 mov r4d, 16 -.loop +.loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 @@ -5978,7 +5978,7 @@ cglobal pixel_sad_64x48, 4,7,6 mov r4d, 12 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 @@ -6040,7 +6040,7 @@ cglobal pixel_sad_64x64, 4,7,6 mov r4d, 8 lea r5, [r1 * 3] lea r6, [r3 * 3] -.loop +.loop: movu m1, [r0] ; first 32 of row 0 of pix0 movu m2, [r2] ; first 32 of row 0 of pix1 movu m3, [r0 + 32] ; second 32 of row 0 of pix0 diff --git a/source/common/x86/seaintegral.asm b/source/common/x86/seaintegral.asm index cf79ca4478..ea362efd0a 100644 --- a/source/common/x86/seaintegral.asm +++ b/source/common/x86/seaintegral.asm @@ -36,7 +36,7 @@ cglobal integral4v, 2, 3, 2 mov r2, r1 shl r2, 4 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ -54,7 +54,7 @@ cglobal integral8v, 2, 3, 2 mov r2, r1 shl r2, 5 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ -75,7 +75,7 @@ cglobal integral12v, 2, 4, 2 shl r3, 4 add r2, r3 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ 
-93,7 +93,7 @@ cglobal integral16v, 2, 3, 2 mov r2, r1 shl r2, 6 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ -114,7 +114,7 @@ cglobal integral24v, 2, 4, 2 shl r3, 5 add r2, r3 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ -132,7 +132,7 @@ cglobal integral32v, 2, 3, 2 mov r2, r1 shl r2, 7 -.loop +.loop: movu m0, [r0] movu m1, [r0 + r2] psubd m1, m0 @@ -264,7 +264,7 @@ cglobal integral4h, 3, 5, 3 movu [r0 + r3], xm0 jmp .end -.end +.end: RET %endif @@ -379,7 +379,7 @@ cglobal integral8h, 3, 5, 3 movu [r0 + r3], m0 jmp .end -.end +.end: RET %endif @@ -577,7 +577,7 @@ cglobal integral12h, 3, 5, 3 movu [r0 + r3], xm0 jmp .end -.end +.end: RET %endif @@ -740,7 +740,7 @@ cglobal integral16h, 3, 5, 3 movu [r0 + r3], m0 jmp .end -.end +.end: RET %endif @@ -883,7 +883,7 @@ cglobal integral24h, 3, 5, 3 movu [r0 + r3], m0 jmp .end -.end +.end: RET %macro INTEGRAL_THIRTYTWO_HORIZONTAL_16 0 @@ -1058,5 +1058,5 @@ cglobal integral32h, 3, 5, 3 movu [r0 + r3], m0 jmp .end -.end +.end: RET From 329cdee9e8c31364f008f4303ec3b08a568fd4a5 Mon Sep 17 00:00:00 2001 From: Vignesh Vijayakumar Date: Thu, 2 Nov 2017 09:40:41 +0530 Subject: [PATCH 03/51] x86: Change assembler from YASM to NASM Supports NASM versions 2.13 and greater --- source/CMakeLists.txt | 20 ++++---- ...n.cmake => CMakeASM_NASMInformation.cmake} | 30 +++++------ ...e => CMakeDetermineASM_NASMCompiler.cmake} | 6 +-- ....cmake => CMakeTestASM_NASMCompiler.cmake} | 2 +- source/cmake/FindNasm.cmake | 25 +++++++++ source/cmake/FindYasm.cmake | 25 --------- source/common/CMakeLists.txt | 6 +-- source/common/x86/x86inc.asm | 51 +++++++++++-------- source/test/CMakeLists.txt | 18 +++---- source/test/checkasm-a.asm | 50 +++++++++--------- 10 files changed, 124 insertions(+), 109 deletions(-) rename source/cmake/{CMakeASM_YASMInformation.cmake => CMakeASM_NASMInformation.cmake} (64%) rename source/cmake/{CMakeDetermineASM_YASMCompiler.cmake => CMakeDetermineASM_NASMCompiler.cmake} (55%) rename 
source/cmake/{CMakeTestASM_YASMCompiler.cmake => CMakeTestASM_NASMCompiler.cmake} (65%) create mode 100644 source/cmake/FindNasm.cmake delete mode 100644 source/cmake/FindYasm.cmake diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index dfcef20f49..ca8e4d24dd 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -323,15 +323,15 @@ if(GCC) execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE CC_VERSION) endif(GCC) -find_package(Yasm) +find_package(Nasm) if(ARM OR CROSS_COMPILE_ARM) option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON) -elseif(YASM_FOUND AND X86) - if (YASM_VERSION_STRING VERSION_LESS "1.2.0") - message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required") +elseif(NASM_FOUND AND X86) + if (NASM_VERSION_STRING VERSION_LESS "2.13.0") + message(STATUS "Nasm version ${NASM_VERSION_STRING} is too old. 2.13.0 or later required") option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF) else() - message(STATUS "Found Yasm ${YASM_VERSION_STRING} to build assembly primitives") + message(STATUS "Found Nasm ${NASM_VERSION_STRING} to build assembly primitives") option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON) endif() else() @@ -517,18 +517,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) add_custom_command( OUTPUT ${ASM}.${SUFFIX} - COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX} + COMMAND ${NASM_EXECUTABLE} ARGS ${NASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() endif() source_group(ASM FILES ${ASM_SRCS}) if(ENABLE_HDR10_PLUS) - add_library(x265-static STATIC $ $ $ ${ASM_OBJS} ${ASM_SRCS}) + add_library(x265-static STATIC $ $ $ ${ASM_OBJS}) add_library(hdr10plus-static STATIC $) set_target_properties(hdr10plus-static PROPERTIES OUTPUT_NAME hdr10plus) else() - add_library(x265-static STATIC $ $ ${ASM_OBJS} ${ASM_SRCS}) + 
add_library(x265-static STATIC $ $ ${ASM_OBJS}) endif() if(NOT MSVC) set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265) @@ -686,11 +686,11 @@ if(ENABLE_CLI) if(ENABLE_HDR10_PLUS) add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} x265.cpp x265.h x265cli.h - $ $ $ ${ASM_OBJS} ${ASM_SRCS}) + $ $ $ ${ASM_OBJS}) else() add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} x265.cpp x265.h x265cli.h - $ $ ${ASM_OBJS} ${ASM_SRCS}) + $ $ ${ASM_OBJS}) endif() else() add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE} diff --git a/source/cmake/CMakeASM_YASMInformation.cmake b/source/cmake/CMakeASM_NASMInformation.cmake similarity index 64% rename from source/cmake/CMakeASM_YASMInformation.cmake rename to source/cmake/CMakeASM_NASMInformation.cmake index 051a0cd3df..45c0ad1672 100644 --- a/source/cmake/CMakeASM_YASMInformation.cmake +++ b/source/cmake/CMakeASM_NASMInformation.cmake @@ -1,22 +1,22 @@ -set(ASM_DIALECT "_YASM") +set(ASM_DIALECT "_NASM") set(CMAKE_ASM${ASM_DIALECT}_SOURCE_FILE_EXTENSIONS asm) if(X64) - list(APPEND ASM_FLAGS -DARCH_X86_64=1) + list(APPEND ASM_FLAGS -DARCH_X86_64=1 -I ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86/) if(ENABLE_PIC) list(APPEND ASM_FLAGS -DPIC) endif() if(APPLE) - set(ARGS -f macho64 -m amd64 -DPREFIX) + set(ARGS -f macho64 -DPREFIX) elseif(UNIX AND NOT CYGWIN) - set(ARGS -f elf64 -m amd64) + set(ARGS -f elf64) else() - set(ARGS -f win64 -m amd64) + set(ARGS -f win64) endif() else() - list(APPEND ASM_FLAGS -DARCH_X86_64=0) + list(APPEND ASM_FLAGS -DARCH_X86_64=0 -I ${CMAKE_CURRENT_SOURCE_DIR}/../common/x86/) if(APPLE) - set(ARGS -f macho -DPREFIX) + set(ARGS -f macho32 -DPREFIX) elseif(UNIX AND NOT CYGWIN) set(ARGS -f elf32) else() @@ -40,25 +40,25 @@ else() list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS}) endif() -list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS}") +list(APPEND ASM_FLAGS "${CMAKE_ASM_NASM_FLAGS}") if(CMAKE_BUILD_TYPE 
MATCHES Release) - list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_RELEASE}") + list(APPEND ASM_FLAGS "${CMAKE_ASM_NASM_FLAGS_RELEASE}") elseif(CMAKE_BUILD_TYPE MATCHES Debug) - list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_DEBUG}") + list(APPEND ASM_FLAGS "${CMAKE_ASM_NASM_FLAGS_DEBUG}") elseif(CMAKE_BUILD_TYPE MATCHES MinSizeRel) - list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_MINSIZEREL}") + list(APPEND ASM_FLAGS "${CMAKE_ASM_NASM_FLAGS_MINSIZEREL}") elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) - list(APPEND ASM_FLAGS "${CMAKE_ASM_YASM_FLAGS_RELWITHDEBINFO}") + list(APPEND ASM_FLAGS "${CMAKE_ASM_NASM_FLAGS_RELWITHDEBINFO}") endif() -set(YASM_FLAGS ${ARGS} ${ASM_FLAGS} PARENT_SCOPE) -string(REPLACE ";" " " CMAKE_ASM_YASM_COMPILER_ARG1 "${ARGS}") +set(NASM_FLAGS ${ARGS} ${ASM_FLAGS} PARENT_SCOPE) +string(REPLACE ";" " " CMAKE_ASM_NASM_COMPILER_ARG1 "${ARGS}") # This section exists to override the one in CMakeASMInformation.cmake # (the default Information file). This removes the # thing so that your C compiler flags that have been set via -# set_target_properties don't get passed to yasm and confuse it. +# set_target_properties don't get passed to nasm and confuse it. 
if(NOT CMAKE_ASM${ASM_DIALECT}_COMPILE_OBJECT) string(REPLACE ";" " " STR_ASM_FLAGS "${ASM_FLAGS}") set(CMAKE_ASM${ASM_DIALECT}_COMPILE_OBJECT " ${STR_ASM_FLAGS} -o ") diff --git a/source/cmake/CMakeDetermineASM_YASMCompiler.cmake b/source/cmake/CMakeDetermineASM_NASMCompiler.cmake similarity index 55% rename from source/cmake/CMakeDetermineASM_YASMCompiler.cmake rename to source/cmake/CMakeDetermineASM_NASMCompiler.cmake index a902ef8cbd..6ee17e4db9 100644 --- a/source/cmake/CMakeDetermineASM_YASMCompiler.cmake +++ b/source/cmake/CMakeDetermineASM_NASMCompiler.cmake @@ -1,5 +1,5 @@ -set(ASM_DIALECT "_YASM") -set(CMAKE_ASM${ASM_DIALECT}_COMPILER ${YASM_EXECUTABLE}) -set(CMAKE_ASM${ASM_DIALECT}_COMPILER_INIT ${_CMAKE_TOOLCHAIN_PREFIX}yasm) +set(ASM_DIALECT "_NASM") +set(CMAKE_ASM${ASM_DIALECT}_COMPILER ${NASM_EXECUTABLE}) +set(CMAKE_ASM${ASM_DIALECT}_COMPILER_INIT ${_CMAKE_TOOLCHAIN_PREFIX}nasm) include(CMakeDetermineASMCompiler) set(ASM_DIALECT) diff --git a/source/cmake/CMakeTestASM_YASMCompiler.cmake b/source/cmake/CMakeTestASM_NASMCompiler.cmake similarity index 65% rename from source/cmake/CMakeTestASM_YASMCompiler.cmake rename to source/cmake/CMakeTestASM_NASMCompiler.cmake index c668668a03..f3346c1eec 100644 --- a/source/cmake/CMakeTestASM_YASMCompiler.cmake +++ b/source/cmake/CMakeTestASM_NASMCompiler.cmake @@ -1,3 +1,3 @@ -set(ASM_DIALECT "_YASM") +set(ASM_DIALECT "_NASM") include(CMakeTestASMCompiler) set(ASM_DIALECT) diff --git a/source/cmake/FindNasm.cmake b/source/cmake/FindNasm.cmake new file mode 100644 index 0000000000..ff7eac6227 --- /dev/null +++ b/source/cmake/FindNasm.cmake @@ -0,0 +1,25 @@ +include(FindPackageHandleStandardArgs) + +# Simple path search with NASM_ROOT environment variable override +find_program(NASM_EXECUTABLE + NAMES nasm nasm-2.13.0-win32 nasm-2.13.0-win64 nasm nasm-2.13.0-win32 nasm-2.13.0-win64 + HINTS $ENV{NASM_ROOT} ${NASM_ROOT} + PATH_SUFFIXES bin +) + +if(NASM_EXECUTABLE) + execute_process(COMMAND ${NASM_EXECUTABLE}
-version + OUTPUT_VARIABLE nasm_version + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(nasm_version MATCHES "^NASM version ([0-9\\.]*)") + set(NASM_VERSION_STRING "${CMAKE_MATCH_1}") + endif() + unset(nasm_version) +endif() + +# Provide standardized success/failure messages +find_package_handle_standard_args(nasm + REQUIRED_VARS NASM_EXECUTABLE + VERSION_VAR NASM_VERSION_STRING) diff --git a/source/cmake/FindYasm.cmake b/source/cmake/FindYasm.cmake deleted file mode 100644 index d93cdece6e..0000000000 --- a/source/cmake/FindYasm.cmake +++ /dev/null @@ -1,25 +0,0 @@ -include(FindPackageHandleStandardArgs) - -# Simple path search with YASM_ROOT environment variable override -find_program(YASM_EXECUTABLE - NAMES yasm yasm-1.2.0-win32 yasm-1.2.0-win64 yasm yasm-1.3.0-win32 yasm-1.3.0-win64 - HINTS $ENV{YASM_ROOT} ${YASM_ROOT} - PATH_SUFFIXES bin -) - -if(YASM_EXECUTABLE) - execute_process(COMMAND ${YASM_EXECUTABLE} --version - OUTPUT_VARIABLE yasm_version - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(yasm_version MATCHES "^yasm ([0-9\\.]*)") - set(YASM_VERSION_STRING "${CMAKE_MATCH_1}") - endif() - unset(yasm_version) -endif() - -# Provide standardized success/failure messages -find_package_handle_standard_args(yasm - REQUIRED_VARS YASM_EXECUTABLE - VERSION_VAR YASM_VERSION_STRING) diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 7f2e01b846..65624069c4 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -72,12 +72,12 @@ if(ENABLE_ASSEMBLY AND X86) endif() if(MSVC_IDE OR XCODE) - # MSVC requires custom build rules in the main cmake script for yasm - set(MSVC_ASMS "${A_SRCS}" CACHE INTERNAL "yasm sources") + # MSVC requires custom build rules in the main cmake script for nasm + set(MSVC_ASMS "${A_SRCS}" CACHE INTERNAL "nasm sources") set(A_SRCS) endif() - enable_language(ASM_YASM) + enable_language(ASM_NASM) foreach(SRC ${A_SRCS} ${C_SRCS}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC}) 
diff --git a/source/common/x86/x86inc.asm b/source/common/x86/x86inc.asm index a7d96dd82e..fa5ffcd28b 100644 --- a/source/common/x86/x86inc.asm +++ b/source/common/x86/x86inc.asm @@ -66,6 +66,15 @@ %endif %endif +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + %ifdef PREFIX %define mangle(x) _ %+ x %else @@ -88,6 +97,10 @@ default rel %endif +%ifdef __NASM_VER__ + %use smartalign +%endif + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -685,7 +698,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %2:function %%VISIBILITY %else global %2 @@ -711,14 +724,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ; like cextern, but without the prefix %macro cextern_naked 1 - %xdefine %1 mangle(%1) + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %1:data hidden %else global %1 @@ -727,9 +742,8 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. 
+%if FORMAT_ELF [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif @@ -801,9 +815,17 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %if ARCH_X86_64 || cpuflag(sse2) - CPU amdnop + %ifdef __NASM_VER__ + ALIGNMODE p6 + %else + CPU amdnop + %endif %else - CPU basicnop + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif %endif %endmacro @@ -1467,7 +1489,7 @@ FMA_INSTR pmadcswd, pmaddwd, paddd v%5%6 %1, %2, %3, %4 %elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. - %ifid %3 + %ifnum sizeof%3 v%{5}213%6 %2, %3, %4 %else v%{5}132%6 %2, %4, %3 @@ -1491,14 +1513,3 @@ FMA4_INSTR fmsub, pd, ps, sd, ss FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) -%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 - %endif - %endmacro -%endif diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt index f2f0f47492..260195f53a 100644 --- a/source/test/CMakeLists.txt +++ b/source/test/CMakeLists.txt @@ -7,37 +7,37 @@ endif() # add X86 assembly files if(X86) -enable_language(ASM_YASM) +enable_language(ASM_NASM) if(MSVC_IDE) - set(YASM_SRC checkasm-a.obj) + set(NASM_SRC checkasm-a.obj) add_custom_command( OUTPUT checkasm-a.obj - COMMAND ${YASM_EXECUTABLE} - ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-a.asm -o checkasm-a.obj + COMMAND ${NASM_EXECUTABLE} + ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-a.asm -o checkasm-a.obj DEPENDS checkasm-a.asm) else() - set(YASM_SRC checkasm-a.asm) + set(NASM_SRC checkasm-a.asm) endif() endif(X86) # add ARM assembly files if(ARM OR CROSS_COMPILE_ARM) enable_language(ASM) - set(YASM_SRC checkasm-arm.S) + set(NASM_SRC checkasm-arm.S) add_custom_command( OUTPUT checkasm-arm.obj 
COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj + ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj DEPENDS checkasm-arm.S) endif(ARM OR CROSS_COMPILE_ARM) # add PowerPC assembly files if(POWER) - set(YASM_SRC) + set(NASM_SRC) endif(POWER) -add_executable(TestBench ${YASM_SRC} +add_executable(TestBench ${NASM_SRC} testbench.cpp testharness.h pixelharness.cpp pixelharness.h mbdstharness.cpp mbdstharness.h diff --git a/source/test/checkasm-a.asm b/source/test/checkasm-a.asm index d5b60fdf46..9e57a057cc 100644 --- a/source/test/checkasm-a.asm +++ b/source/test/checkasm-a.asm @@ -26,7 +26,7 @@ ;* For more information, contact us at license @ x265.com. ;***************************************************************************** -%include "../common/x86/x86inc.asm" +%include "x86inc.asm" SECTION_RODATA @@ -35,24 +35,24 @@ error_message: db "failed to preserve register", 0 %if ARCH_X86_64 ; just random numbers to reduce the chance of incidental match ALIGN 16 -x6: ddq 0x79445c159ce790641a1b2550a612b48c -x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd -x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943 -x9: ddq 0xd229e1f5b281303facbd382dcf5b8de2 -x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9 -x11: ddq 0x77d410d5c42c882d89b0c0765892729a -x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5 -x13: ddq 0xdd7b8919edd427862e8ec680de14b47c -x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf -x15: ddq 0x6de8f4c914c334d5011ff554472a7a10 -n7: dq 0x21f86d66c8ca00ce -n8: dq 0x75b6ba21077c48ad -n9: dq 0xed56bb2dcb3c7736 -n10: dq 0x8bda43d3fd1a7e06 -n11: dq 0xb64a9c9e5d318408 -n12: dq 0xdf9a54b303f1d3a3 -n13: dq 0x4a75479abd64e097 -n14: dq 0x249214109d5d1c88 +x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064 +x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636 +x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e +x9: dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f +x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9 +x11: dq 
0x89b0c0765892729a,0x77d410d5c42c882d +x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b +x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786 +x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef +x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5 +n7: dq 0x21f86d66c8ca00ce +n8: dq 0x75b6ba21077c48ad +n9: dq 0xed56bb2dcb3c7736 +n10: dq 0x8bda43d3fd1a7e06 +n11: dq 0xb64a9c9e5d318408 +n12: dq 0xdf9a54b303f1d3a3 +n13: dq 0x4a75479abd64e097 +n14: dq 0x249214109d5d1c88 %endif SECTION .text @@ -70,14 +70,14 @@ cextern_naked puts ;----------------------------------------------------------------------------- cglobal checkasm_stack_clobber, 1,2 ; Clobber the stack with junk below the stack pointer - %define size (max_args+6)*8 - SUB rsp, size - mov r1, size-8 + %define argsize (max_args+6)*8 + SUB rsp, argsize + mov r1, argsize-8 .loop: mov [rsp+r1], r0 sub r1, 8 jge .loop - ADD rsp, size + ADD rsp, argsize RET %if WIN64 @@ -156,7 +156,11 @@ cglobal checkasm_call, 2,15,16,max_args*8+8 mov r9, rax mov r10, rdx lea r0, [error_message] +%if FORMAT_ELF + call puts wrt ..plt +%else call puts +%endif mov r1, [rsp+max_args*8] mov dword [r1], 0 mov rdx, r10 From 514e3f824b45469713cf434485d8a552f4ff6eae Mon Sep 17 00:00:00 2001 From: Pradeep Ramachandran Date: Thu, 30 Nov 2017 10:06:49 +0530 Subject: [PATCH 04/51] doc: Update build/readme.txt for detailed instructions with nasm --- build/README.txt | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/build/README.txt b/build/README.txt index d13188461b..a555cc1482 100644 --- a/build/README.txt +++ b/build/README.txt @@ -9,7 +9,8 @@ Note: MSVC12 requires cmake 2.8.11 or later = Optional Prerequisites = -1. Yasm 1.2.0 or later, to compile assembly primitives (performance) +1. 
To compile assembly primitives (performance) + a) If you are using release 2.6 or older, download and install Yasm 1.2.0 or later, For Windows, download the latest yasm executable http://yasm.tortall.net/Download.html and copy the EXE into @@ -33,6 +34,24 @@ Note: MSVC12 requires cmake 2.8.11 or later If cpu capabilities line says 'none!', then the encoder was built without yasm. + b) If you are building from the default branch after release 2.6, download and install nasm 2.13 or newer + + For Windows and Linux, you can download the nasm installer from http://www.nasm.us/pub/nasm/releasebuilds/?C=M;O=D. + Make sure that it is in your PATH environment variable (%PATH% in Windows, and $PATH in Linux) so that cmake + can find it. + + Once NASM is properly installed, run cmake to regenerate projects. If you + do not see the below line in the cmake output, NASM is not in the PATH. + + -- Found Nasm 2.13 to build assembly primitives + + Now build the encoder and run x265 -V: + + x265 [info]: using cpu capabilities: MMX, SSE2, ... + + If cpu capabilities line says 'none!', then the encoder was built + without nasm and will run considerably slower. + 2. VisualLeakDetector (Windows Only) Download from https://vld.codeplex.com/releases and install. 
May need From a38fbf933cd0072de0871b7eb604e698c4a6be17 Mon Sep 17 00:00:00 2001 From: Ma0 Date: Thu, 30 Nov 2017 18:16:40 +0100 Subject: [PATCH 05/51] fix ambiguous call to overloaded function 'sqrt' (in old MSVC) --- source/encoder/encoder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index a8a9eb6db4..a08f7ae87f 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -575,7 +575,7 @@ int Encoder::setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* { int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].x; int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].y; - double mv = sqrt(mv_x*mv_x + mv_y*mv_y); + double mv = sqrt((double)(mv_x*mv_x + mv_y*mv_y)); if (numPU == PU_2Nx2N && ((srcInterData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5)) && mv <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[k][cuPos + cuOffset], 1, bytes); } @@ -658,7 +658,7 @@ int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_ { int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].x; int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].y; - double mv = sqrt(mv_x*mv_x + mv_y*mv_y); + double mv = sqrt((double)(mv_x*mv_x + mv_y*mv_y)); if (numPU == PU_2Nx2N && m_param->num4x4Partitions <= 16 && mv <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[i][count + pu], 1, bytes); } From d35350163e5d4260cb72cba98afc09bf1c3c316c Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Tue, 21 Nov 2017 10:48:24 +0530 Subject: [PATCH 06/51] Extend gop boundary by doing gop lookahead --- doc/reST/cli.rst | 10 ++++++ source/CMakeLists.txt | 2 +- source/common/param.cpp | 5 +++ source/encoder/slicetype.cpp | 52 ++++++++++++++++++++++++-------- 
source/encoder/slicetype.h | 1 + source/test/regression-tests.txt | 2 +- source/x265.h | 7 +++-- source/x265cli.h | 2 ++ 8 files changed, 64 insertions(+), 17 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 87dc6b68b8..cd6bb41cff 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1373,6 +1373,16 @@ Slice decision options Default 20 **Range of values:** Between the maximum consecutive bframe count (:option:`--bframes`) and 250 +.. option:: --gop-lookahead + + Number of frames for GOP boundary decision lookahead. If a scenecut frame is found + within this from the gop boundary set by `--keyint`, the GOP will be extended until such a point, + otherwise the GOP will be terminated as set by `--keyint`. Default 0. + + **Range of values:** Between 0 and (`--rc-lookahead` - mini-GOP length) + + It is recommended to have `--gop-lookahead` less than `--min-keyint` as scenecuts beyond + `--min-keyint` are already being coded as keyframes. .. option:: --lookahead-slices <0..16> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ca8e4d24dd..b6a9a18299 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 146) +set(X265_BUILD 147) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/common/param.cpp b/source/common/param.cpp index aad9e27e94..4b037d50f1 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -144,6 +144,7 @@ void x265_param_default(x265_param* param) /* Coding Structure */ param->keyframeMin = 0; param->keyframeMax = 250; + param->gopLookahead = 0; param->bOpenGOP = 1; param->bframes = 4; param->lookaheadDepth = 
20; @@ -1004,6 +1005,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) bError = true; } } + OPT("gop-lookahead") p->gopLookahead = atoi(value); else return X265_PARAM_BAD_NAME; } @@ -1314,6 +1316,8 @@ int x265_check_params(x265_param* param) "Valid penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum"); CHECK(param->keyframeMax < -1, "Invalid max IDR period in frames. value should be greater than -1"); + CHECK(param->gopLookahead < -1, + "GOP lookahead must be greater than -1"); CHECK(param->decodedPictureHashSEI < 0 || param->decodedPictureHashSEI > 3, "Invalid hash option. Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum"); CHECK(param->rc.vbvBufferSize < 0, @@ -1561,6 +1565,7 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bOpenGOP, "open-gop"); s += sprintf(s, " min-keyint=%d", p->keyframeMin); s += sprintf(s, " keyint=%d", p->keyframeMax); + s += sprintf(s, " gop-lookahead=%d", p->gopLookahead); s += sprintf(s, " bframes=%d", p->bframes); s += sprintf(s, " b-adapt=%d", p->bFrameAdaptive); BOOL(p->bBPyramid, "b-pyramid"); diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index fe2e51aab9..7129db2d2b 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -589,7 +589,7 @@ Lookahead::Lookahead(x265_param *param, ThreadPool* pool) m_outputSignalRequired = false; m_isActive = true; m_inputCount = 0; - + m_extendGopBoundary = false; m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; m_cuCount = m_8x8Width * m_8x8Height; @@ -646,7 +646,11 @@ Lookahead::Lookahead(x265_param *param, ThreadPool* pool) m_numRowsPerSlice = m_8x8Height; m_numCoopSlices = 1; } - + if (param->gopLookahead && (param->gopLookahead > (param->lookaheadDepth - param->bframes - 2))) + { + param->gopLookahead = X265_MAX(0, 
param->lookaheadDepth - param->bframes - 2); + x265_log(param, X265_LOG_WARNING, "Gop-lookahead cannot be greater than (rc-lookahead - length of the mini-gop); Clipping gop-lookahead to %d\n", param->gopLookahead); + } #if DETAILED_CU_STATS m_slicetypeDecideElapsedTime = 0; m_preLookaheadElapsedTime = 0; @@ -1086,7 +1090,8 @@ void Lookahead::slicetypeDecide() x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n", frm.sliceType, m_param->maxNumReferences); } - if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) + if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax && + (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead)) { if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; @@ -1377,12 +1382,14 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) cuTree(frames, 0, bKeyframe); return; } - frames[framecnt + 1] = NULL; + int keyFrameLimit = m_param->keyframeMax + m_lastKeyframe - frames[0]->frameNum - 1; + if (m_param->gopLookahead && keyFrameLimit <= m_param->bframes + 1) + keyintLimit = keyFrameLimit + m_param->gopLookahead; + else + keyintLimit = keyFrameLimit; - keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1; origNumFrames = numFrames = m_param->bIntraRefresh ? 
framecnt : X265_MIN(framecnt, keyintLimit); - if (bIsVbvLookahead) numFrames = framecnt; else if (m_param->bOpenGOP && numFrames < framecnt) @@ -1472,7 +1479,26 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) frames[1]->sliceType = X265_TYPE_I; return; } - + if (m_param->gopLookahead && (keyFrameLimit >= 0) && (keyFrameLimit <= m_param->bframes + 1)) + { + bool sceneTransition = m_isSceneTransition; + m_extendGopBoundary = false; + for (int i = m_param->bframes + 1; i < origNumFrames; i += m_param->bframes + 1) + { + scenecut(frames, i, i + 1, true, origNumFrames); + for (int j = i + 1; j <= X265_MIN(i + m_param->bframes + 1, origNumFrames); j++) + { + if (frames[j]->bScenecut && scenecutInternal(frames, j - 1, j, true) ) + { + m_extendGopBoundary = true; + break; + } + } + if (m_extendGopBoundary) + break; + } + m_isSceneTransition = sceneTransition; + } if (m_param->bframes) { if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) @@ -1578,6 +1604,8 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) if (m_param->rc.cuTree) cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe); + if (m_param->gopLookahead && (keyFrameLimit >= 0) && (keyFrameLimit <= m_param->bframes + 1) && !m_extendGopBoundary) + keyintLimit = keyFrameLimit; if (!m_param->bIntraRefresh) for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax) @@ -1588,8 +1616,8 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) if (bIsVbvLookahead) vbvLookahead(frames, numFrames, bKeyframe); + int maxp1 = X265_MIN(m_param->bframes + 1, origNumFrames); - int maxp1 = X265_MIN(m_param->bframes + 1, origNumFrames); /* Restore frame types for all frames that haven't actually been decided yet. 
*/ for (int j = resetStart; j <= numFrames; j++) { @@ -1613,8 +1641,8 @@ bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, in bool fluctuate = false; bool noScenecuts = false; int64_t avgSatdCost = 0; - if (frames[0]->costEst[1][0] > -1) - avgSatdCost = frames[0]->costEst[1][0]; + if (frames[p0]->costEst[p1 - p0][0] > -1) + avgSatdCost = frames[p0]->costEst[p1 - p0][0]; int cnt = 1; /* Where A and B are scenes: AAAAAABBBAAAAAA * If BBB is shorter than (maxp1-p0), it is detected as a flash @@ -1700,12 +1728,10 @@ bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScen CostEstimateGroup estGroup(*this, frames); estGroup.singleCost(p0, p1, p1); - int64_t icost = frame->costEst[0][0]; int64_t pcost = frame->costEst[p1 - p0][0]; - int gopSize = frame->frameNum - m_lastKeyframe; + int gopSize = (frame->frameNum - m_lastKeyframe) % m_param->keyframeMax; float threshMax = (float)(m_param->scenecutThreshold / 100.0); - /* magic numbers pulled out of thin air */ float threshMin = (float)(threshMax * 0.25); double bias = m_param->scenecutBias; diff --git a/source/encoder/slicetype.h b/source/encoder/slicetype.h index a247a8c2c9..f85d4aa128 100644 --- a/source/encoder/slicetype.h +++ b/source/encoder/slicetype.h @@ -132,6 +132,7 @@ class Lookahead : public JobProvider bool m_filled; bool m_isSceneTransition; int m_numPools; + bool m_extendGopBoundary; Lookahead(x265_param *param, ThreadPool *pool); #if DETAILED_CU_STATS int64_t m_slicetypeDecideElapsedTime; diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index bdd6935f7c..0dd5dd9323 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -150,7 +150,7 @@ Kimono1_1920x1080_24_400.yuv,--preset ultrafast --slices 1 --weightp --tu-intra- Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --slices 2 Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 
--limit-modes Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2 - +big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 20 # Main12 intraCost overflow bug test 720p50_parkrun_ter.y4m,--preset medium diff --git a/source/x265.h b/source/x265.h index 05cffd8ab9..8f6d5ff90a 100644 --- a/source/x265.h +++ b/source/x265.h @@ -1532,11 +1532,14 @@ typedef struct x265_param /* Reuse MV information obtained through API */ int bMVType; - /* Allow the encoder to have a copy of the planes of x265_picture in Frame */ int bCopyPicToFrame; -} x265_param; + /* Number of frames for GOP boundary decision lookahead. If a scenecut frame is found + * within this from the gop boundary set by keyint, the GOP will be extended until such a point, + * otherwise the GOP will be terminated as set by keyint */ + int gopLookahead; +} x265_param; /* x265_param_alloc: * Allocates an x265_param instance. The returned param structure is not * special in any way, but using this method together with x265_param_free() diff --git a/source/x265cli.h b/source/x265cli.h index bd5f7c5326..de5afcc1f6 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -119,6 +119,7 @@ static const struct option long_options[] = { "open-gop", no_argument, NULL, 0 }, { "keyint", required_argument, NULL, 'I' }, { "min-keyint", required_argument, NULL, 'i' }, + { "gop-lookahead", required_argument, NULL, 0 }, { "scenecut", required_argument, NULL, 0 }, { "no-scenecut", no_argument, NULL, 0 }, { "scenecut-bias", required_argument, NULL, 0 }, @@ -418,6 +419,7 @@ static void showHelp(x265_param *param) H0(" --[no-]open-gop Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP)); H0("-I/--keyint Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax); H0("-i/--min-keyint Scenecuts closer together than this are coded as I, not IDR. 
Default: auto\n"); + H0(" --gop-lookahead Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n"); H0(" --no-scenecut Disable adaptive I-frame decision\n"); H0(" --scenecut How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold); H1(" --scenecut-bias <0..100.0> Bias for scenecut detection. Default %.2f\n", param->scenecutBias); From 96449185802fc6c21a3e23a366b802e806b796f7 Mon Sep 17 00:00:00 2001 From: Pradeep Ramachandran Date: Wed, 29 Nov 2017 08:52:18 +0530 Subject: [PATCH 07/51] Added tag 2.6 for changeset 0e9ea76945c8 --- .hgtags | 1 + 1 file changed, 1 insertion(+) diff --git a/.hgtags b/.hgtags index 363e6eebe4..ece3fd0d03 100644 --- a/.hgtags +++ b/.hgtags @@ -24,3 +24,4 @@ be14a7e9755e54f0fd34911c72bdfa66981220bc 2.2 3037c1448549ca920967831482c653e5892fa8ed 2.3 e7a4dd48293b7956d4a20df257d23904cc78e376 2.4 64b2d0bf45a52511e57a6b7299160b961ca3d51c 2.5 +0e9ea76945c89962cd46cee6537586e2054b2935 2.6 From 371f3ade561880042526e4f954933017fce972a3 Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Thu, 30 Nov 2017 17:36:32 +0530 Subject: [PATCH 08/51] deblock: set reference frame to NULL if refIdx < 0. Dereferencing refFrameList when refIdx < 0 is not valid --- source/common/deblock.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/source/common/deblock.cpp b/source/common/deblock.cpp index bbdbf7bceb..13ac66f458 100644 --- a/source/common/deblock.cpp +++ b/source/common/deblock.cpp @@ -207,21 +207,18 @@ uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t pa static const MV zeroMv(0, 0); const Slice* const sliceQ = cuQ->m_slice; const Slice* const sliceP = cuP->m_slice; - - const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]]; - const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]]; + const Frame* refP0 = (cuP->m_refIdx[0][partP] >= 0) ? 
sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]] : NULL; + const Frame* refQ0 = (cuQ->m_refIdx[0][partQ] >= 0) ? sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]] : NULL; const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv; const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv; - if (sliceQ->isInterP() && sliceP->isInterP()) { return ((refP0 != refQ0) || (abs(mvQ0.x - mvP0.x) >= 4) || (abs(mvQ0.y - mvP0.y) >= 4)) ? 1 : 0; } - // (sliceQ->isInterB() || sliceP->isInterB()) - const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]]; - const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]]; + const Frame* refP1 = (cuP->m_refIdx[1][partP] >= 0) ? sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]] : NULL; + const Frame* refQ1 = (cuQ->m_refIdx[1][partQ] >= 0) ? sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]] : NULL; const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv; const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv; From 4ba86180bdfc760331dc5bbc279f5fc387c397c8 Mon Sep 17 00:00:00 2001 From: Mahesh Pittala Date: Thu, 30 Nov 2017 19:09:45 +0530 Subject: [PATCH 09/51] Install symbol files in MinGW In debug and RelWithDebInfo configuration, .pdb files are installed. 
--- source/CMakeLists.txt | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index dfcef20f49..6ab8a32cc4 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -546,14 +546,19 @@ if(ENABLE_HDR10_PLUS) ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) endif() install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include) - if(WIN32) - install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) - install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) - install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) - install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) + if(MSVC_IDE) + install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) + install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) + install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) + install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) + else() + install(FILES "${PROJECT_BINARY_DIR}/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) + install(FILES "${PROJECT_BINARY_DIR}/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) + install(FILES "${PROJECT_BINARY_DIR}/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) + install(FILES "${PROJECT_BINARY_DIR}/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) + endif() endif() - 
if(CMAKE_RC_COMPILER) # The resource compiler does not need CFLAGS or macro defines. It # often breaks them From ea3d0eb69b888a3f79884073c12da076c737afcc Mon Sep 17 00:00:00 2001 From: Ma0 Date: Thu, 30 Nov 2017 18:29:19 +0100 Subject: [PATCH 10/51] cli: add new option '--fullhelp' --- source/x265.cpp | 8 +++++++- source/x265cli.h | 5 +++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/source/x265.cpp b/source/x265.cpp index 61e83eaf78..e9d0bf97bb 100644 --- a/source/x265.cpp +++ b/source/x265.cpp @@ -301,9 +301,15 @@ bool CLIOptions::parse(int argc, char **argv) if (!this->qpfile) x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg); } + OPT("fullhelp") + { + param->logLevel = X265_LOG_FULL; + printVersion(param, api); + showHelp(param); + break; + } else bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg); - if (bError) { const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2]; diff --git a/source/x265cli.h b/source/x265cli.h index de5afcc1f6..3b59cf3a91 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -38,6 +38,7 @@ static const char short_options[] = "o:D:P:p:f:F:r:I:i:b:s:t:q:m:hwV?"; static const struct option long_options[] = { { "help", no_argument, NULL, 'h' }, + { "fullhelp", no_argument, NULL, 0 }, { "version", no_argument, NULL, 'V' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, @@ -315,6 +316,7 @@ static void showHelp(x265_param *param) H0(" outfile is raw HEVC bitstream\n"); H0("\nExecutable Options:\n"); H0("-h/--help Show this help text and exit\n"); + H0(" --fullhelp Show all options and exit\n"); H0("-V/--version Show version info and exit\n"); H0("\nOutput Options:\n"); H0("-o/--output Bitstream output file name\n"); @@ -565,9 +567,8 @@ static void showHelp(x265_param *param) #undef OPT #undef H0 #undef H1 - if (level < X265_LOG_DEBUG) - printf("\nUse --log-level full 
--help for a full listing\n"); + printf("\nUse --fullhelp for a full listing (or --log-level full --help)\n"); printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n"); exit(1); } From f3e9e95b0d20675778aef302cb1f86a29f3abe52 Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Mon, 4 Dec 2017 15:00:04 +0530 Subject: [PATCH 11/51] test: Fix gop-lookahead cli error in regression test --- source/test/regression-tests.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index 0dd5dd9323..48e2d43e32 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -150,7 +150,7 @@ Kimono1_1920x1080_24_400.yuv,--preset ultrafast --slices 1 --weightp --tu-intra- Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --slices 2 Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2 -big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 20 +big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14 # Main12 intraCost overflow bug test 720p50_parkrun_ter.y4m,--preset medium From abb93e663a248e55c56f815861917697a717a559 Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Mon, 4 Dec 2017 16:08:19 +0530 Subject: [PATCH 12/51] wait until the last row of recon is complete. Waiting on m_reconEncoded doesn't ensure full recon generation. 
--- source/encoder/encoder.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index a8a9eb6db4..0512800fba 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -463,8 +463,8 @@ int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) { int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc; Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC); - if (l0Fp->m_reconPic->m_picOrg[0] == NULL) - l0Fp->m_reconEncoded.wait(); /* If recon is not ready, current frame encoder need to wait. */ + while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0) + l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */ l0[j] = l0Fp->m_reconPic; } } @@ -474,8 +474,8 @@ int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) { int l1POC = framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_poc; Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC); - if (l1Fp->m_reconPic->m_picOrg[0] == NULL) - l1Fp->m_reconEncoded.wait(); /* If recon is not ready, current frame encoder need to wait. */ + while (l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].get() == 0) + l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. 
*/ l1[j] = l1Fp->m_reconPic; } } From 305677fdcf70c79719dfdcc6f4f8f230ea32bf68 Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Mon, 4 Dec 2017 16:13:45 +0530 Subject: [PATCH 13/51] remove unnecessary event m_reconEncoded from Frame class --- source/common/frame.h | 1 - source/encoder/frameencoder.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/source/common/frame.h b/source/common/frame.h index 50117a456b..c3b9a89f10 100644 --- a/source/common/frame.h +++ b/source/common/frame.h @@ -98,7 +98,6 @@ class Frame float* m_quantOffsets; // points to quantOffsets in x265_picture x265_sei m_userSEI; - Event m_reconEncoded; /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */ ThreadSafeInteger* m_reconRowFlag; // flag of CTU rows completely reconstructed and extended for motion reference diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp index 2e9cb7550e..ce24227c70 100644 --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -342,8 +342,6 @@ void FrameEncoder::threadMain() } compressFrame(); m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */ - if (m_frame != NULL) - m_frame->m_reconEncoded.trigger(); m_enable.wait(); } } From 0c6817890883462b8e41880a25fbafbc692f1407 Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Thu, 7 Dec 2017 10:59:12 +0530 Subject: [PATCH 14/51] rc: Fix inconsistency in --const-vbv (issue #381) VBV intializations during ABR-reset is removed. Inconsistent rowTotalBits in rc-update with --const-vbv is fixed. 
--- source/encoder/frameencoder.cpp | 4 ++-- source/encoder/ratecontrol.cpp | 4 +++- source/encoder/ratecontrol.h | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp index ce24227c70..2eb4447b5e 100644 --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -1746,8 +1746,8 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) if (rowInSlice == rowCount) { m_rowSliceTotalBits[sliceId] = 0; - if (bIsVbv) - { + if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront)) + { for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++) m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits; } diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp index 3864f86bfb..5193c1a888 100644 --- a/source/encoder/ratecontrol.cpp +++ b/source/encoder/ratecontrol.cpp @@ -219,6 +219,7 @@ RateControl::RateControl(x265_param& p) m_param->rc.vbvMaxBitrate = x265_clip3(0, 2000000, m_param->rc.vbvMaxBitrate); m_param->rc.vbvBufferInit = x265_clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit); m_param->vbvBufferEnd = x265_clip3(0.0, 2000000.0, m_param->vbvBufferEnd); + m_initVbv = false; m_singleFrameVbv = 0; m_rateTolerance = 1.0; @@ -319,7 +320,7 @@ RateControl::RateControl(x265_param& p) bool RateControl::init(const SPS& sps) { - if (m_isVbv) + if (m_isVbv && !m_initVbv) { /* We don't support changing the ABR bitrate right now, * so if the stream starts as CBR, keep it CBR. 
*/ @@ -353,6 +354,7 @@ bool RateControl::init(const SPS& sps) m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit; m_bufferFillActual = m_bufferFillFinal; m_bufferExcess = 0; + m_initVbv = true; } m_totalBits = 0; diff --git a/source/encoder/ratecontrol.h b/source/encoder/ratecontrol.h index 49618da824..4fac01a4bc 100644 --- a/source/encoder/ratecontrol.h +++ b/source/encoder/ratecontrol.h @@ -132,6 +132,7 @@ class RateControl bool m_isGrainEnabled; bool m_isAbrReset; bool m_isNextGop; + bool m_initVbv; int m_lastAbrResetPoc; double m_rateTolerance; From 317214a3b0d92165116c46f5eb04d65945f24881 Mon Sep 17 00:00:00 2001 From: Ma0 Date: Thu, 7 Dec 2017 09:19:11 +0100 Subject: [PATCH 15/51] remove unnecessary sqrt from MVTHRESHOLD checking Thanks MonoS Diablo for pointing this. --- source/encoder/encoder.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index e7d590316d..153b7dce67 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -50,10 +50,8 @@ const char g_sliceTypeToChar[] = {'B', 'P', 'I'}; /* Threshold for motion vection, based on expermental result. 
* TODO: come up an algorithm for adoptive threshold */ - -#define MVTHRESHOLD 10 +#define MVTHRESHOLD (10*10) #define PU_2Nx2N 1 - static const char* defaultAnalysisFileName = "x265_analysis.dat"; using namespace X265_NS; @@ -571,12 +569,12 @@ int Encoder::setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* (interData)->mvpIdx[k][cuPos + cuOffset] = (srcInterData)->mvpIdx[k][(mbIndex * 16) + cuOffset]; (interData)->refIdx[k][cuPos + cuOffset] = (srcInterData)->refIdx[k][(mbIndex * 16) + cuOffset]; memcpy(&(interData)->mv[k][cuPos + cuOffset], &(srcInterData)->mv[k][(mbIndex * 16) + cuOffset], sizeof(MV)); - if (m_param->analysisReuseLevel == 7) + if (m_param->analysisReuseLevel == 7 && numPU == PU_2Nx2N && + ((srcInterData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5))) { int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].x; int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].y; - double mv = sqrt((double)(mv_x*mv_x + mv_y*mv_y)); - if (numPU == PU_2Nx2N && ((srcInterData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5)) && mv <= MVTHRESHOLD) + if ((mv_x*mv_x + mv_y*mv_y) <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[k][cuPos + cuOffset], 1, bytes); } } @@ -654,12 +652,11 @@ int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_ (currInterData)->mvpIdx[i][count + pu] = (interData)->mvpIdx[i][d]; (currInterData)->refIdx[i][count + pu] = (interData)->refIdx[i][d]; memcpy(&(currInterData)->mv[i][count + pu], &(interData)->mv[i][d], sizeof(MV)); - if (m_param->analysisReuseLevel == 7) + if (m_param->analysisReuseLevel == 7 && numPU == PU_2Nx2N && m_param->num4x4Partitions <= 16) { int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].x; int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].y; - double mv = 
sqrt((double)(mv_x*mv_x + mv_y*mv_y)); - if (numPU == PU_2Nx2N && m_param->num4x4Partitions <= 16 && mv <= MVTHRESHOLD) + if ((mv_x*mv_x + mv_y*mv_y) <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[i][count + pu], 1, bytes); } } From 026149cf540975964dd9e4976e8d70975b3c722f Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Tue, 12 Dec 2017 18:04:04 +0530 Subject: [PATCH 16/51] modify api x265_get_ref_frame_list to provide POC lists for L0 and L1 references --- doc/reST/api.rst | 2 +- source/CMakeLists.txt | 2 +- source/encoder/api.cpp | 4 ++-- source/encoder/encoder.cpp | 4 +++- source/encoder/encoder.h | 2 +- source/x265.h | 4 ++-- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/reST/api.rst b/doc/reST/api.rst index 92d2dab9d4..4c6b71729d 100644 --- a/doc/reST/api.rst +++ b/doc/reST/api.rst @@ -206,7 +206,7 @@ changes made to the parameters for auto-detection and other reasons:: /* x265_get_ref_frame_list: * returns negative on error, 0 when access unit were output. 
* This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check */ - int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int); + int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int, int*, int*); **x265_encoder_ctu_info** may be used to provide additional CTU-specific information to the encoder:: diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 1e1d91d140..c859ff4cc0 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 147) +set(X265_BUILD 148) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index 2ae2bc82dc..955c941243 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -356,13 +356,13 @@ int x265_get_slicetype_poc_and_scenecut(x265_encoder *enc, int *slicetype, int * return -1; } -int x265_get_ref_frame_list(x265_encoder *enc, x265_picyuv** l0, x265_picyuv** l1, int sliceType, int poc) +int x265_get_ref_frame_list(x265_encoder *enc, x265_picyuv** l0, x265_picyuv** l1, int sliceType, int poc, int* pocL0, int* pocL1) { if (!enc) return -1; Encoder *encoder = static_cast(enc); - return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc); + return encoder->getRefFrameList((PicYuv**)l0, (PicYuv**)l1, sliceType, poc, pocL0, pocL1); } int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 153b7dce67..b8cab1c410 100644 --- a/source/encoder/encoder.cpp +++ 
b/source/encoder/encoder.cpp @@ -448,7 +448,7 @@ int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut return 0; } -int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) +int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1) { if (!(IS_X265_TYPE_I(sliceType))) { @@ -460,6 +460,7 @@ int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL) { int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc; + pocL0[j] = l0POC; Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC); while (l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].get() == 0) l0Fp->m_reconRowFlag[l0Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. */ @@ -471,6 +472,7 @@ int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc) if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic != NULL) { int l1POC = framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_poc; + pocL1[j] = l1POC; Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC); while (l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].get() == 0) l1Fp->m_reconRowFlag[l1Fp->m_numRows - 1].waitForChange(0); /* If recon is not ready, current frame encoder has to wait. 
*/ diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h index b71b89219a..5948346e96 100644 --- a/source/encoder/encoder.h +++ b/source/encoder/encoder.h @@ -208,7 +208,7 @@ class Encoder : public x265_encoder int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut); - int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc); + int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, int* pocL0, int* pocL1); int setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* curFrame); diff --git a/source/x265.h b/source/x265.h index 8f6d5ff90a..9d6b5d606f 100644 --- a/source/x265.h +++ b/source/x265.h @@ -1746,7 +1746,7 @@ int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int *slicetype, i /* x265_get_ref_frame_list: * returns negative on error, 0 when access unit were output. * This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check */ -int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int); +int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int, int*, int*); /* x265_set_analysis_data: * set the analysis data. The incoming analysis_data structure is assumed to be AVC-sized blocks. 
@@ -1823,7 +1823,7 @@ typedef struct x265_api int (*encoder_intra_refresh)(x265_encoder*); int (*encoder_ctu_info)(x265_encoder*, int, x265_ctu_info_t**); int (*get_slicetype_poc_and_scenecut)(x265_encoder*, int*, int*, int*); - int (*get_ref_frame_list)(x265_encoder*, x265_picyuv**, x265_picyuv**, int, int); + int (*get_ref_frame_list)(x265_encoder*, x265_picyuv**, x265_picyuv**, int, int, int*, int*); FILE* (*csvlog_open)(const x265_param*); void (*csvlog_frame)(const x265_param*, const x265_picture*); void (*csvlog_encode)(x265_encoder*, const x265_stats*, int, char**); From bf494da403ddefc92e2cf5322aaaf776688c3412 Mon Sep 17 00:00:00 2001 From: Divya Manivannan Date: Thu, 14 Dec 2017 15:57:01 +0530 Subject: [PATCH 17/51] analysis: avoid creating analysis file when useanalysisFile option is disabled --- source/encoder/encoder.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index b8cab1c410..072878df59 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -342,10 +342,8 @@ void Encoder::create() m_aborted = true; if (!m_lookahead->create()) m_aborted = true; - initRefIdx(); - - if (m_param->analysisReuseMode) + if (m_param->analysisReuseMode && m_param->bUseAnalysisFile) { const char* name = m_param->analysisReuseFileName; if (!name) @@ -3248,8 +3246,8 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x static uint64_t consumedBytes = 0; static uint64_t totalConsumedBytes = 0; uint32_t depthBytes = 0; - fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET); - + if (m_param->bUseAnalysisFile) + fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET); const x265_analysis_data *picData = &(picIn->analysisData); analysis_intra_data *intraPic = (analysis_intra_data *)picData->intraData; analysis_inter_data *interPic = (analysis_inter_data *)picData->interData; From b2c261cc10b9864dc52008c5579023603f65817f Mon Sep 17 00:00:00 2001 
From: Santhoshini Sekar Date: Tue, 19 Dec 2017 15:33:30 +0530 Subject: [PATCH 18/51] fix bugs in analysis-reuse-level=7 and refine-mv-type=AVC --- source/encoder/analysis.cpp | 57 +++++++++++++++++++++---------------- source/encoder/encoder.cpp | 21 +++++++------- 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index e3d21bf042..6eecc4375e 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -280,7 +280,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con /* generate residual for entire CTU at once and copy to reconPic */ encodeResidue(ctu, cuGeom); } - else if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7)) + else if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16)) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int posCTU = ctu.m_cuAddr * numPartition; @@ -459,11 +459,9 @@ void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t int bestCUQP = qp; int lambdaQP = lqp; - bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth); - if (m_param->analysisReuseLevel == 10) + if (m_param->analysisReuseLevel >= 7) doQPRefine = false; - if (doQPRefine) { uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost; @@ -1305,9 +1303,8 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& } } } - /* Step 1. 
Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */ - if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1]))) /* TODO: Re-evaluate if analysis load/save still works */ + if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) /* TODO: Re-evaluate if analysis load/save still works */ { /* Compute Merge Cost */ md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); @@ -1317,8 +1314,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth } - - if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1]))) + if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) { skipRecursion = md.bestMode->cu.isSkipped(0); if (mightSplit && depth >= minDepth && !skipRecursion) @@ -1329,10 +1325,8 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& skipRecursion = complexityCheckCU(*md.bestMode); } } - - if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16) + if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisReuseLevel == 7) skipRecursion = true; - /* Step 2. 
Evaluate each of the 4 split sub-blocks in series */ if (mightSplit && !skipRecursion) { @@ -1387,11 +1381,20 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& else splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits); } - /* If analysis mode is simple do not Evaluate other modes */ - if ((m_param->bMVType && cuGeom.numPartitions <= 16) && (m_slice->m_sliceType == P_SLICE || m_slice->m_sliceType == B_SLICE)) - mightNotSplit = !(m_checkMergeAndSkipOnly[0] || (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])); - + if (m_param->bMVType && m_param->analysisReuseLevel == 7) + { + if (m_slice->m_sliceType == P_SLICE) + { + if (m_checkMergeAndSkipOnly[0]) + skipModes = true; + } + else + { + if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1]) + skipModes = true; + } + } /* Split CUs * 0 1 * 2 3 */ @@ -1998,10 +2001,9 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& } } } - /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */ if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) || - (m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1]))) + (m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) { md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); @@ -2016,10 +2018,8 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode) skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0); } - - if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16) + if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisReuseLevel == 7) skipRecursion = true; - // estimate split cost /* Step 2. 
Evaluate each of the 4 split sub-blocks in series */ if (mightSplit && !skipRecursion) @@ -2071,11 +2071,20 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& checkDQPForSplitPred(*splitPred, cuGeom); } - /* If analysis mode is simple do not Evaluate other modes */ - if ((m_param->bMVType && cuGeom.numPartitions <= 16) && (m_slice->m_sliceType == P_SLICE || m_slice->m_sliceType == B_SLICE)) - mightNotSplit = !(m_checkMergeAndSkipOnly[0] || (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])); - + if (m_param->bMVType && m_param->analysisReuseLevel == 7) + { + if (m_slice->m_sliceType == P_SLICE) + { + if (m_checkMergeAndSkipOnly[0]) + skipModes = true; + } + else + { + if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1]) + skipModes = true; + } + } /* Split CUs * 0 1 * 2 3 */ diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 072878df59..d187ca1397 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -562,7 +562,7 @@ int Encoder::setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* { int cuOffset = cuI * bytes + pu; (interData)->mergeFlag[cuPos + cuOffset] = (srcInterData)->mergeFlag[(mbIndex * 16) + cuOffset]; - + (interData)->sadCost[cuPos + cuOffset] = (srcInterData)->sadCost[(mbIndex * 16) + cuOffset]; (interData)->interDir[cuPos + cuOffset] = (srcInterData)->interDir[(mbIndex * 16) + cuOffset]; for (uint32_t k = 0; k < numDir; k++) { @@ -570,10 +570,10 @@ int Encoder::setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* (interData)->refIdx[k][cuPos + cuOffset] = (srcInterData)->refIdx[k][(mbIndex * 16) + cuOffset]; memcpy(&(interData)->mv[k][cuPos + cuOffset], &(srcInterData)->mv[k][(mbIndex * 16) + cuOffset], sizeof(MV)); if (m_param->analysisReuseLevel == 7 && numPU == PU_2Nx2N && - ((srcInterData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5))) + ((interData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5))) { - int 
mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].x; - int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].y; + int mv_x = (interData)->mv[k][cuPos + cuOffset].x; + int mv_y = (interData)->mv[k][cuPos + cuOffset].y; if ((mv_x*mv_x + mv_y*mv_y) <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[k][cuPos + cuOffset], 1, bytes); } @@ -640,9 +640,10 @@ int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_ if (m_param->analysisReuseLevel > 4) { memset(&(currInterData)->partSize[count], (interData)->partSize[d], bytes); - int numPU = nbPartsTable[(currInterData)->partSize[d]]; - for (int pu = 0; pu < numPU; pu++, d++) + int numPU = nbPartsTable[(interData)->partSize[d]]; + for (int pu = 0; pu < numPU; pu++) { + if (pu) d++; (currInterData)->mergeFlag[count + pu] = (interData)->mergeFlag[d]; if (m_param->analysisReuseLevel >= 7) { @@ -654,8 +655,8 @@ int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_ memcpy(&(currInterData)->mv[i][count + pu], &(interData)->mv[i][d], sizeof(MV)); if (m_param->analysisReuseLevel == 7 && numPU == PU_2Nx2N && m_param->num4x4Partitions <= 16) { - int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].x; - int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[i][count + pu].y; + int mv_x = (currInterData)->mv[i][count + pu].x; + int mv_y = (currInterData)->mv[i][count + pu].y; if ((mv_x*mv_x + mv_y*mv_y) <= MVTHRESHOLD) memset(&curFrame->m_analysisData.modeFlag[i][count + pu], 1, bytes); } @@ -3061,14 +3062,14 @@ void Encoder::allocAnalysis(x265_analysis_data* analysis) if (m_param->analysisReuseLevel >= 7) { CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame); for (int dir = 0; dir < 
numDir; dir++) { CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame); - CHECKED_MALLOC(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame); } - /* Allocate intra in inter */ if (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames) { From 76f85d1429ab438d8684ffc4c1c4dab91d4b22c6 Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Thu, 21 Dec 2017 09:09:46 +0530 Subject: [PATCH 19/51] warn out saying that limitTU=3 or 4 with AVCINFO produces inconsistent output. Set mincusize to 8 when it is not 8 for MVtype=AVCINFO as AVCINFO expects so --- source/encoder/encoder.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index d187ca1397..ddda95bdfd 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -2449,6 +2449,18 @@ void Encoder::configure(x265_param *p) this->m_externalFlush = true; else this->m_externalFlush = false; + + if (p->bMVType == AVC_INFO && (p->limitTU == 3 || p->limitTU == 4)) + { + x265_log(p, X265_LOG_WARNING, "limit TU = 3 or 4 with MVType AVCINFO produces inconsistent output\n"); + } + + if (p->bMVType == AVC_INFO && p->minCUSize != 8) + { + p->minCUSize = 8; + x265_log(p, X265_LOG_WARNING, "Setting minCuSize = 8, AVCINFO expects 8x8 blocks\n"); + } + if (p->keyframeMax < 0) { /* A negative max GOP size indicates the user wants only one I frame at From cce6e95d4dfc5e470931013fed8330f5f567120f Mon Sep 17 00:00:00 2001 From: Divya Manivannan Date: Thu, 14 Dec 2017 16:53:44 +0530 Subject: [PATCH 20/51] analysis: avoid copying lowres vbv data when vbv is disabled --- 
source/encoder/encoder.cpp | 77 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index ddda95bdfd..cf7589ba8b 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -1047,11 +1047,14 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) inFrame->m_lowres.sliceType = sliceType; inFrame->m_lowres.bKeyframe = !!inFrame->m_analysisData.lookahead.keyframe; inFrame->m_lowres.bLastMiniGopBFrame = !!inFrame->m_analysisData.lookahead.lastMiniGopBFrame; - int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2; - for (int index = 0; index < vbvCount; index++) + if (m_rateControl->m_isVbv) { - inFrame->m_lowres.plannedSatd[index] = inFrame->m_analysisData.lookahead.plannedSatd[index]; - inFrame->m_lowres.plannedType[index] = inFrame->m_analysisData.lookahead.plannedType[index]; + int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2; + for (int index = 0; index < vbvCount; index++) + { + inFrame->m_lowres.plannedSatd[index] = inFrame->m_analysisData.lookahead.plannedSatd[index]; + inFrame->m_lowres.plannedType[index] = inFrame->m_analysisData.lookahead.plannedType[index]; + } } } } @@ -1165,26 +1168,29 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) pic_out->analysisData.satdCost *= factor; pic_out->analysisData.lookahead.keyframe = outFrame->m_lowres.bKeyframe; pic_out->analysisData.lookahead.lastMiniGopBFrame = outFrame->m_lowres.bLastMiniGopBFrame; - int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2; - for (int index = 0; index < vbvCount; index++) - { - pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index] * factor; - pic_out->analysisData.lookahead.plannedType[index] = outFrame->m_lowres.plannedType[index]; - } - for (uint32_t index = 0; index < pic_out->analysisData.numCuInHeight; index++) - { - 
outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv * factor; - outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv * factor; - } - pic_out->analysisData.lookahead.intraSatdForVbv = outFrame->m_analysisData.lookahead.intraSatdForVbv; - pic_out->analysisData.lookahead.satdForVbv = outFrame->m_analysisData.lookahead.satdForVbv; - for (uint32_t index = 0; index < pic_out->analysisData.numCUsInFrame; index++) + if (m_rateControl->m_isVbv) { - outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost * factor; - outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost * factor; + int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2; + for (int index = 0; index < vbvCount; index++) + { + pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index] * factor; + pic_out->analysisData.lookahead.plannedType[index] = outFrame->m_lowres.plannedType[index]; + } + for (uint32_t index = 0; index < pic_out->analysisData.numCuInHeight; index++) + { + outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv * factor; + outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv * factor; + } + pic_out->analysisData.lookahead.intraSatdForVbv = outFrame->m_analysisData.lookahead.intraSatdForVbv; + pic_out->analysisData.lookahead.satdForVbv = outFrame->m_analysisData.lookahead.satdForVbv; + for (uint32_t index = 0; index < pic_out->analysisData.numCUsInFrame; index++) + { + outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost * factor; + outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost * factor; + } + pic_out->analysisData.lookahead.intraVbvCost = 
outFrame->m_analysisData.lookahead.intraVbvCost; + pic_out->analysisData.lookahead.vbvCost = outFrame->m_analysisData.lookahead.vbvCost; } - pic_out->analysisData.lookahead.intraVbvCost = outFrame->m_analysisData.lookahead.intraVbvCost; - pic_out->analysisData.lookahead.vbvCost = outFrame->m_analysisData.lookahead.vbvCost; } writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData); if (m_param->bUseAnalysisFile) @@ -1351,15 +1357,18 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead) { frameEnc->m_dts = frameEnc->m_analysisData.lookahead.dts; - for (uint32_t index = 0; index < frameEnc->m_analysisData.numCuInHeight; index++) + if (m_rateControl->m_isVbv) { - frameEnc->m_encData->m_rowStat[index].intraSatdForVbv = frameEnc->m_analysisData.lookahead.intraSatdForVbv[index]; - frameEnc->m_encData->m_rowStat[index].satdForVbv = frameEnc->m_analysisData.lookahead.satdForVbv[index]; - } - for (uint32_t index = 0; index < frameEnc->m_analysisData.numCUsInFrame; index++) - { - frameEnc->m_encData->m_cuStat[index].intraVbvCost = frameEnc->m_analysisData.lookahead.intraVbvCost[index]; - frameEnc->m_encData->m_cuStat[index].vbvCost = frameEnc->m_analysisData.lookahead.vbvCost[index]; + for (uint32_t index = 0; index < frameEnc->m_analysisData.numCuInHeight; index++) + { + frameEnc->m_encData->m_rowStat[index].intraSatdForVbv = frameEnc->m_analysisData.lookahead.intraSatdForVbv[index]; + frameEnc->m_encData->m_rowStat[index].satdForVbv = frameEnc->m_analysisData.lookahead.satdForVbv[index]; + } + for (uint32_t index = 0; index < frameEnc->m_analysisData.numCUsInFrame; index++) + { + frameEnc->m_encData->m_cuStat[index].intraVbvCost = frameEnc->m_analysisData.lookahead.intraVbvCost[index]; + frameEnc->m_encData->m_cuStat[index].vbvCost = frameEnc->m_analysisData.lookahead.vbvCost[index]; + } } } if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != 
X265_TYPE_B) @@ -3032,7 +3041,7 @@ void Encoder::allocAnalysis(x265_analysis_data* analysis) { X265_CHECK(analysis->sliceType, "invalid slice type\n"); analysis->interData = analysis->intraData = NULL; - if (m_param->bDisableLookahead) + if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight); CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight); @@ -3103,10 +3112,9 @@ void Encoder::allocAnalysis(x265_analysis_data* analysis) freeAnalysis(analysis); m_aborted = true; } - void Encoder::freeAnalysis(x265_analysis_data* analysis) { - if (m_param->bDisableLookahead) + if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { X265_FREE(analysis->lookahead.satdForVbv); X265_FREE(analysis->lookahead.intraSatdForVbv); @@ -3309,10 +3317,9 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x if (m_param->scaleFactor) analysis->numPartitions *= factor; - /* Memory is allocated for inter and intra analysis data based on the slicetype */ allocAnalysis(analysis); - if (m_param->bDisableLookahead) + if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.intraVbvCost); X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.vbvCost); From 0d0adbf290dd3945e8f75284d3a3c93c46476b30 Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Thu, 21 Dec 2017 15:26:55 +0530 Subject: [PATCH 21/51] analysis: Enable analysis save and load in a single run. This patch does the following, 1. Deprecates the cli option --analysis-reuse-mode and introduces two new cli options --analysis-save and --analysis-load . 2. Deprecates the param analysisReuseMode and introduces two new param options analysisSave and analysisLoad. 
--- doc/reST/cli.rst | 21 ++-- source/CMakeLists.txt | 2 +- source/common/cudata.cpp | 4 +- source/common/param.cpp | 15 ++- source/encoder/analysis.cpp | 26 ++-- source/encoder/api.cpp | 2 +- source/encoder/encoder.cpp | 199 ++++++++++++++++--------------- source/encoder/encoder.h | 1 - source/encoder/frameencoder.cpp | 12 +- source/encoder/search.cpp | 6 +- source/encoder/slicetype.cpp | 12 +- source/test/regression-tests.txt | 17 +-- source/x265.cpp | 4 +- source/x265.h | 36 +++--- source/x265cli.h | 13 +- 15 files changed, 197 insertions(+), 173 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index cd6bb41cff..4a60d8aa94 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -863,21 +863,22 @@ Analysis re-use options, to improve performance when encoding the same sequence multiple times (presumably at varying bitrates). The encoder will not reuse analysis if slice type parameters do not match. -.. option:: --analysis-reuse-mode +.. option:: --analysis-save - This option allows reuse of analysis information from first pass to second pass. - :option:`--analysis-reuse-mode save` specifies that encoder outputs analysis information of each frame. - :option:`--analysis-reuse-mode load` specifies that encoder reuses analysis information from first pass. - There is no benefit using load mode without running encoder in save mode. Analysis data from save mode is - written to a file specified by :option:`--analysis-reuse-file`. The amount of analysis data stored/reused - is determined by :option:`--analysis-reuse-level`. By reading the analysis data writen by an earlier encode - of the same sequence, substantial redundant work may be avoided. Requires cutree, pmode to be off. Default 0. + Encoder outputs analysis information of each frame. Analysis data from save mode is + written to the file specified. Requires cutree, pmode to be off. Default disabled. + +.. option:: --analysis-load + + Encoder reuses analysis information from the file specified. 
By reading the analysis data written by + an earlier encode of the same sequence, substantial redundant work may be avoided. Requires cutree, pmode + to be off. Default disabled. - **Values:** off(0), save(1): dump analysis data, load(2): read analysis data + The amount of analysis data stored/reused is determined by :option:`--analysis-reuse-level`. .. option:: --analysis-reuse-file - Specify a filename for analysis data (see :option:`--analysis-reuse-mode`) + Specify a filename for `multi-pass-opt-analysis` and `multi-pass-opt-distortion`. If no filename is specified, x265_analysis.dat is used. .. option:: --analysis-reuse-level <1..10> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index c859ff4cc0..d7334e055f 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 148) +set(X265_BUILD 149) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/common/cudata.cpp b/source/common/cudata.cpp index a519372e4a..eada66cfe1 100644 --- a/source/common/cudata.cpp +++ b/source/common/cudata.cpp @@ -1626,7 +1626,7 @@ uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MV dir |= (1 << list); candMvField[count][list].mv = colmv; candMvField[count][list].refIdx = refIdx; - if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_log2CUSize[0] < 4) + if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && m_log2CUSize[0] < 4) { MV dist(MAX_MV, MAX_MV); candMvField[count][list].mv = dist; @@ -1791,7 +1791,7 @@ int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, 
uint32_t refI int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; int curPOC = m_slice->m_poc; - if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisReuseMode == X265_ANALYSIS_SAVE && (m_log2CUSize[0] < 4)) + if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && (m_log2CUSize[0] < 4)) { MV dist(MAX_MV, MAX_MV); pmv[numMvc++] = amvpCand[num++] = dist; diff --git a/source/common/param.cpp b/source/common/param.cpp index 4b037d50f1..f8c08a084e 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -197,10 +197,12 @@ void x265_param_default(x265_param* param) param->rdPenalty = 0; param->psyRd = 2.0; param->psyRdoq = 0.0; - param->analysisReuseMode = 0; + param->analysisReuseMode = 0; /*DEPRECATED*/ param->analysisMultiPassRefine = 0; param->analysisMultiPassDistortion = 0; param->analysisReuseFileName = NULL; + param->analysisSave = NULL; + param->analysisLoad = NULL; param->bIntraInBFrames = 0; param->bLossless = 0; param->bCULossless = 0; @@ -850,7 +852,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) p->rc.bStrictCbr = atobool(value); p->rc.pbFactor = 1.0; } - OPT("analysis-reuse-mode") p->analysisReuseMode = parseName(value, x265_analysis_names, bError); + OPT("analysis-reuse-mode") p->analysisReuseMode = parseName(value, x265_analysis_names, bError); /*DEPRECATED*/ OPT("sar") { p->vui.aspectRatioIdc = parseName(value, x265_sar_names, bError); @@ -1006,6 +1008,8 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) } } OPT("gop-lookahead") p->gopLookahead = atoi(value); + OPT("analysis-save") p->analysisSave = strdup(value); + OPT("analysis-load") p->analysisLoad = strdup(value); else return X265_PARAM_BAD_NAME; } @@ -1344,9 +1348,7 @@ int x265_check_params(x265_param* param) "Constant QP is incompatible with 2pass"); CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0), "Strict-cbr cannot be applied without specifying target 
bitrate or vbv bufsize"); - CHECK(param->analysisReuseMode && (param->analysisReuseMode < X265_ANALYSIS_OFF || param->analysisReuseMode > X265_ANALYSIS_LOAD), - "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD"); - CHECK(param->analysisReuseMode && (param->analysisReuseLevel < 1 || param->analysisReuseLevel > 10), + CHECK((param->analysisSave || param->analysisLoad) && (param->analysisReuseLevel < 1 || param->analysisReuseLevel > 10), "Invalid analysis refine level. Value must be between 1 and 10 (inclusive)"); CHECK(param->scaleFactor > 2, "Invalid scale-factor. Supports factor <= 2"); CHECK(param->rc.qpMax < QP_MIN || param->rc.qpMax > QP_MAX_MAX, @@ -1618,7 +1620,6 @@ char *x265_param2string(x265_param* p, int padx, int pady) s += sprintf(s, " psy-rd=%.2f", p->psyRd); s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq); BOOL(p->bEnableRdRefine, "rd-refine"); - s += sprintf(s, " analysis-reuse-mode=%d", p->analysisReuseMode); BOOL(p->bLossless, "lossless"); s += sprintf(s, " cbqpoffs=%d", p->cbQpOffset); s += sprintf(s, " crqpoffs=%d", p->crQpOffset); @@ -1716,6 +1717,8 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bEmitHDRSEI, "hdr"); BOOL(p->bHDROpt, "hdr-opt"); BOOL(p->bDhdr10opt, "dhdr10-opt"); + s += sprintf(s, " analysis-save=%s", p->analysisSave); + s += sprintf(s, " analysis-load=%s", p->analysisLoad); s += sprintf(s, " analysis-reuse-level=%d", p->analysisReuseLevel); s += sprintf(s, " scale-factor=%d", p->scaleFactor); s += sprintf(s, " refine-intra=%d", p->intraRefine); diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index 6eecc4375e..76b37cb0d8 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -207,11 +207,11 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con } } - if (m_param->analysisReuseMode && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10) + if ((m_param->analysisSave || 
m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10) { int numPredDir = m_slice->isInterP() ? 1 : 2; m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; - m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir]; + m_reuseRef = &m_reuseInterDataCTU->ref [ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir]; m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions]; m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions]; if (m_param->analysisReuseLevel > 4) @@ -219,7 +219,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions]; m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions]; } - if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE) + if (m_param->analysisSave && !m_param->analysisLoad) for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++) m_reuseRef[i] = -1; } @@ -228,7 +228,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con if (m_slice->m_sliceType == I_SLICE) { analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData; - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1) + if (m_param->analysisLoad && m_param->analysisReuseLevel > 1) { memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition); memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition); @@ -239,7 +239,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con } else { - bool bCopyAnalysis = ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || (m_param->bMVType && 
m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16)); + bool bCopyAnalysis = ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || (m_param->bMVType && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16)); bool BCompressInterCUrd0_4 = (m_param->bMVType && m_param->analysisReuseLevel >= 7 && m_param->rdLevel <= 4); bool BCompressInterCUrd5_6 = (m_param->bMVType && m_param->analysisReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6); bCopyAnalysis = bCopyAnalysis || BCompressInterCUrd0_4 || BCompressInterCUrd5_6; @@ -280,7 +280,7 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con /* generate residual for entire CTU at once and copy to reconPic */ encodeResidue(ctu, cuGeom); } - else if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16)) + else if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16)) { analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData; int posCTU = ctu.m_cuAddr * numPartition; @@ -651,7 +651,7 @@ uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom } /* Save Intra CUs TU depth only when analysis mode is OFF */ - if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisReuseMode) + if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && (!m_param->analysisSave && !m_param->analysisLoad)) { CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr); int8_t maxTUDepth = -1; @@ -1263,7 +1263,7 @@ SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& mightSplit &= !bDecidedDepth; } } - if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && 
m_param->analysisReuseLevel != 10)) + if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { @@ -1957,7 +1957,7 @@ SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& mightSplit &= !bDecidedDepth; } } - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) + if (m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]) { @@ -2890,7 +2890,7 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.cu.setPredModeSubParts(MODE_INTER); int numPredDir = m_slice->isInterP() ? 1 : 2; - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) + if (m_param->analysisLoad && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2932,7 +2932,7 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize } interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits); - if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) + if (m_param->analysisSave && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2954,7 +2954,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize interMode.cu.setPredModeSubParts(MODE_INTER); int numPredDir = m_slice->isInterP() ? 
1 : 2; - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) + if (m_param->analysisLoad && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; @@ -2988,7 +2988,7 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize /* predInterSearch sets interMode.sa8dBits, but this is ignored */ encodeResAndCalcRdInterCU(interMode, cuGeom); - if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) + if (m_param->analysisSave && m_reuseInterDataCTU && m_param->analysisReuseLevel > 1) { int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2; int index = 0; diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index 955c941243..21a58c18f2 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -398,7 +398,7 @@ void x265_picture_init(x265_param *param, x265_picture *pic) pic->userSEI.payloads = NULL; pic->userSEI.numPayloads = 0; - if (param->analysisReuseMode || (param->bMVType == AVC_INFO)) + if ((param->analysisSave || param->analysisLoad) || (param->bMVType == AVC_INFO)) { uint32_t widthInCU = (param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize; uint32_t heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize; diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index cf7589ba8b..b060289a64 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -77,7 +77,6 @@ Encoder::Encoder() m_param = NULL; m_latestParam = NULL; m_threadPool = NULL; - m_analysisFile = NULL; m_analysisFileIn = NULL; m_analysisFileOut = NULL; m_offsetEmergency = NULL; @@ -343,16 +342,28 @@ void Encoder::create() if (!m_lookahead->create()) m_aborted = true; initRefIdx(); - 
if (m_param->analysisReuseMode && m_param->bUseAnalysisFile) + if (m_param->analysisSave && m_param->bUseAnalysisFile) { - const char* name = m_param->analysisReuseFileName; - if (!name) - name = defaultAnalysisFileName; - const char* mode = m_param->analysisReuseMode == X265_ANALYSIS_LOAD ? "rb" : "wb"; - m_analysisFile = x265_fopen(name, mode); - if (!m_analysisFile) + char* temp = strcatFilename(m_param->analysisSave, ".temp"); + if (!temp) + m_aborted = true; + else + { + m_analysisFileOut = x265_fopen(temp, "wb"); + X265_FREE(temp); + } + if (!m_analysisFileOut) { - x265_log_file(NULL, X265_LOG_ERROR, "Analysis load/save: failed to open file %s\n", name); + x265_log_file(NULL, X265_LOG_ERROR, "Analysis save: failed to open file %s.temp\n", m_param->analysisSave); + m_aborted = true; + } + } + if (m_param->analysisLoad && m_param->bUseAnalysisFile) + { + m_analysisFileIn = x265_fopen(m_param->analysisLoad, "rb"); + if (!m_analysisFileIn) + { + x265_log_file(NULL, X265_LOG_ERROR, "Analysis load: failed to open file %s\n", m_param->analysisLoad); m_aborted = true; } } @@ -718,9 +729,6 @@ void Encoder::destroy() X265_FREE(m_offsetEmergency); - if (m_analysisFile) - fclose(m_analysisFile); - if (m_latestParam != NULL && m_latestParam != m_param) { if (m_latestParam->scalingLists != m_param->scalingLists) @@ -735,7 +743,7 @@ void Encoder::destroy() { int bError = 1; fclose(m_analysisFileOut); - const char* name = m_param->analysisReuseFileName; + const char* name = m_param->analysisSave ? 
m_param->analysisSave : m_param->analysisReuseFileName; if (!name) name = defaultAnalysisFileName; char* temp = strcatFilename(name, ".temp"); @@ -763,6 +771,8 @@ void Encoder::destroy() free((char*)m_param->numaPools); free((char*)m_param->masteringDisplayColorVolume); free((char*)m_param->toneMapFile); + free((char*)m_param->analysisSave); + free((char*)m_param->analysisLoad); PARAM_NS::x265_param_free(m_param); } } @@ -849,7 +859,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) if (m_exportedPic) { - if (!m_param->bUseAnalysisFile && m_param->analysisReuseMode == X265_ANALYSIS_SAVE) + if (!m_param->bUseAnalysisFile && m_param->analysisSave) freeAnalysis(&m_exportedPic->m_analysisData); ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); m_exportedPic = NULL; @@ -1034,7 +1044,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) /* In analysisSave mode, x265_analysis_data is allocated in pic_in and inFrame points to this */ /* Load analysis data before lookahead->addPicture, since sliceType has been decided */ - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD) + if (m_param->analysisLoad) { /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */ readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in); @@ -1119,7 +1129,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) x265_frame_stats* frameData = NULL; /* Free up pic_in->analysisData since it has already been used */ - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD || (m_param->bMVType && slice->m_sliceType != I_SLICE)) + if ((m_param->analysisLoad && !m_param->analysisSave) || (m_param->bMVType && slice->m_sliceType != I_SLICE)) freeAnalysis(&outFrame->m_analysisData); if (pic_out) @@ -1145,7 +1155,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) } /* Dump analysis data from pic_out to file in save mode and free */ - if (m_param->analysisReuseMode == 
X265_ANALYSIS_SAVE) + if (m_param->analysisSave) { pic_out->analysisData.poc = pic_out->poc; pic_out->analysisData.sliceType = pic_out->sliceType; @@ -1354,7 +1364,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) slice->m_maxNumMergeCand = m_param->maxNumMergeCand; slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * m_param->num4x4Partitions); } - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead) + if (m_param->analysisLoad && m_param->bDisableLookahead) { frameEnc->m_dts = frameEnc->m_analysisData.lookahead.dts; if (m_rateControl->m_isVbv) @@ -1423,7 +1433,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum; curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++; - if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead) + if (!m_param->analysisLoad || !m_param->bDisableLookahead) { if (m_bframeDelay) { @@ -1438,7 +1448,7 @@ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) } /* Allocate analysis data before encode in save mode. 
This is allocated in frameEnc */ - if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE) + if (m_param->analysisSave && !m_param->analysisLoad) { x265_analysis_data* analysis = &frameEnc->m_analysisData; analysis->poc = frameEnc->m_poc; @@ -2623,23 +2633,24 @@ void Encoder::configure(x265_param *p) p->rc.rfConstantMin = 0; } - if (p->analysisReuseMode && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation)) + if ((p->analysisLoad || p->analysisSave) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation)) { x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n"); p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0; } - if (p->analysisReuseMode && p->rc.cuTree) + if ((p->analysisLoad || p->analysisSave) && p->rc.cuTree) { x265_log(p, X265_LOG_WARNING, "Analysis load/save options works only with cu-tree off, Disabling cu-tree\n"); p->rc.cuTree = 0; } - if (p->analysisReuseMode && (p->analysisMultiPassRefine || p->analysisMultiPassDistortion)) + if ((p->analysisLoad || p->analysisSave) && (p->analysisMultiPassRefine || p->analysisMultiPassDistortion)) { x265_log(p, X265_LOG_WARNING, "Cannot use Analysis load/save option and multi-pass-opt-analysis/multi-pass-opt-distortion together," "Disabling Analysis load/save and multi-pass-opt-analysis/multi-pass-opt-distortion\n"); - p->analysisReuseMode = p->analysisMultiPassRefine = p->analysisMultiPassDistortion = 0; + p->analysisSave = p->analysisLoad = NULL; + p->analysisMultiPassRefine = p->analysisMultiPassDistortion = 0; } if (p->scaleFactor) { @@ -2647,16 +2658,16 @@ void Encoder::configure(x265_param *p) { p->scaleFactor = 0; } - else if (!p->analysisReuseMode || p->analysisReuseLevel < 10) + else if ((!p->analysisLoad && !p->analysisSave) || p->analysisReuseLevel < 10) { - x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis-reuse-mode, analysis-reuse-level 10. 
Disabling scale-factor.\n"); + x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis load/save, analysis-reuse-level 10. Disabling scale-factor.\n"); p->scaleFactor = 0; } } if (p->intraRefine) { - if (p->analysisReuseMode!= X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor) { x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling intra refine.\n"); p->intraRefine = 0; @@ -2665,7 +2676,7 @@ void Encoder::configure(x265_param *p) if (p->interRefine) { - if (p->analysisReuseMode != X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor) { x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling inter refine.\n"); p->interRefine = 0; @@ -2680,7 +2691,7 @@ void Encoder::configure(x265_param *p) if (p->mvRefine) { - if (p->analysisReuseMode != X265_ANALYSIS_LOAD || p->analysisReuseLevel < 10 || !p->scaleFactor) + if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor) { x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10, scale factor. 
Disabling MV refine.\n"); p->mvRefine = 0; @@ -2781,7 +2792,7 @@ void Encoder::configure(x265_param *p) m_conformanceWindow.bottomOffset = 0; m_conformanceWindow.leftOffset = 0; /* set pad size if width is not multiple of the minimum CU size */ - if (p->scaleFactor == 2 && ((p->sourceWidth / 2) & (p->minCUSize - 1)) && p->analysisReuseMode == X265_ANALYSIS_LOAD) + if (p->scaleFactor == 2 && ((p->sourceWidth / 2) & (p->minCUSize - 1)) && p->analysisLoad) { uint32_t rem = (p->sourceWidth / 2) & (p->minCUSize - 1); uint32_t padsize = p->minCUSize - rem; @@ -2970,7 +2981,7 @@ void Encoder::configure(x265_param *p) } } /* set pad size if height is not multiple of the minimum CU size */ - if (p->scaleFactor == 2 && ((p->sourceHeight / 2) & (p->minCUSize - 1)) && p->analysisReuseMode == X265_ANALYSIS_LOAD) + if (p->scaleFactor == 2 && ((p->sourceHeight / 2) & (p->minCUSize - 1)) && p->analysisLoad) { uint32_t rem = (p->sourceHeight / 2) & (p->minCUSize - 1); uint32_t padsize = p->minCUSize - rem; @@ -3268,30 +3279,30 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x static uint64_t totalConsumedBytes = 0; uint32_t depthBytes = 0; if (m_param->bUseAnalysisFile) - fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET); + fseeko(m_analysisFileIn, totalConsumedBytes, SEEK_SET); const x265_analysis_data *picData = &(picIn->analysisData); analysis_intra_data *intraPic = (analysis_intra_data *)picData->intraData; analysis_inter_data *interPic = (analysis_inter_data *)picData->interData; int poc; uint32_t frameRecordSize; - X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile, &(picData->frameRecordSize)); - X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile, &(picData->depthBytes)); - X265_FREAD(&poc, sizeof(int), 1, m_analysisFile, &(picData->poc)); + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize)); + X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, 
&(picData->depthBytes)); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc)); if (m_param->bUseAnalysisFile) { uint64_t currentOffset = totalConsumedBytes; /* Seeking to the right frame Record */ - while (poc != curPoc && !feof(m_analysisFile)) + while (poc != curPoc && !feof(m_analysisFileIn)) { currentOffset += frameRecordSize; - fseeko(m_analysisFile, currentOffset, SEEK_SET); - X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile, &(picData->frameRecordSize)); - X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile, &(picData->depthBytes)); - X265_FREAD(&poc, sizeof(int), 1, m_analysisFile, &(picData->poc)); + fseeko(m_analysisFileIn, currentOffset, SEEK_SET); + X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize)); + X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes)); + X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc)); } - if (poc != curPoc || feof(m_analysisFile)) + if (poc != curPoc || feof(m_analysisFileIn)) { x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc); freeAnalysis(analysis); @@ -3302,15 +3313,15 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x /* Now arrived at the right frame, read the record */ analysis->poc = poc; analysis->frameRecordSize = frameRecordSize; - X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile, &(picData->sliceType)); - X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFile, &(picData->bScenecut)); - X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile, &(picData->satdCost)); - X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile, &(picData->numCUsInFrame)); - X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile, &(picData->numPartitions)); + X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType)); + 
X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut)); + X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost)); + X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame)); + X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions)); if (m_param->bDisableLookahead) { - X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFile, &(picData->numCuInHeight)); - X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFile, &(picData->lookahead)); + X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight)); + X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead)); } int scaledNumPartition = analysis->numPartitions; int factor = 1 << m_param->scaleFactor; @@ -3321,10 +3332,10 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x allocAnalysis(analysis); if (m_param->bDisableLookahead && m_rateControl->m_isVbv) { - X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.intraVbvCost); - X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.vbvCost); - X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFile, picData->lookahead.satdForVbv); - X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFile, picData->lookahead.intraSatdForVbv); + X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost); + X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost); + 
X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv); + X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv); } if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { @@ -3338,9 +3349,9 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x modeBuf = tempBuf + depthBytes; partSizes = tempBuf + 2 * depthBytes; - X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->depth); - X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->chromaModes); - X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->partSizes); + X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->depth); + X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes); + X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->partSizes); size_t count = 0; for (uint32_t d = 0; d < depthBytes; d++) @@ -3361,12 +3372,12 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x if (!m_param->scaleFactor) { - X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile, intraPic->modes); + X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes); } else { uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); - X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFile, intraPic->modes); + X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); for (uint32_t ctu32Idx = 0, cnt = 0; 
ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); X265_FREE(tempLumaBuf); @@ -3379,7 +3390,7 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x { uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2; uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3; - X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile, (picIn->analysisData.wt)); + X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt)); if (m_param->analysisReuseLevel < 2) return; @@ -3401,33 +3412,33 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x depthBuf = tempBuf; modeBuf = tempBuf + depthBytes; - X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->depth); - X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->modes); + X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->depth); + X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->modes); if (m_param->analysisReuseLevel > 4) { partSize = modeBuf + depthBytes; mergeFlag = partSize + depthBytes; - X265_FREAD(partSize, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->partSize); - X265_FREAD(mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->mergeFlag); + X265_FREAD(partSize, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->partSize); + X265_FREAD(mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mergeFlag); if (m_param->analysisReuseLevel == 10) { interDir = mergeFlag + depthBytes; - X265_FREAD(interDir, sizeof(uint8_t), depthBytes, m_analysisFile, interPic->interDir); + X265_FREAD(interDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->interDir); if (bIntraInInter) { chromaDir 
= interDir + depthBytes; - X265_FREAD(chromaDir, sizeof(uint8_t), depthBytes, m_analysisFile, intraPic->chromaModes); + X265_FREAD(chromaDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes); } for (uint32_t i = 0; i < numDir; i++) { mvpIdx[i] = X265_MALLOC(uint8_t, depthBytes); refIdx[i] = X265_MALLOC(int8_t, depthBytes); mv[i] = X265_MALLOC(MV, depthBytes); - X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFile, interPic->mvpIdx[i]); - X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, m_analysisFile, interPic->refIdx[i]); - X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFile, interPic->mv[i]); + X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mvpIdx[i]); + X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, m_analysisFileIn, interPic->refIdx[i]); + X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFileIn, interPic->mv[i]); } } } @@ -3486,12 +3497,12 @@ void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x { if (!m_param->scaleFactor) { - X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile, intraPic->modes); + X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes); } else { uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition); - X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFile, intraPic->modes); + X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes); for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor) memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor); X265_FREE(tempLumaBuf); @@ -3499,7 +3510,7 @@ 
void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x } } else - X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile, interPic->ref); + X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref); consumedBytes += frameRecordSize; if (numDir == 1) @@ -3774,51 +3785,51 @@ void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncD if (!m_param->bUseAnalysisFile) return; - X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); - X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFile); - X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile); - X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile); - X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFile); - X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile); - X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile); - X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile); + X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFileOut); + X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFileOut); + X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFileOut); + X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFileOut); + X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFileOut); + X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileOut); + X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileOut); + X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFileOut); if (analysis->sliceType > X265_TYPE_I) - X265_FWRITE((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile); + X265_FWRITE((WeightParam*)analysis->wt, sizeof(WeightParam), 
numPlanes * numDir, m_analysisFileOut); if (m_param->analysisReuseLevel < 2) return; if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) { - X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut); } else { - X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut); if (m_param->analysisReuseLevel > 4) { - X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, 
m_analysisFileOut); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileOut); if (m_param->analysisReuseLevel == 10) { - X265_FWRITE(((analysis_inter_data*)analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFile); - if (bIntraInInter) X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFileOut); + if (bIntraInInter) X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut); for (uint32_t dir = 0; dir < numDir; dir++) { - X265_FWRITE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_inter_data*)analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFile); - X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFileOut); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFileOut); } if (bIntraInInter) - X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut); } } if (m_param->analysisReuseLevel != 10) - X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile); + 
X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileOut); } #undef X265_FWRITE diff --git a/source/encoder/encoder.h b/source/encoder/encoder.h index 5948346e96..4e6c439f06 100644 --- a/source/encoder/encoder.h +++ b/source/encoder/encoder.h @@ -130,7 +130,6 @@ class Encoder : public x265_encoder FrameEncoder* m_frameEncoder[X265_MAX_FRAME_THREADS]; DPB* m_dpb; Frame* m_exportedPic; - FILE* m_analysisFile; FILE* m_analysisFileIn; FILE* m_analysisFileOut; x265_param* m_param; diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp index ce24227c70..44093b9339 100644 --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -335,7 +335,7 @@ void FrameEncoder::threadMain() while (!m_frame->m_ctuInfo) m_frame->m_copied.wait(); } - if ((m_param->bMVType == AVC_INFO) && !m_param->analysisReuseMode && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType))) + if ((m_param->bMVType == AVC_INFO) && !m_param->analysisSave && !m_param->analysisLoad && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType))) { while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc)) m_frame->m_copyMVType.wait(); @@ -430,7 +430,7 @@ void FrameEncoder::compressFrame() bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; WeightParam* reuseWP = NULL; - if (m_param->analysisReuseMode && (bUseWeightP || bUseWeightB)) + if (m_param->analysisLoad && (bUseWeightP || bUseWeightB)) reuseWP = (WeightParam*)m_frame->m_analysisData.wt; if (bUseWeightP || bUseWeightB) @@ -439,7 +439,7 @@ void FrameEncoder::compressFrame() m_cuStats.countWeightAnalyze++; ScopedElapsedTime time(m_cuStats.weightAnalyzeTime); #endif - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD) + if (m_param->analysisLoad) { for (int list = 0; list < slice->isInterB() + 1; 
list++) { @@ -466,6 +466,8 @@ void FrameEncoder::compressFrame() else slice->disableWeights(); + if (m_param->analysisSave && (bUseWeightP || bUseWeightB)) + reuseWP = (WeightParam*)m_frame->m_analysisData.wt; // Generate motion references int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; for (int l = 0; l < numPredDir; l++) @@ -478,7 +480,7 @@ void FrameEncoder::compressFrame() slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic; m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param); } - if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB)) + if (m_param->analysisSave && (bUseWeightP || bUseWeightB)) { for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++) *(reuseWP++) = slice->m_weightPredTable[l][0][i]; @@ -1411,7 +1413,7 @@ void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) /* TODO: use defines from slicetype.h for lowres block size */ uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks; uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks; - if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead) + if (!m_param->analysisLoad || !m_param->bDisableLookahead) { cuStat.vbvCost = 0; cuStat.intraVbvCost = 0; diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index c890e31967..dd38b157a9 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -2073,7 +2073,7 @@ void Search::singleMotionEstimation(Search& master, Mode& interMode, const Predi int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref); MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; - if (!m_param->analysisReuseMode) /* Prevents load/save outputs from diverging if lowresMV is not available */ + if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */ { MV lmv = getLowresMV(interMode.cu, pu, list, ref); if 
(lmv.notZero()) @@ -2161,7 +2161,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours); /* Uni-directional prediction */ - if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) + if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10) || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO)) { for (int list = 0; list < numPredDir; list++) @@ -2297,7 +2297,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma int mvpIdx = selectMVP(cu, pu, amvp, list, ref); MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx]; - if (!m_param->analysisReuseMode) /* Prevents load/save outputs from diverging when lowresMV is not available */ + if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */ { MV lmv = getLowresMV(cu, pu, list, ref); if (lmv.notZero()) diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index 7129db2d2b..8513269709 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -746,7 +746,7 @@ void Lookahead::destroy() /* Called by API thread */ void Lookahead::addPicture(Frame& curFrame, int sliceType) { - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead) + if (m_param->analysisLoad && m_param->bDisableLookahead) { if (!m_filled) m_filled = true; @@ -847,7 +847,7 @@ Frame* Lookahead::getDecidedPicture() return out; } - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead) + if (m_param->analysisLoad && m_param->bDisableLookahead) return NULL; findJob(-1); /* run slicetypeDecide() if necessary */ @@ -906,13 +906,13 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame) default: return; } - if 
(m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead) + if (!m_param->analysisLoad || !m_param->bDisableLookahead) { X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n") if (m_param->rc.cuTree && !m_param->rc.bStatRead) /* update row satds based on cutree offsets */ curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); - else if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || m_param->scaleFactor) + else if (!m_param->analysisLoad || m_param->scaleFactor) { if (m_param->rc.aqMode) curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; @@ -1056,7 +1056,7 @@ void Lookahead::slicetypeDecide() { slicetypeAnalyse(frames, false); bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->scaleFactor && bIsVbv) + if (m_param->analysisLoad && m_param->scaleFactor && bIsVbv) { int numFrames; for (numFrames = 0; numFrames < maxSearch; numFrames++) @@ -1252,7 +1252,7 @@ void Lookahead::slicetypeDecide() frames[j + 1] = NULL; slicetypeAnalyse(frames, true); bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; - if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->scaleFactor && bIsVbv) + if (m_param->analysisLoad && m_param->scaleFactor && bIsVbv) { int numFrames; for (numFrames = 0; numFrames < maxSearch; numFrames++) diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index 48e2d43e32..45cd0f2b1c 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -18,17 +18,17 @@ BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-tempo BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1 
BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao -BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 2 --bitrate 7000 --limit-modes +BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --bitrate 7000 --limit-modes::--preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --bitrate 7000 --limit-modes BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4 -BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0 +BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3 -BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-reuse-mode=save --bitrate 7000 --tskip-fast --limit-tu 4::--preset veryslow --no-cutree --analysis-reuse-mode=load --bitrate 7000 --tskip-fast --limit-tu 4 +BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 4::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 4 BasketballDrive_1920x1080_50.y4m,--preset veryslow 
--recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190 -Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 1 --bitrate 15000::--preset veryfast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 1 --bitrate 15000 +Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16 @@ -52,7 +52,7 @@ DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3 -DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-reuse-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-reuse-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1 +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-save x265_analysis.dat --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1::--preset fast --no-cutree --analysis-load x265_analysis.dat --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1 FourPeople_1280x720_60.y4m,--preset 
superfast --no-wpp --lookahead-slices 2 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd @@ -69,8 +69,8 @@ KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16 KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2 -NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-reuse-mode=save --rd 5 --analysis-reuse-level 10 --bitrate 9000::--preset slow --no-cutree --analysis-reuse-mode=load --rd 5 --analysis-reuse-level 10 --bitrate 9000 -News-4k.y4m,--preset ultrafast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 2 --bitrate 15000::--preset ultrafast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 2 --bitrate 15000 +NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 +News-4k.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000::--preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16 @@ -125,7 +125,7 @@ old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32 old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency 
-old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-reuse-mode=save --analysis-reuse-level 1 --bitrate 3000 --early-skip::--preset fast --no-cutree --analysis-reuse-mode=load --analysis-reuse-level 1 --bitrate 3000 --early-skip +old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-save pass1_analysis.dat --analysis-reuse-level 1 --bitrate 3000 --early-skip::--preset fast --no-cutree --analysis-load pass1_analysis.dat --analysis-save pass2_analysis.dat --analysis-reuse-level 1 --bitrate 3000 --early-skip::--preset fast --no-cutree --analysis-load pass2_analysis.dat --analysis-reuse-level 1 --bitrate 3000 --early-skip old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless @@ -151,6 +151,7 @@ Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --sli Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2 big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14 + # Main12 intraCost overflow bug test 720p50_parkrun_ter.y4m,--preset medium diff --git a/source/x265.cpp b/source/x265.cpp index e9d0bf97bb..b2b31f2ec9 100644 --- a/source/x265.cpp +++ b/source/x265.cpp @@ -585,9 +585,9 @@ int main(int argc, char **argv) x265_picture pic_orig, pic_out; x265_picture *pic_in = &pic_orig; - /* Allocate recon picture if analysisReuseMode is enabled */ + /* Allocate recon picture if analysis save/load is enabled */ std::priority_queue* pts_queue = cliopt.output->needPTS() ? new std::priority_queue() : NULL; - x265_picture *pic_recon = (cliopt.recon || !!param->analysisReuseMode || pts_queue || reconPlay || param->csvLogLevel) ? 
&pic_out : NULL; + x265_picture *pic_recon = (cliopt.recon || param->analysisSave || param->analysisLoad || pts_queue || reconPlay || param->csvLogLevel) ? &pic_out : NULL; uint32_t inFrameCount = 0; uint32_t outFrameCount = 0; x265_nal *p_nal; diff --git a/source/x265.h b/source/x265.h index 9d6b5d606f..808586d7ea 100644 --- a/source/x265.h +++ b/source/x265.h @@ -327,15 +327,15 @@ typedef struct x265_picture * to allow the encoder to determine base QP */ int forceqp; - /* If param.analysisReuseMode is X265_ANALYSIS_OFF this field is ignored on input - * and output. Else the user must call x265_alloc_analysis_data() to - * allocate analysis buffers for every picture passed to the encoder. + /* If param.analysisLoad and param.analysisSave are disabled, this field is + * ignored on input and output. Else the user must call x265_alloc_analysis_data() + * to allocate analysis buffers for every picture passed to the encoder. * - * On input when param.analysisReuseMode is X265_ANALYSIS_LOAD and analysisData + * On input when param.analysisLoad is enabled and analysisData * member pointers are valid, the encoder will use the data stored here to * reduce encoder work. * - * On output when param.analysisReuseMode is X265_ANALYSIS_SAVE and analysisData + * On output when param.analysisSave is enabled and analysisData * member pointers are valid, the encoder will write output analysis into * this data structure */ x265_analysis_data analysisData; @@ -484,11 +484,6 @@ typedef enum #define X265_EXTENDED_SAR 255 /* aspect ratio explicitly specified as width:height */ -/* Analysis options */ -#define X265_ANALYSIS_OFF 0 -#define X265_ANALYSIS_SAVE 1 -#define X265_ANALYSIS_LOAD 2 - typedef struct x265_cli_csp { int planes; @@ -1129,13 +1124,13 @@ typedef struct x265_param * Default disabled */ int bEnableRdRefine; - /* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis - * buffers. 
if X265_ANALYSIS_LOAD, read analysis information into analysis - * buffer and use this analysis information to reduce the amount of work - * the encoder must perform. Default X265_ANALYSIS_OFF */ + /* If save, write per-frame analysis information into analysis buffers. + * If load, read analysis information into analysis buffer and use this + * analysis information to reduce the amount of work the encoder must perform. + * Default disabled. Now deprecated*/ int analysisReuseMode; - /* Filename for analysisReuseMode save/load. Default name is "x265_analysis.dat" */ + /* Filename for multi-pass-opt-analysis/distortion. Default name is "x265_analysis.dat" */ const char* analysisReuseFileName; /*== Rate Control ==*/ @@ -1455,7 +1450,7 @@ typedef struct x265_param int bHDROpt; /* A value between 1 and 10 (both inclusive) determines the level of - * information stored/reused in save/load analysis-reuse-mode. Higher the refine + * information stored/reused in analysis save/load. Higher the refine * level higher the information stored/reused. Default is 5 */ int analysisReuseLevel; @@ -1539,7 +1534,16 @@ typedef struct x265_param * within this from the gop boundary set by keyint, the GOP will be extented until such a point, * otherwise the GOP will be terminated as set by keyint*/ int gopLookahead; + + /*Write per-frame analysis information into analysis buffers. Default disabled. */ + const char* analysisSave; + + /* Read analysis information into analysis buffer and use this analysis information + * to reduce the amount of work the encoder must perform. Default disabled. */ + const char* analysisLoad; + } x265_param; + /* x265_param_alloc: * Allocates an x265_param instance. 
The returned param structure is not * special in any way, but using this method together with x265_param_free() diff --git a/source/x265cli.h b/source/x265cli.h index 3b59cf3a91..5f8a69b9e1 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -254,9 +254,11 @@ static const struct option long_options[] = { "no-slow-firstpass", no_argument, NULL, 0 }, { "multi-pass-opt-rps", no_argument, NULL, 0 }, { "no-multi-pass-opt-rps", no_argument, NULL, 0 }, - { "analysis-reuse-mode", required_argument, NULL, 0 }, - { "analysis-reuse-file", required_argument, NULL, 0 }, + { "analysis-reuse-mode", required_argument, NULL, 0 }, /* DEPRECATED */ + { "analysis-reuse-file", required_argument, NULL, 0 }, { "analysis-reuse-level", required_argument, NULL, 0 }, + { "analysis-save", required_argument, NULL, 0 }, + { "analysis-load", required_argument, NULL, 0 }, { "scale-factor", required_argument, NULL, 0 }, { "refine-intra", required_argument, NULL, 0 }, { "refine-inter", required_argument, NULL, 0 }, @@ -465,18 +467,19 @@ static void showHelp(x265_param *param) H0(" --[no-]analyze-src-pics Motion estimation uses source frame planes. Default disable\n"); H0(" --[no-]slow-firstpass Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass)); H0(" --[no-]strict-cbr Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr)); - H0(" --analysis-reuse-mode save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisReuseMode); + H0(" --analysis-save Dump analysis info into the specified file. Default Disabled\n"); + H0(" --analysis-load Load analysis buffers from the file specified. Default Disabled\n"); H0(" --analysis-reuse-file Specify file name used for either dumping or reading analysis data. 
Deault x265_analysis.dat\n"); H0(" --analysis-reuse-level <1..10> Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel); H0(" --refine-mv-type Reuse MV information received through API call. Supported option is avc. Default disabled - %d\n", param->bMVType); H0(" --scale-factor Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor); - H0(" --refine-intra <0..3> Enable intra refinement for encode that uses analysis-reuse-mode=load.\n" + H0(" --refine-intra <0..3> Enable intra refinement for encode that uses analysis-load.\n" " - 0 : Forces both mode and depth from the save encode.\n" " - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n" " - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n" " - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n" " Default:%d\n", param->intraRefine); - H0(" --refine-inter <0..3> Enable inter refinement for encode that uses analysis-reuse-mode=load.\n" + H0(" --refine-inter <0..3> Enable inter refinement for encode that uses analysis-load.\n" " - 0 : Forces both mode and depth from the save encode.\n" " - 1 : Functionality of (0) + evaluate all inter modes at min-cu-size's depth when current depth is one smaller than\n" " min-cu-size's depth. 
When save encode decides the current block as skip(for all sizes) evaluate skip/merge.\n" From abbe51b956d4c438e8c16bd72738f40c982a364e Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Fri, 22 Dec 2017 12:48:37 +0530 Subject: [PATCH 22/51] Fix possible NULL pointer dereferencing in cudata init --- source/common/framedata.cpp | 11 ++++++----- source/encoder/analysis.cpp | 17 +++++++++-------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp index 6292b9f628..e0ecd57149 100644 --- a/source/common/framedata.cpp +++ b/source/common/framedata.cpp @@ -40,11 +40,12 @@ bool FrameData::create(const x265_param& param, const SPS& sps, int csp) m_spsrpsIdx = -1; if (param.rc.bStatWrite) m_spsrps = const_cast(sps.spsrps); - - m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param); - for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++) - m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr); - + bool isallocated = m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param); + if (isallocated) + for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++) + m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr); + else + return false; CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame); CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight); reinit(sps); diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index e3d21bf042..86436c821a 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -100,16 +100,17 @@ bool Analysis::create(ThreadLocalData *tld) for (uint32_t depth = 0; depth <= m_param->maxCUDepth; depth++, cuSize >>= 1) { ModeDepth &md = m_modeDepth[depth]; - - md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param); + ok &= md.cuMemPool.create(depth, csp, MAX_PRED_TYPES, *m_param); ok &= md.fencYuv.create(cuSize, csp); - - for (int j = 0; j < MAX_PRED_TYPES; j++) + if (ok) { - 
md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j); - ok &= md.pred[j].predYuv.create(cuSize, csp); - ok &= md.pred[j].reconYuv.create(cuSize, csp); - md.pred[j].fencYuv = &md.fencYuv; + for (int j = 0; j < MAX_PRED_TYPES; j++) + { + md.pred[j].cu.initialize(md.cuMemPool, depth, *m_param, j); + ok &= md.pred[j].predYuv.create(cuSize, csp); + ok &= md.pred[j].reconYuv.create(cuSize, csp); + md.pred[j].fencYuv = &md.fencYuv; + } } } if (m_param->sourceHeight >= 1080) From 77c886e3cab825b95a610ebe18cabe61f77d93ed Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Fri, 22 Dec 2017 14:50:08 +0530 Subject: [PATCH 23/51] Add support for RADL pictures --- doc/reST/cli.rst | 9 +++- source/CMakeLists.txt | 2 +- source/common/lowres.cpp | 6 +-- source/common/lowres.h | 4 +- source/common/param.cpp | 5 +++ source/encoder/dpb.cpp | 18 +++++--- source/encoder/encoder.cpp | 6 +++ source/encoder/search.cpp | 2 +- source/encoder/slicetype.cpp | 69 ++++++++++++++++++++--------- source/encoder/weightPrediction.cpp | 2 +- source/test/regression-tests.txt | 1 + source/x265.h | 3 ++ source/x265cli.h | 2 + 13 files changed, 92 insertions(+), 37 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 4a60d8aa94..74a14d8fa6 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1345,7 +1345,14 @@ Slice decision options This value represents the percentage difference between the inter cost and intra cost of a frame used in scenecut detection. For example, a value of 5 indicates, if the inter cost of a frame is greater than or equal to 95 percent of the intra cost of the frame, - then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5. + then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5. + +.. option:: --radl + + Number of RADL pictures allowed in front of IDR. Requires fixed keyframe interval. + Recommended value is 2-3. Default 0 (disabled). + + **Range of values: Between 0 and `--bframes` ..
option:: --ctu-info <0, 1, 2, 4, 6> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index d7334e055f..e17cca9f56 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 149) +set(X265_BUILD 150) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp index d166f5d25e..e81fc23e72 100644 --- a/source/common/lowres.cpp +++ b/source/common/lowres.cpp @@ -89,7 +89,7 @@ bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgS } } - for (int i = 0; i < bframes + 1; i++) + for (int i = 0; i < bframes + 2; i++) { CHECKED_MALLOC(lowresMvs[0][i], MV, cuCount); CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount); @@ -118,7 +118,7 @@ void Lowres::destroy() } } - for (int i = 0; i < bframes + 1; i++) + for (int i = 0; i < bframes + 2; i++) { X265_FREE(lowresMvs[0][i]); X265_FREE(lowresMvs[1][i]); @@ -152,7 +152,7 @@ void Lowres::init(PicYuv *origPic, int poc) for (int x = 0; x < bframes + 2; x++) rowSatds[y][x][0] = -1; - for (int i = 0; i < bframes + 1; i++) + for (int i = 0; i < bframes + 2; i++) { lowresMvs[0][i][0].x = 0x7FFF; lowresMvs[1][i][0].x = 0x7FFF; diff --git a/source/common/lowres.h b/source/common/lowres.h index 072739e80e..ed7642cfbd 100644 --- a/source/common/lowres.h +++ b/source/common/lowres.h @@ -130,8 +130,8 @@ struct Lowres : public ReferencePlanes int64_t satdCost; uint16_t* lowresCostForRc; uint16_t* lowresCosts[X265_BFRAME_MAX + 2][X265_BFRAME_MAX + 2]; - int32_t* lowresMvCosts[2][X265_BFRAME_MAX + 1]; - MV* lowresMvs[2][X265_BFRAME_MAX + 1]; + int32_t* lowresMvCosts[2][X265_BFRAME_MAX + 
2]; + MV* lowresMvs[2][X265_BFRAME_MAX + 2]; uint32_t maxBlocksInRow; uint32_t maxBlocksInCol; uint32_t maxBlocksInRowFullRes; diff --git a/source/common/param.cpp b/source/common/param.cpp index f8c08a084e..ab8f419391 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -154,6 +154,7 @@ void x265_param_default(x265_param* param) param->lookaheadSlices = 8; param->lookaheadThreads = 0; param->scenecutBias = 5.0; + param->radl = 0; /* Intra Coding Tools */ param->bEnableConstrainedIntra = 0; param->bEnableStrongIntraSmoothing = 1; @@ -1010,6 +1011,7 @@ int x265_param_parse(x265_param* p, const char* name, const char* value) OPT("gop-lookahead") p->gopLookahead = atoi(value); OPT("analysis-save") p->analysisSave = strdup(value); OPT("analysis-load") p->analysisLoad = strdup(value); + OPT("radl") p->radl = atoi(value); else return X265_PARAM_BAD_NAME; } @@ -1316,6 +1318,8 @@ int x265_check_params(x265_param* param) "scenecutThreshold must be greater than 0"); CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias, "scenecut-bias must be between 0 and 100"); + CHECK(param->radl < 0 || param->radl > param->bframes, + "radl must be between 0 and bframes"); CHECK(param->rdPenalty < 0 || param->rdPenalty > 2, "Valid penalty for 32x32 intra TU in non-I slices. 
0:disabled 1:RD-penalty 2:maximum"); CHECK(param->keyframeMax < -1, @@ -1575,6 +1579,7 @@ char *x265_param2string(x265_param* p, int padx, int pady) s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth); s += sprintf(s, " lookahead-slices=%d", p->lookaheadSlices); s += sprintf(s, " scenecut=%d", p->scenecutThreshold); + s += sprintf(s, " radl=%d", p->radl); BOOL(p->bIntraRefresh, "intra-refresh"); s += sprintf(s, " ctu=%d", p->maxCUSize); s += sprintf(s, " min-cu-size=%d", p->minCUSize); diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp index c225cf3eff..19d41f9cdd 100644 --- a/source/encoder/dpb.cpp +++ b/source/encoder/dpb.cpp @@ -181,7 +181,10 @@ void DPB::prepareEncode(Frame *newFrame) // Mark pictures in m_piclist as unreferenced if they are not included in RPS applyReferencePictureSet(&slice->m_rps, pocCurr); - slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC + if (slice->m_sliceType != I_SLICE) + slice->m_numRefIdx[0] = x265_clip3(1, newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); + else + slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures); slice->setRefPicList(m_picList); @@ -230,11 +233,14 @@ void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBu { if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences) { - rps->poc[poci] = iterPic->m_poc; - rps->deltaPOC[poci] = rps->poc[poci] - curPoc; - (rps->deltaPOC[poci] < 0) ? numNeg++ : numPos++; - rps->bUsed[poci] = !isRAP; - poci++; + if ((m_lastIDR >= curPoc) || (m_lastIDR <= iterPic->m_poc)) + { + rps->poc[poci] = iterPic->m_poc; + rps->deltaPOC[poci] = rps->poc[poci] - curPoc; + (rps->deltaPOC[poci] < 0) ? 
numNeg++ : numPos++; + rps->bUsed[poci] = !isRAP; + poci++; + } } iterPic = iterPic->m_next; } diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index b060289a64..825e7b3f7d 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -3046,6 +3046,12 @@ void Encoder::configure(x265_param *p) p->maxCUDepth = p->maxLog2CUSize - g_log2Size[p->minCUSize]; p->unitSizeDepth = p->maxLog2CUSize - LOG2_UNIT_SIZE; p->num4x4Partitions = (1U << (p->unitSizeDepth << 1)); + + if (p->radl && (p->keyframeMax != p->keyframeMin)) + { + p->radl = 0; + x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n"); + } } void Encoder::allocAnalysis(x265_analysis_data* analysis) diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index dd38b157a9..d848c9a60d 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -1947,7 +1947,7 @@ MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int /* poc difference is out of range for lookahead */ return 0; - MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc - 1]; + MV* mvs = m_frame->m_lowres.lowresMvs[list][diffPoc]; if (mvs[0].x == 0x7FFF) /* this motion search was not estimated by lookahead */ return 0; diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index 8513269709..9f73a3e211 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -879,7 +879,7 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame) Slice *slice = curFrame->m_encData->m_slice; int p0 = 0, p1, b; int poc = slice->m_poc; - int l0poc = slice->m_refPOCList[0][0]; + int l0poc = slice->m_rps.numberOfNegativePictures ? 
slice->m_refPOCList[0][0] : -1; int l1poc = slice->m_refPOCList[1][0]; switch (slice->m_sliceType) @@ -896,11 +896,22 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame) break; case B_SLICE: - b = poc - l0poc; - p1 = b + l1poc - poc; - frames[p0] = &slice->m_refFrameList[0][0]->m_lowres; - frames[b] = &curFrame->m_lowres; - frames[p1] = &slice->m_refFrameList[1][0]->m_lowres; + if (l0poc >= 0) + { + b = poc - l0poc; + p1 = b + l1poc - poc; + frames[p0] = &slice->m_refFrameList[0][0]->m_lowres; + frames[b] = &curFrame->m_lowres; + frames[p1] = &slice->m_refFrameList[1][0]->m_lowres; + } + else + { + p0 = b = 0; + p1 = b + l1poc - poc; + frames[p0] = frames[b] = &curFrame->m_lowres; + frames[p1] = &slice->m_refFrameList[1][0]->m_lowres; + } + break; default: @@ -1120,12 +1131,20 @@ void Lookahead::slicetypeDecide() /* Closed GOP */ m_lastKeyframe = frm.frameNum; frm.bKeyframe = true; - if (bframes > 0) + if (bframes > 0 && !m_param->radl) { list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; bframes--; } } + if (m_param->radl && !m_param->bOpenGOP && list[bframes + 1]) + { + if ((frm.frameNum - m_lastKeyframe) > (m_param->keyframeMax - m_param->radl - 1) && (frm.frameNum - m_lastKeyframe) < m_param->keyframeMax) + frm.sliceType = X265_TYPE_B; + if ((frm.frameNum - m_lastKeyframe) == (m_param->keyframeMax - m_param->radl - 1)) + frm.sliceType = X265_TYPE_P; + } + if (bframes == m_param->bframes || !list[bframes + 1]) { if (IS_X265_TYPE_B(frm.sliceType)) @@ -1175,8 +1194,13 @@ void Lookahead::slicetypeDecide() if (bframes) { p0 = 0; // last nonb + bool isp0available = frames[bframes + 1]->sliceType == X265_TYPE_IDR ? 
false : true; + for (b = 1; b <= bframes; b++) { + if (!isp0available) + p0 = b; + if (frames[b]->sliceType == X265_TYPE_B) for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) ; // find new nonb or bref @@ -1186,7 +1210,10 @@ void Lookahead::slicetypeDecide() estGroup.singleCost(p0, p1, b); if (frames[b]->sliceType == X265_TYPE_BREF) + { p0 = b; + isp0available = true; + } } } } @@ -1413,12 +1440,12 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) continue; /* Skip search if already done */ - if (frames[b]->lowresMvs[0][i - 1][0].x != 0x7FFF) + if (frames[b]->lowresMvs[0][i][0].x != 0x7FFF) continue; /* perform search to p1 at same distance, if possible */ int p1 = b + i; - if (p1 >= numFrames || frames[b]->lowresMvs[1][i - 1][0].x != 0x7FFF) + if (p1 >= numFrames || frames[b]->lowresMvs[1][i][0].x != 0x7FFF) p1 = b; estGroup.add(p0, p1, b); @@ -1440,7 +1467,7 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) /* only measure frame cost in this pass if motion searches * are already done */ - if (frames[b]->lowresMvs[0][i - 1][0].x == 0x7FFF) + if (frames[b]->lowresMvs[0][i][0].x == 0x7FFF) continue; int p0 = b - i; @@ -1452,7 +1479,7 @@ void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) break; /* ensure P1 search is done */ - if (j && frames[b]->lowresMvs[1][j - 1][0].x == 0x7FFF) + if (j && frames[b]->lowresMvs[1][j][0].x == 0x7FFF) continue; /* ensure frame cost is not done */ @@ -1867,7 +1894,7 @@ void Lookahead::aqMotion(Lowres **frames, bool bIntra) void Lookahead::calcMotionAdaptiveQuantFrame(Lowres **frames, int p0, int p1, int b) { - int listDist[2] = { b - p0 - 1, p1 - b - 1 }; + int listDist[2] = { b - p0, p1 - b }; int32_t strideInCU = m_8x8Width; double qp_adj = 0, avg_adj = 0, avg_adj_pow2 = 0, sd; for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++) @@ -2030,7 +2057,7 @@ void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int int32_t distScaleFactor = (((b - p0) << 
8) + ((p1 - p0) >> 1)) / (p1 - p0); int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; - int listDist[2] = { b - p0 - 1, p1 - b - 1 }; + int listDist[2] = { b - p0, p1 - b }; memset(m_scratch, 0, m_8x8Width * sizeof(int)); @@ -2305,17 +2332,15 @@ int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, score = fenc->costEst[b - p0][p1 - b]; else { - X265_CHECK(p0 != b, "I frame estimates should always be pre-calculated\n"); - bool bDoSearch[2]; - bDoSearch[0] = p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; - bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; + bDoSearch[0] = fenc->lowresMvs[0][b - p0][0].x == 0x7FFF; + bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFF; #if CHECKED_BUILD - X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFE), "motion search batch duplication L0\n"); - X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFE), "motion search batch duplication L1\n"); - if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0x7FFE; - if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0x7FFE; + X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0][0].x == 0x7FFE), "motion search batch duplication L0\n"); + X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b][0].x == 0x7FFE), "motion search batch duplication L1\n"); + if (bDoSearch[0]) fenc->lowresMvs[0][b - p0][0].x = 0x7FFE; + if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b][0].x = 0x7FFE; #endif fenc->weightedRef[b - p0].isWeighted = false; @@ -2406,7 +2431,7 @@ void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. 
*/ int lowresPenalty = 4; - int listDist[2] = { b - p0 - 1, p1 - b - 1 }; + int listDist[2] = { b - p0, p1 - b}; MV mvmin, mvmax; int bcost = tld.me.COST_MAX; diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp index 7c16ac323b..19255f9acf 100644 --- a/source/encoder/weightPrediction.cpp +++ b/source/encoder/weightPrediction.cpp @@ -323,7 +323,7 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param) if (!plane && diffPoc <= param.bframes + 1) { - mvs = fenc.lowresMvs[list][diffPoc - 1]; + mvs = fenc.lowresMvs[list][diffPoc]; /* test whether this motion search was performed by lookahead */ if (mvs[0].x != 0x7FFF) diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index 45cd0f2b1c..51a1b0f338 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -151,6 +151,7 @@ Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --sli Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2 big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14 +BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 # Main12 intraCost overflow bug test 720p50_parkrun_ter.y4m,--preset medium diff --git a/source/x265.h b/source/x265.h index 808586d7ea..0ed077d5ee 100644 --- a/source/x265.h +++ b/source/x265.h @@ -1268,6 +1268,7 @@ typedef struct x265_param /* internally enable if tune grain is set */ int bEnableConstVbv; + } rc; /*== Video Usability Information ==*/ @@ -1542,6 +1543,8 @@ typedef struct x265_param * to reduce the amount of work the encoder must perform. Default disabled. 
*/ const char* analysisLoad; + /*Number of RADL pictures allowed in front of IDR*/ + int radl; } x265_param; /* x265_param_alloc: diff --git a/source/x265cli.h b/source/x265cli.h index 5f8a69b9e1..0cdaa921da 100644 --- a/source/x265cli.h +++ b/source/x265cli.h @@ -124,6 +124,7 @@ static const struct option long_options[] = { "scenecut", required_argument, NULL, 0 }, { "no-scenecut", no_argument, NULL, 0 }, { "scenecut-bias", required_argument, NULL, 0 }, + { "radl", required_argument, NULL, 0 }, { "ctu-info", required_argument, NULL, 0 }, { "intra-refresh", no_argument, NULL, 0 }, { "rc-lookahead", required_argument, NULL, 0 }, @@ -427,6 +428,7 @@ static void showHelp(x265_param *param) H0(" --no-scenecut Disable adaptive I-frame decision\n"); H0(" --scenecut How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold); H1(" --scenecut-bias <0..100.0> Bias for scenecut detection. Default %.2f\n", param->scenecutBias); + H0(" --radl Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl); H0(" --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n"); H0(" --rc-lookahead Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth); H1(" --lookahead-slices <0..16> Number of slices to use per lookahead cost estimate. 
Default %d\n", param->lookaheadSlices); From acb679fc66b112fd92616f6f972f7d56f5da5142 Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Fri, 22 Dec 2017 18:23:24 +0530 Subject: [PATCH 24/51] Restore macros defining analysis options --- source/x265.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/x265.h b/source/x265.h index 0ed077d5ee..ca6263ef57 100644 --- a/source/x265.h +++ b/source/x265.h @@ -481,8 +481,11 @@ typedef enum #define X265_CSP_BGRA 7 /* packed bgr 32bits */ #define X265_CSP_RGB 8 /* packed rgb 24bits */ #define X265_CSP_MAX 9 /* end of list */ - #define X265_EXTENDED_SAR 255 /* aspect ratio explicitly specified as width:height */ +/* Analysis options */ +#define X265_ANALYSIS_OFF 0 +#define X265_ANALYSIS_SAVE 1 +#define X265_ANALYSIS_LOAD 2 typedef struct x265_cli_csp { From 5fef3fef3ef4500090891922f0b9a69853300f65 Mon Sep 17 00:00:00 2001 From: Aasaipriya Chandran Date: Wed, 27 Dec 2017 19:05:56 +0530 Subject: [PATCH 25/51] Reconfigure csvfpt if encoder param is null --- source/encoder/api.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index 2ae2bc82dc..985d21ddf0 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -192,9 +192,10 @@ int x265_encoder_reconfig(x265_encoder* enc, x265_param* param_in) { if (!enc || !param_in) return -1; - x265_param save; Encoder* encoder = static_cast(enc); + if (encoder->m_param->csvfn == NULL && param_in->csvfpt != NULL) + encoder->m_param->csvfpt = param_in->csvfpt; if (encoder->m_latestParam->forceFlush != param_in->forceFlush) return encoder->reconfigureParam(encoder->m_latestParam, param_in); bool isReconfigureRc = encoder->isReconfigureRc(encoder->m_latestParam, param_in); From b82b97087eaca97d10c687f617035bc001eb7405 Mon Sep 17 00:00:00 2001 From: Pradeep Ramachandran Date: Thu, 4 Jan 2018 13:56:07 +0530 Subject: [PATCH 26/51] doc: Fix invalid UTF-8 characters in docs to enable linux gen 
--- doc/reST/cli.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 74a14d8fa6..83a5eb6f6b 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -2058,7 +2058,7 @@ VUI fields must be manually specified. Example for MaxCLL=1000 candela per square meter, MaxFALL=400 candela per square meter: - --max-cll “1000,400” + --max-cll "1000,400" Note that this string value will need to be escaped or quoted to protect against shell expansion on many platforms. No default. From e30facce857804794944fd8ad0f72fc9338e0edd Mon Sep 17 00:00:00 2001 From: Aarthi Thirumalai Date: Thu, 4 Jan 2018 15:16:06 +0530 Subject: [PATCH 27/51] api: change signature of x265_csvlog_encode() replace x265_encoder* with x265_param* and int padx, int pady as function arguments. --- source/CMakeLists.txt | 2 +- source/encoder/api.cpp | 104 ++++++++++++++++++++--------------------- source/x265.h | 7 +-- 3 files changed, 55 insertions(+), 58 deletions(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index e17cca9f56..557814ad40 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CPU" OFF) option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 150) +set(X265_BUILD 151) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index eea76ebd39..85de1d762c 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -312,7 +312,9 @@ void x265_encoder_log(x265_encoder* enc, int argc, char **argv) Encoder *encoder = static_cast(enc); x265_stats stats; encoder->fetchStats(&stats, sizeof(stats)); - x265_csvlog_encode(enc, &stats, argc, argv); + int padx = 
encoder->m_sps.conformanceWindow.rightOffset; + int pady = encoder->m_sps.conformanceWindow.bottomOffset; + x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv); } } @@ -868,45 +870,38 @@ void x265_csvlog_frame(const x265_param* param, const x265_picture* pic) fflush(stderr); } -void x265_csvlog_encode(x265_encoder *enc, const x265_stats* stats, int argc, char** argv) +void x265_csvlog_encode(const x265_param *p, const x265_stats *stats, int padx, int pady, int argc, char** argv) { - if (enc) + if (p && p->csvfpt) { - Encoder *encoder = static_cast(enc); - int padx = encoder->m_sps.conformanceWindow.rightOffset; - int pady = encoder->m_sps.conformanceWindow.bottomOffset; const x265_api * api = x265_api_get(0); - if (!encoder->m_param->csvfpt) - return; - - if (encoder->m_param->csvLogLevel) + if (p->csvLogLevel) { // adding summary to a per-frame csv log file, so it needs a summary header - fprintf(encoder->m_param->csvfpt, "\nSummary\n"); - fputs(summaryCSVHeader, encoder->m_param->csvfpt); + fprintf(p->csvfpt, "\nSummary\n"); + fputs(summaryCSVHeader, p->csvfpt); } // CLI arguments or other if (argc) { - fputc('"', encoder->m_param->csvfpt); + fputc('"', p->csvfpt); for (int i = 1; i < argc; i++) { - fputc(' ', encoder->m_param->csvfpt); - fputs(argv[i], encoder->m_param->csvfpt); + fputc(' ', p->csvfpt); + fputs(argv[i], p->csvfpt); } - fputc('"', encoder->m_param->csvfpt); + fputc('"', p->csvfpt); } else { - const x265_param* paramTemp = encoder->m_param; - char *opts = x265_param2string((x265_param*)paramTemp, padx, pady); + char *opts = x265_param2string((x265_param*)p, padx, pady); if (opts) { - fputc('"', encoder->m_param->csvfpt); - fputs(opts, encoder->m_param->csvfpt); - fputc('"', encoder->m_param->csvfpt); + fputc('"', p->csvfpt); + fputs(opts, p->csvfpt); + fputc('"', p->csvfpt); } } @@ -917,70 +912,71 @@ void x265_csvlog_encode(x265_encoder *enc, const x265_stats* stats, int argc, ch timeinfo = localtime(&now); char buffer[200]; 
strftime(buffer, 128, "%c", timeinfo); - fprintf(encoder->m_param->csvfpt, ", %s, ", buffer); + fprintf(p->csvfpt, ", %s, ", buffer); // elapsed time, fps, bitrate - fprintf(encoder->m_param->csvfpt, "%.2f, %.2f, %.2f,", + fprintf(p->csvfpt, "%.2f, %.2f, %.2f,", stats->elapsedEncodeTime, stats->encodedPictureCount / stats->elapsedEncodeTime, stats->bitrate); - if (encoder->m_param->bEnablePsnr) - fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf, %.3lf,", + if (p->bEnablePsnr) + fprintf(p->csvfpt, " %.3lf, %.3lf, %.3lf, %.3lf,", stats->globalPsnrY / stats->encodedPictureCount, stats->globalPsnrU / stats->encodedPictureCount, stats->globalPsnrV / stats->encodedPictureCount, stats->globalPsnr); else - fprintf(encoder->m_param->csvfpt, " -, -, -, -,"); - if (encoder->m_param->bEnableSsim) - fprintf(encoder->m_param->csvfpt, " %.6f, %6.3f,", stats->globalSsim, x265_ssim2dB(stats->globalSsim)); + fprintf(p->csvfpt, " -, -, -, -,"); + if (p->bEnableSsim) + fprintf(p->csvfpt, " %.6f, %6.3f,", stats->globalSsim, x265_ssim2dB(stats->globalSsim)); else - fprintf(encoder->m_param->csvfpt, " -, -,"); + fprintf(p->csvfpt, " -, -,"); if (stats->statsI.numPics) { - fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsI.numPics, stats->statsI.avgQp, stats->statsI.bitrate); - if (encoder->m_param->bEnablePsnr) - fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsI.psnrY, stats->statsI.psnrU, stats->statsI.psnrV); + fprintf(p->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsI.numPics, stats->statsI.avgQp, stats->statsI.bitrate); + if (p->bEnablePsnr) + fprintf(p->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsI.psnrY, stats->statsI.psnrU, stats->statsI.psnrV); else - fprintf(encoder->m_param->csvfpt, " -, -, -,"); - if (encoder->m_param->bEnableSsim) - fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsI.ssim); + fprintf(p->csvfpt, " -, -, -,"); + if (p->bEnableSsim) + fprintf(p->csvfpt, " %.3lf,", stats->statsI.ssim); else - 
fprintf(encoder->m_param->csvfpt, " -,"); + fprintf(p->csvfpt, " -,"); } else - fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,"); + fprintf(p->csvfpt, " -, -, -, -, -, -, -,"); if (stats->statsP.numPics) { - fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsP.numPics, stats->statsP.avgQp, stats->statsP.bitrate); - if (encoder->m_param->bEnablePsnr) - fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsP.psnrY, stats->statsP.psnrU, stats->statsP.psnrV); + fprintf(p->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsP.numPics, stats->statsP.avgQp, stats->statsP.bitrate); + if (p->bEnablePsnr) + fprintf(p->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsP.psnrY, stats->statsP.psnrU, stats->statsP.psnrV); else - fprintf(encoder->m_param->csvfpt, " -, -, -,"); - if (encoder->m_param->bEnableSsim) - fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsP.ssim); + fprintf(p->csvfpt, " -, -, -,"); + if (p->bEnableSsim) + fprintf(p->csvfpt, " %.3lf,", stats->statsP.ssim); else - fprintf(encoder->m_param->csvfpt, " -,"); + fprintf(p->csvfpt, " -,"); } else - fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,"); + fprintf(p->csvfpt, " -, -, -, -, -, -, -,"); if (stats->statsB.numPics) { - fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsB.numPics, stats->statsB.avgQp, stats->statsB.bitrate); - if (encoder->m_param->bEnablePsnr) - fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsB.psnrY, stats->statsB.psnrU, stats->statsB.psnrV); + fprintf(p->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsB.numPics, stats->statsB.avgQp, stats->statsB.bitrate); + if (p->bEnablePsnr) + fprintf(p->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsB.psnrY, stats->statsB.psnrU, stats->statsB.psnrV); else - fprintf(encoder->m_param->csvfpt, " -, -, -,"); - if (encoder->m_param->bEnableSsim) - fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsB.ssim); + fprintf(p->csvfpt, " -, -, -,"); + if 
(p->bEnableSsim) + fprintf(p->csvfpt, " %.3lf,", stats->statsB.ssim); else - fprintf(encoder->m_param->csvfpt, " -,"); + fprintf(p->csvfpt, " -,"); } else - fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,"); + fprintf(p->csvfpt, " -, -, -, -, -, -, -,"); - fprintf(encoder->m_param->csvfpt, " %-6u, %-6u, %s\n", stats->maxCLL, stats->maxFALL, api->version_str); + fprintf(p->csvfpt, " %-6u, %-6u, %s\n", stats->maxCLL, stats->maxFALL, api->version_str); } + } /* The dithering algorithm is based on Sierra-2-4A error diffusion. diff --git a/source/x265.h b/source/x265.h index ca6263ef57..6d783dcbe1 100644 --- a/source/x265.h +++ b/source/x265.h @@ -1779,9 +1779,10 @@ FILE* x265_csvlog_open(const x265_param *); void x265_csvlog_frame(const x265_param *, const x265_picture *); /* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are - * intended to be command line arguments passed to the encoder. Encode + * intended to be command line arguments passed to the encoder. padx and pady are + * padding offsets for conformance and can be given from sps settings. Encode * statistics should be queried from the encoder just prior to closing it. */ -void x265_csvlog_encode(x265_encoder *encoder, const x265_stats *, int argc, char** argv); +void x265_csvlog_encode(const x265_param*, const x265_stats *, int padx, int pady, int argc, char** argv); /* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using * the residual bits to dither each row. 
*/ @@ -1836,7 +1837,7 @@ typedef struct x265_api int (*get_ref_frame_list)(x265_encoder*, x265_picyuv**, x265_picyuv**, int, int, int*, int*); FILE* (*csvlog_open)(const x265_param*); void (*csvlog_frame)(const x265_param*, const x265_picture*); - void (*csvlog_encode)(x265_encoder*, const x265_stats*, int, char**); + void (*csvlog_encode)(const x265_param*, const x265_stats *, int, int, int, char**); void (*dither_image)(x265_picture*, int, int, int16_t*, int); int (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes); /* add new pointers to the end, or increment X265_MAJOR_VERSION */ From 1a3e688428002aed95a4933ce87e63d8cb97c4fa Mon Sep 17 00:00:00 2001 From: Vignesh Vijayakumar Date: Thu, 4 Jan 2018 12:37:01 +0530 Subject: [PATCH 28/51] Fix Mac OS build warnings 1. due to SEA motion search 2. due to condition check for CTUInfo --- source/common/framedata.cpp | 12 +++---- source/encoder/analysis.cpp | 58 ++++++++++++++++------------------ source/encoder/dpb.cpp | 13 +++----- source/encoder/framefilter.cpp | 2 +- 4 files changed, 36 insertions(+), 49 deletions(-) diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp index e0ecd57149..d7d475311b 100644 --- a/source/common/framedata.cpp +++ b/source/common/framedata.cpp @@ -77,16 +77,12 @@ void FrameData::destroy() X265_FREE(m_cuStat); X265_FREE(m_rowStat); - - if (m_meBuffer) + for (int i = 0; i < INTEGRAL_PLANE_NUM; i++) { - for (int i = 0; i < INTEGRAL_PLANE_NUM; i++) + if (m_meBuffer[i] != NULL) { - if (m_meBuffer[i] != NULL) - { - X265_FREE(m_meBuffer[i]); - m_meBuffer[i] = NULL; - } + X265_FREE(m_meBuffer[i]); + m_meBuffer[i] = NULL; } } } diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index 3be581ca6f..3622ed51d6 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -159,38 +159,34 @@ Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, con if (m_param->bCTUInfo && 
(*m_frame->m_ctuInfo + ctu.m_cuAddr)) { x265_ctu_info_t* ctuTemp = *m_frame->m_ctuInfo + ctu.m_cuAddr; - if (ctuTemp->ctuPartitions) - { - int32_t depthIdx = 0; - uint32_t maxNum8x8Partitions = 64; - uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr]; - uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; - int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; - do - { - uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx]; - uint8_t content = (uint8_t)(*((int32_t *)ctuTemp->ctuInfo + depthIdx)); - int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx]; - memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth); - memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth); - memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth); - for (uint32_t l = 0; l < numPartition >> 2 * depth; l++) - prevCtuInfoChangePtr[l] = prevCtuInfoChange; - depthInfoPtr += ctu.m_numPartitions >> 2 * depth; - contentInfoPtr += ctu.m_numPartitions >> 2 * depth; - prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth; - depthIdx++; - } while (ctuTemp->ctuPartitions[depthIdx] != 0); - - m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; - m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; - memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition); - //Calculate log2CUSize from depth - for (uint32_t i = 0; i < cuGeom.numPartitions; i++) - ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i]; - } + int32_t depthIdx = 0; + uint32_t maxNum8x8Partitions = 64; + uint8_t* depthInfoPtr = m_frame->m_addOnDepth[ctu.m_cuAddr]; + uint8_t* contentInfoPtr = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; + int* prevCtuInfoChangePtr = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; + do + { + uint8_t depth = (uint8_t)ctuTemp->ctuPartitions[depthIdx]; + uint8_t content = (uint8_t)(*((int32_t 
*)ctuTemp->ctuInfo + depthIdx)); + int prevCtuInfoChange = m_frame->m_prevCtuInfoChange[ctu.m_cuAddr * maxNum8x8Partitions + depthIdx]; + memset(depthInfoPtr, depth, sizeof(uint8_t) * numPartition >> 2 * depth); + memset(contentInfoPtr, content, sizeof(uint8_t) * numPartition >> 2 * depth); + memset(prevCtuInfoChangePtr, 0, sizeof(int) * numPartition >> 2 * depth); + for (uint32_t l = 0; l < numPartition >> 2 * depth; l++) + prevCtuInfoChangePtr[l] = prevCtuInfoChange; + depthInfoPtr += ctu.m_numPartitions >> 2 * depth; + contentInfoPtr += ctu.m_numPartitions >> 2 * depth; + prevCtuInfoChangePtr += ctu.m_numPartitions >> 2 * depth; + depthIdx++; + } while (ctuTemp->ctuPartitions[depthIdx] != 0); + + m_additionalCtuInfo = m_frame->m_addOnCtuInfo[ctu.m_cuAddr]; + m_prevCtuInfoChange = m_frame->m_addOnPrevChange[ctu.m_cuAddr]; + memcpy(ctu.m_cuDepth, m_frame->m_addOnDepth[ctu.m_cuAddr], sizeof(uint8_t) * numPartition); + //Calculate log2CUSize from depth + for (uint32_t i = 0; i < cuGeom.numPartitions; i++) + ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i]; } - if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) { m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata; diff --git a/source/encoder/dpb.cpp b/source/encoder/dpb.cpp index 19d41f9cdd..6c4ce15f3b 100644 --- a/source/encoder/dpb.cpp +++ b/source/encoder/dpb.cpp @@ -92,19 +92,14 @@ void DPB::recycleUnreferenced() m_freeList.pushBack(*curFrame); curFrame->m_encData->m_freeListNext = m_frameDataFreeList; m_frameDataFreeList = curFrame->m_encData; - - if (curFrame->m_encData->m_meBuffer) + for (int i = 0; i < INTEGRAL_PLANE_NUM; i++) { - for (int i = 0; i < INTEGRAL_PLANE_NUM; i++) + if (curFrame->m_encData->m_meBuffer[i] != NULL) { - if (curFrame->m_encData->m_meBuffer[i] != NULL) - { - X265_FREE(curFrame->m_encData->m_meBuffer[i]); - curFrame->m_encData->m_meBuffer[i] = NULL; - } + X265_FREE(curFrame->m_encData->m_meBuffer[i]); + 
curFrame->m_encData->m_meBuffer[i] = NULL; } } - if (curFrame->m_ctuInfo != NULL) { uint32_t widthInCU = (curFrame->m_param->sourceWidth + curFrame->m_param->maxCUSize - 1) >> curFrame->m_param->maxLog2CUSize; diff --git a/source/encoder/framefilter.cpp b/source/encoder/framefilter.cpp index ccc7231b70..fd03d24fb8 100644 --- a/source/encoder/framefilter.cpp +++ b/source/encoder/framefilter.cpp @@ -795,7 +795,7 @@ void FrameFilter::processPostRow(int row) void FrameFilter::computeMEIntegral(int row) { int lastRow = row == (int)m_frame->m_encData->m_slice->m_sps->numCuInHeight - 1; - if (m_frame->m_encData->m_meIntegral && m_frame->m_lowres.sliceType != X265_TYPE_B) + if (m_frame->m_lowres.sliceType != X265_TYPE_B) { /* If WPP, other than first row, integral calculation for current row needs to wait till the * integral for the previous row is computed */ From 83e856dc9209809586f73ec4d470343267c4998e Mon Sep 17 00:00:00 2001 From: Ma0 Date: Sat, 13 Jan 2018 00:25:44 +0100 Subject: [PATCH 29/51] param2string: increase buffer size, do not store file names --- source/common/param.cpp | 16 +++++++++++----- source/common/param.h | 3 --- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/source/common/param.cpp b/source/common/param.cpp index ab8f419391..b4b0b83c2f 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1530,11 +1530,15 @@ void x265_print_params(x265_param* param) char *x265_param2string(x265_param* p, int padx, int pady) { char *buf, *s; + int bufSize = 4000 + p->rc.zoneCount * 64; + if (p->numaPools) + bufSize += strlen(p->numaPools); + if (p->masteringDisplayColorVolume) + bufSize += strlen(p->masteringDisplayColorVolume); - buf = s = X265_MALLOC(char, MAXPARAMSIZE); + buf = s = X265_MALLOC(char, bufSize); if (!buf) return NULL; - #define BOOL(param, cliopt) \ s += sprintf(s, " %s", (param) ? 
cliopt : "no-" cliopt); @@ -1549,7 +1553,7 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bEnableSsim, "ssim"); s += sprintf(s, " log-level=%d", p->logLevel); if (p->csvfn) - s += sprintf(s, " csvfn=%s csv-log-level=%d", p->csvfn, p->csvLogLevel); + s += sprintf(s, " csv csv-log-level=%d", p->csvLogLevel); s += sprintf(s, " bitdepth=%d", p->internalBitDepth); s += sprintf(s, " input-csp=%d", p->internalCsp); s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom); @@ -1722,8 +1726,10 @@ char *x265_param2string(x265_param* p, int padx, int pady) BOOL(p->bEmitHDRSEI, "hdr"); BOOL(p->bHDROpt, "hdr-opt"); BOOL(p->bDhdr10opt, "dhdr10-opt"); - s += sprintf(s, " analysis-save=%s", p->analysisSave); - s += sprintf(s, " analysis-load=%s", p->analysisLoad); + if (p->analysisSave) + s += sprintf(s, " analysis-save"); + if (p->analysisLoad) + s += sprintf(s, " analysis-load"); s += sprintf(s, " analysis-reuse-level=%d", p->analysisReuseLevel); s += sprintf(s, " scale-factor=%d", p->scaleFactor); s += sprintf(s, " refine-intra=%d", p->intraRefine); diff --git a/source/common/param.h b/source/common/param.h index 9424b44c41..96775a601c 100644 --- a/source/common/param.h +++ b/source/common/param.h @@ -53,8 +53,5 @@ int x265_param_apply_profile(x265_param *, const char *profile); int x265_param_parse(x265_param *p, const char *name, const char *value); #define PARAM_NS X265_NS #endif - -#define MAXPARAMSIZE 2000 } - #endif // ifndef X265_PARAM_H From 20477d0f17d6e6a4244029905f11807bf97c9f10 Mon Sep 17 00:00:00 2001 From: Ma0 Date: Sat, 13 Jan 2018 00:37:31 +0100 Subject: [PATCH 30/51] change date to 2018 in output file header --- source/encoder/encoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 825e7b3f7d..15b6c0c622 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -2302,7 +2302,7 @@ void Encoder::getStreamHeaders(NALList& list, Entropy& 
sbacCoder, Bitstream& bs) if (buffer) { sprintf(buffer, "x265 (build %d) - %s:%s - H.265/HEVC codec - " - "Copyright 2013-2017 (c) Multicoreware, Inc - " + "Copyright 2013-2018 (c) Multicoreware, Inc - " "http://x265.org - options: %s", X265_BUILD, PFX(version_str), PFX(build_info_str), opts); From 94755d3fcd653a8a2380a422b37a170fcb414e8b Mon Sep 17 00:00:00 2001 From: Ashok Kumar Mishra Date: Tue, 16 Jan 2018 19:25:26 +0530 Subject: [PATCH 31/51] Change type of bufSize variable to size_t to avoid warning. --- source/common/param.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/param.cpp b/source/common/param.cpp index b4b0b83c2f..bae37fb61d 100644 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1530,7 +1530,7 @@ void x265_print_params(x265_param* param) char *x265_param2string(x265_param* p, int padx, int pady) { char *buf, *s; - int bufSize = 4000 + p->rc.zoneCount * 64; + size_t bufSize = 4000 + p->rc.zoneCount * 64; if (p->numaPools) bufSize += strlen(p->numaPools); if (p->masteringDisplayColorVolume) From e5947ad2f61a401641efd9ea2eec42b502cdf9a2 Mon Sep 17 00:00:00 2001 From: Ma0 Date: Sat, 13 Jan 2018 02:47:30 +0100 Subject: [PATCH 32/51] input: change from ifstream to stdio stream --- source/common/common.h | 6 +- source/input/y4m.cpp | 146 +++++++++++++---------------------------- source/input/y4m.h | 15 +---- source/input/yuv.cpp | 64 +++++------------- source/input/yuv.h | 14 +--- 5 files changed, 71 insertions(+), 174 deletions(-) diff --git a/source/common/common.h b/source/common/common.h index 38c2ba07e1..e35d431864 100644 --- a/source/common/common.h +++ b/source/common/common.h @@ -75,11 +75,10 @@ #define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8))) #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16))) #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) - #if defined(__MINGW32__) #define fseeko fseeko64 +#define ftello ftello64 #endif - #elif defined(_MSC_VER) #define 
ALIGN_VAR_4(T, var) __declspec(align(4)) T var @@ -87,9 +86,8 @@ #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var #define fseeko _fseeki64 - +#define ftello _ftelli64 #endif // if defined(__GNUC__) - #if HAVE_INT_TYPES_H #define __STDC_FORMAT_MACROS #include diff --git a/source/input/y4m.cpp b/source/input/y4m.cpp index 38732643c5..a1f6379f5f 100644 --- a/source/input/y4m.cpp +++ b/source/input/y4m.cpp @@ -38,9 +38,7 @@ using namespace X265_NS; using namespace std; - -static const char header[] = "FRAME"; - +static const char header[] = {'F','R','A','M','E'}; Y4MInput::Y4MInput(InputFileInfo& info) { for (int i = 0; i < QUEUE_SIZE; i++) @@ -60,15 +58,14 @@ Y4MInput::Y4MInput(InputFileInfo& info) ifs = NULL; if (!strcmp(info.filename, "-")) { - ifs = &cin; + ifs = stdin; #if _WIN32 setmode(fileno(stdin), O_BINARY); #endif } else - ifs = new ifstream(info.filename, ios::binary | ios::in); - - if (ifs && ifs->good() && parseHeader()) + ifs = x265_fopen(info.filename, "rb"); + if (ifs && !ferror(ifs) && parseHeader()) { int pixelbytes = depth > 8 ? 
2 : 1; for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) @@ -91,8 +88,8 @@ Y4MInput::Y4MInput(InputFileInfo& info) } if (!threadActive) { - if (ifs && ifs != &cin) - delete ifs; + if (ifs && ifs != stdin) + fclose(ifs); ifs = NULL; return; } @@ -106,61 +103,34 @@ Y4MInput::Y4MInput(InputFileInfo& info) info.csp = colorSpace; info.depth = depth; info.frameCount = -1; - - size_t estFrameSize = framesize + strlen(header) + 1; /* assume basic FRAME\n headers */ - + size_t estFrameSize = framesize + sizeof(header) + 1; /* assume basic FRAME\n headers */ /* try to estimate frame count, if this is not stdin */ - if (ifs != &cin) + if (ifs != stdin) { - istream::pos_type cur = ifs->tellg(); - -#if defined(_MSC_VER) && _MSC_VER < 1700 - /* Older MSVC versions cannot handle 64bit file sizes properly, so go native */ - HANDLE hFile = CreateFileA(info.filename, GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, NULL); - if (hFile != INVALID_HANDLE_VALUE) - { - LARGE_INTEGER size; - if (GetFileSizeEx(hFile, &size)) - info.frameCount = (int)((size.QuadPart - (int64_t)cur) / estFrameSize); - CloseHandle(hFile); - } -#else // if defined(_MSC_VER) && _MSC_VER < 1700 + int64_t cur = ftello(ifs); if (cur >= 0) { - ifs->seekg(0, ios::end); - istream::pos_type size = ifs->tellg(); - ifs->seekg(cur, ios::beg); + fseeko(ifs, 0, SEEK_END); + int64_t size = ftello(ifs); + fseeko(ifs, cur, SEEK_SET); if (size > 0) info.frameCount = (int)((size - cur) / estFrameSize); } -#endif // if defined(_MSC_VER) && _MSC_VER < 1700 } - if (info.skipFrames) { -#if X86_64 - if (ifs != &cin) - ifs->seekg((uint64_t)estFrameSize * info.skipFrames, ios::cur); + if (ifs != stdin) + fseeko(ifs, (int64_t)estFrameSize * info.skipFrames, SEEK_CUR); else for (int i = 0; i < info.skipFrames; i++) - { - ifs->read(buf[0], estFrameSize - framesize); - ifs->read(buf[0], framesize); - } -#else - for (int i = 0; i < info.skipFrames; i++) - ifs->ignore(estFrameSize); 
-#endif + if (fread(buf[0], estFrameSize - framesize, 1, ifs) + fread(buf[0], framesize, 1, ifs) != 2) + break; } } - Y4MInput::~Y4MInput() { - if (ifs && ifs != &cin) - delete ifs; - + if (ifs && ifs != stdin) + fclose(ifs); for (int i = 0; i < QUEUE_SIZE; i++) X265_FREE(buf[i]); } @@ -180,37 +150,31 @@ bool Y4MInput::parseHeader() int csp = 0; int d = 0; - - while (ifs->good()) + int c; + while ((c = fgetc(ifs)) != EOF) { // Skip Y4MPEG string - int c = ifs->get(); - while (ifs->good() && (c != ' ') && (c != '\n')) - c = ifs->get(); - - while (c == ' ' && ifs->good()) + while ((c != EOF) && (c != ' ') && (c != '\n')) + c = fgetc(ifs); + while (c == ' ') { // read parameter identifier - switch (ifs->get()) + switch (fgetc(ifs)) { case 'W': width = 0; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); - if (c == ' ' || c == '\n') break; else width = width * 10 + (c - '0'); } break; - case 'H': height = 0; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == ' ' || c == '\n') break; else @@ -221,15 +185,13 @@ bool Y4MInput::parseHeader() case 'F': rateNum = 0; rateDenom = 0; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == '.') { rateDenom = 1; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == ' ' || c == '\n') break; else @@ -242,9 +204,8 @@ bool Y4MInput::parseHeader() } else if (c == ':') { - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == ' ' || c == '\n') break; else @@ -260,14 +221,12 @@ bool Y4MInput::parseHeader() case 'A': sarWidth = 0; sarHeight = 0; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == ':') { - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); if (c == ' ' || c == '\n') break; else @@ -283,19 +242,15 @@ bool Y4MInput::parseHeader() case 'C': csp = 0; d = 0; - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = 
ifs->get(); - if (c <= 'o' && c >= '0') csp = csp * 10 + (c - '0'); else if (c == 'p') { // example: C420p16 - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { - c = ifs->get(); - if (c <= '9' && c >= '0') d = d * 10 + (c - '0'); else @@ -328,12 +283,10 @@ bool Y4MInput::parseHeader() if (d >= 8 && d <= 16) depth = d; break; - default: - while (ifs->good()) + while ((c = fgetc(ifs)) != EOF) { // consume this unsupported configuration word - c = ifs->get(); if (c == ' ' || c == '\n') break; } @@ -375,30 +328,23 @@ void Y4MInput::threadMain() threadActive = false; writeCount.poke(); } - bool Y4MInput::populateFrameQueue() { - if (!ifs || ifs->fail()) + if (!ifs || ferror(ifs)) return false; - - /* strip off the FRAME header */ - char hbuf[sizeof(header)]; - - ifs->read(hbuf, strlen(header)); - if (ifs->eof()) - return false; - - if (!ifs->good() || memcmp(hbuf, header, strlen(header))) + /* strip off the FRAME\n header */ + char hbuf[sizeof(header) + 1]; + if (fread(hbuf, sizeof(hbuf), 1, ifs) != 1 || memcmp(hbuf, header, sizeof(header))) { - x265_log(NULL, X265_LOG_ERROR, "y4m: frame header missing\n"); + if (!feof(ifs)) + x265_log(NULL, X265_LOG_ERROR, "y4m: frame header missing\n"); return false; } - /* consume bytes up to line feed */ - int c = ifs->get(); - while (c != '\n' && ifs->good()) - c = ifs->get(); - + int c = hbuf[sizeof(header)]; + while (c != '\n') + if ((c = fgetc(ifs)) == EOF) + break; /* wait for room in the ring buffer */ int written = writeCount.get(); int read = readCount.get(); @@ -408,10 +354,8 @@ bool Y4MInput::populateFrameQueue() if (!threadActive) return false; } - ProfileScopeEvent(frameRead); - ifs->read(buf[written % QUEUE_SIZE], framesize); - if (ifs->good()) + if (fread(buf[written % QUEUE_SIZE], framesize, 1, ifs) == 1) { writeCount.incr(); return true; diff --git a/source/input/y4m.h b/source/input/y4m.h index 3ca9cc67b4..14e80c7567 100644 --- a/source/input/y4m.h +++ b/source/input/y4m.h @@ -60,13 +60,9 @@ class Y4MInput : 
public InputFile, public Thread ThreadSafeInteger readCount; ThreadSafeInteger writeCount; - char* buf[QUEUE_SIZE]; - - std::istream *ifs; - + FILE *ifs; bool parseHeader(); - void threadMain(); bool populateFrameQueue(); @@ -76,15 +72,10 @@ class Y4MInput : public InputFile, public Thread Y4MInput(InputFileInfo& info); virtual ~Y4MInput(); - void release(); - - bool isEof() const { return ifs && ifs->eof(); } - - bool isFail() { return !(ifs && !ifs->fail() && threadActive); } - + bool isEof() const { return ifs && feof(ifs); } + bool isFail() { return !(ifs && !ferror(ifs) && threadActive); } void startReader(); - bool readPicture(x265_picture&); const char *getName() const { return "y4m"; } diff --git a/source/input/yuv.cpp b/source/input/yuv.cpp index 64a97cd2a2..e53c8b1ef4 100644 --- a/source/input/yuv.cpp +++ b/source/input/yuv.cpp @@ -65,23 +65,21 @@ YUVInput::YUVInput(InputFileInfo& info) x265_log(NULL, X265_LOG_ERROR, "yuv: width, height, and FPS must be specified\n"); return; } - if (!strcmp(info.filename, "-")) { - ifs = &cin; + ifs = stdin; #if _WIN32 setmode(fileno(stdin), O_BINARY); #endif } else - ifs = new ifstream(info.filename, ios::binary | ios::in); - - if (ifs && ifs->good()) + ifs = x265_fopen(info.filename, "rb"); + if (ifs && !ferror(ifs)) threadActive = true; else { - if (ifs && ifs != &cin) - delete ifs; + if (ifs && ifs != stdin) + fclose(ifs); ifs = NULL; return; } @@ -98,55 +96,33 @@ YUVInput::YUVInput(InputFileInfo& info) } info.frameCount = -1; - /* try to estimate frame count, if this is not stdin */ - if (ifs != &cin) + if (ifs != stdin) { - istream::pos_type cur = ifs->tellg(); - -#if defined(_MSC_VER) && _MSC_VER < 1700 - /* Older MSVC versions cannot handle 64bit file sizes properly, so go native */ - HANDLE hFile = CreateFileA(info.filename, GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, NULL); - if (hFile != INVALID_HANDLE_VALUE) - { - LARGE_INTEGER size; - if 
(GetFileSizeEx(hFile, &size)) - info.frameCount = (int)((size.QuadPart - (int64_t)cur) / framesize); - CloseHandle(hFile); - } -#else // if defined(_MSC_VER) && _MSC_VER < 1700 + int64_t cur = ftello(ifs); if (cur >= 0) { - ifs->seekg(0, ios::end); - istream::pos_type size = ifs->tellg(); - ifs->seekg(cur, ios::beg); + fseeko(ifs, 0, SEEK_END); + int64_t size = ftello(ifs); + fseeko(ifs, cur, SEEK_SET); if (size > 0) info.frameCount = (int)((size - cur) / framesize); } -#endif // if defined(_MSC_VER) && _MSC_VER < 1700 } - if (info.skipFrames) { -#if X86_64 - if (ifs != &cin) - ifs->seekg((uint64_t)framesize * info.skipFrames, ios::cur); + if (ifs != stdin) + fseeko(ifs, (int64_t)framesize * info.skipFrames, SEEK_CUR); else for (int i = 0; i < info.skipFrames; i++) - ifs->read(buf[0], framesize); -#else - for (int i = 0; i < info.skipFrames; i++) - ifs->ignore(framesize); -#endif + if (fread(buf[0], framesize, 1, ifs) != 1) + break; } } - YUVInput::~YUVInput() { - if (ifs && ifs != &cin) - delete ifs; + if (ifs && ifs != stdin) + fclose(ifs); for (int i = 0; i < QUEUE_SIZE; i++) X265_FREE(buf[i]); } @@ -179,12 +155,10 @@ void YUVInput::threadMain() threadActive = false; writeCount.poke(); } - bool YUVInput::populateFrameQueue() { - if (!ifs || ifs->fail()) + if (!ifs || ferror(ifs)) return false; - /* wait for room in the ring buffer */ int written = writeCount.get(); int read = readCount.get(); @@ -195,10 +169,8 @@ bool YUVInput::populateFrameQueue() // release() has been called return false; } - ProfileScopeEvent(frameRead); - ifs->read(buf[written % QUEUE_SIZE], framesize); - if (ifs->good()) + if (fread(buf[written % QUEUE_SIZE], framesize, 1, ifs) == 1) { writeCount.incr(); return true; diff --git a/source/input/yuv.h b/source/input/yuv.h index dd33cb3633..a67b22c535 100644 --- a/source/input/yuv.h +++ b/source/input/yuv.h @@ -52,13 +52,9 @@ class YUVInput : public InputFile, public Thread ThreadSafeInteger readCount; ThreadSafeInteger writeCount; - char* 
buf[QUEUE_SIZE]; - - std::istream *ifs; - + FILE *ifs; int guessFrameCount(); - void threadMain(); bool populateFrameQueue(); @@ -68,13 +64,9 @@ class YUVInput : public InputFile, public Thread YUVInput(InputFileInfo& info); virtual ~YUVInput(); - void release(); - - bool isEof() const { return ifs && ifs->eof(); } - - bool isFail() { return !(ifs && !ifs->fail() && threadActive); } - + bool isEof() const { return ifs && feof(ifs); } + bool isFail() { return !(ifs && !ferror(ifs) && threadActive); } void startReader(); bool readPicture(x265_picture&); From a0fee97c00816aa01cbcd88f8d8ade337cadd453 Mon Sep 17 00:00:00 2001 From: Ashok Kumar Mishra Date: Fri, 19 Jan 2018 14:46:54 +0530 Subject: [PATCH 33/51] fix for unable to open input file for gcc 32 bit compiler --- source/input/y4m.cpp | 3 ++- source/input/yuv.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/input/y4m.cpp b/source/input/y4m.cpp index a1f6379f5f..8a52496679 100644 --- a/source/input/y4m.cpp +++ b/source/input/y4m.cpp @@ -20,7 +20,8 @@ * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. *****************************************************************************/ - +#define _FILE_OFFSET_BITS 64 +#define _LARGEFILE_SOURCE #include "y4m.h" #include "common.h" diff --git a/source/input/yuv.cpp b/source/input/yuv.cpp index e53c8b1ef4..ddfff59e44 100644 --- a/source/input/yuv.cpp +++ b/source/input/yuv.cpp @@ -20,7 +20,8 @@ * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ - +#define _FILE_OFFSET_BITS 64 +#define _LARGEFILE_SOURCE #include "yuv.h" #include "common.h" From 81039fa83d7855c411b40c422bb58998a1443467 Mon Sep 17 00:00:00 2001 From: Santhoshini Sekar Date: Wed, 24 Jan 2018 13:11:11 +0530 Subject: [PATCH 34/51] fix warning in getRefFrameList --- source/encoder/encoder.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index 15b6c0c622..c47912187f 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -490,7 +490,10 @@ int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc, i } } else - x265_log(NULL, X265_LOG_WARNING, "Refrence List is not in piclist\n"); + { + x265_log(NULL, X265_LOG_WARNING, "Current frame is not in DPB piclist.\n"); + return 1; + } } else { From ad6d7e090726f2545765f315caf600929ca81fda Mon Sep 17 00:00:00 2001 From: Ricardo Constantino Date: Tue, 16 Jan 2018 17:39:53 +0000 Subject: [PATCH 35/51] CMake: blacklist mingw implicit link libraries These also aren't meant to be in pkg-config's Libs.Private. 
--- source/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 557814ad40..e6d49c69a6 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -647,7 +647,9 @@ if(X265_LATEST_TAG) endforeach() if(PLIBLIST) # blacklist of libraries that should not be in Libs.private - list(REMOVE_ITEM PLIBLIST "-lc" "-lpthread") + list(REMOVE_ITEM PLIBLIST "-lc" "-lpthread" "-lmingwex" "-lmingwthrd" + "-lmingw32" "-lmoldname" "-lmsvcrt" "-ladvapi32" "-lshell32" + "-luser32" "-lkernel32") string(REPLACE ";" " " PRIVATE_LIBS "${PLIBLIST}") else() set(PRIVATE_LIBS "") From 8535ec0405b1e7bf1691af7f047e61e28bd3228e Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Wed, 31 Jan 2018 18:37:13 +0530 Subject: [PATCH 36/51] slicetype: fix hanging issue due to dropping of BREF frames While forcing slicetypes through qp file, few BREF frames were not copied into lookahead's output queue due to incorrect check condition. --- source/encoder/slicetype.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index eb81529c65..c01beacdd3 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -1234,9 +1234,8 @@ void Lookahead::slicetypeDecide() int idx = 0; list[bframes]->m_reorderedPts = pts[idx++]; m_outputQueue.pushBack(*list[bframes]); - /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */ - if (bframes > 1 && m_param->bBPyramid) + if (brefs) { for (int i = 0; i < bframes; i++) { From 7c9bbbcb9af2531529c24c38f6511f0499cc3319 Mon Sep 17 00:00:00 2001 From: Bhavna Hariharan Date: Wed, 31 Jan 2018 19:44:19 +0530 Subject: [PATCH 37/51] csv: Bypass luma calculatations when --max-cll is OFF or when csv-log-level < 2 The maxFall and maxCll values are calculated from the min, max and average luma values. 
The luma values were being calculated even when --max-cll is disabled. This patch bypasses the luma calculations when --max-cll is OFF or when csv-log-level < 2 --- source/common/picyuv.cpp | 18 ++++++++++-------- source/encoder/api.cpp | 19 ++++++++++++------- source/encoder/encoder.cpp | 27 +++++++++++++-------------- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/source/common/picyuv.cpp b/source/common/picyuv.cpp index 311a7bd68d..062f8facff 100644 --- a/source/common/picyuv.cpp +++ b/source/common/picyuv.cpp @@ -358,18 +358,20 @@ void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, i pixel *uPic = m_picOrg[1]; pixel *vPic = m_picOrg[2]; - for (int r = 0; r < height; r++) + if (param.csvLogLevel >= 2 || param.maxCLL || param.maxFALL) { - for (int c = 0; c < width; c++) + for (int r = 0; r < height; r++) { - m_maxLumaLevel = X265_MAX(yPic[c], m_maxLumaLevel); - m_minLumaLevel = X265_MIN(yPic[c], m_minLumaLevel); - lumaSum += yPic[c]; + for (int c = 0; c < width; c++) + { + m_maxLumaLevel = X265_MAX(yPic[c], m_maxLumaLevel); + m_minLumaLevel = X265_MIN(yPic[c], m_minLumaLevel); + lumaSum += yPic[c]; + } + yPic += m_stride; } - yPic += m_stride; + m_avgLumaLevel = (double)lumaSum / (m_picHeight * m_picWidth); } - m_avgLumaLevel = (double)lumaSum / (m_picHeight * m_picWidth); - if (param.csvLogLevel >= 2) { if (param.internalCsp != X265_CSP_I400) diff --git a/source/encoder/api.cpp b/source/encoder/api.cpp index 85de1d762c..9ea28c2dc6 100644 --- a/source/encoder/api.cpp +++ b/source/encoder/api.cpp @@ -67,9 +67,7 @@ static const char* summaryCSVHeader = "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), " "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), " "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), " - "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " - "MaxCLL, MaxFALL, Version\n"; - + "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, 
B-SSIM (dB), "; x265_encoder *x265_encoder_open(x265_param *p) { if (!p) @@ -757,7 +755,12 @@ FILE* x265_csvlog_open(const x265_param* param) fprintf(csvfp, "\n"); } else + { fputs(summaryCSVHeader, csvfp); + if (param->csvLogLevel >= 2 || param->maxCLL || param->maxFALL) + fputs("MaxCLL, MaxFALL,", csvfp); + fputs(" Version\n", csvfp); + } } return csvfp; } @@ -881,8 +884,10 @@ void x265_csvlog_encode(const x265_param *p, const x265_stats *stats, int padx, // adding summary to a per-frame csv log file, so it needs a summary header fprintf(p->csvfpt, "\nSummary\n"); fputs(summaryCSVHeader, p->csvfpt); + if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL) + fputs("MaxCLL, MaxFALL,", p->csvfpt); + fputs(" Version\n",p->csvfpt); } - // CLI arguments or other if (argc) { @@ -973,10 +978,10 @@ void x265_csvlog_encode(const x265_param *p, const x265_stats *stats, int padx, } else fprintf(p->csvfpt, " -, -, -, -, -, -, -,"); - - fprintf(p->csvfpt, " %-6u, %-6u, %s\n", stats->maxCLL, stats->maxFALL, api->version_str); + if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL) + fprintf(p->csvfpt, " %-6u, %-6u,", stats->maxCLL, stats->maxFALL); + fprintf(p->csvfpt, " %s\n", api->version_str); } - } /* The dithering algorithm is based on Sierra-2-4A error diffusion. 
diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp index c47912187f..9325ed19c9 100644 --- a/source/encoder/encoder.cpp +++ b/source/encoder/encoder.cpp @@ -1983,11 +1983,12 @@ void Encoder::fetchStats(x265_stats *stats, size_t statsSizeBytes) stats->statsB.psnrU = m_analyzeB.m_psnrSumU / (double)m_analyzeB.m_numPics; stats->statsB.psnrV = m_analyzeB.m_psnrSumV / (double)m_analyzeB.m_numPics; stats->statsB.ssim = x265_ssim2dB(m_analyzeB.m_globalSsim / (double)m_analyzeB.m_numPics); - - stats->maxCLL = m_analyzeAll.m_maxCLL; - stats->maxFALL = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics); + if (m_param->csvLogLevel >= 2 || m_param->maxCLL || m_param->maxFALL) + { + stats->maxCLL = m_analyzeAll.m_maxCLL; + stats->maxFALL = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics); + } } - /* If new statistics are added to x265_stats, we must check here whether the * structure provided by the user is the new structure or an older one (for * future safety) */ @@ -2059,10 +2060,11 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_f if (m_param->bEnableSsim) m_analyzeB.addSsim(ssim); } - - m_analyzeAll.m_maxFALL += curFrame->m_fencPic->m_avgLumaLevel; - m_analyzeAll.m_maxCLL = X265_MAX(m_analyzeAll.m_maxCLL, curFrame->m_fencPic->m_maxLumaLevel); - + if (m_param->csvLogLevel >= 2 || m_param->maxCLL || m_param->maxFALL) + { + m_analyzeAll.m_maxFALL += curFrame->m_fencPic->m_avgLumaLevel; + m_analyzeAll.m_maxCLL = X265_MAX(m_analyzeAll.m_maxCLL, curFrame->m_fencPic->m_maxLumaLevel); + } char c = (slice->isIntra() ? (curFrame->m_lowres.sliceType == X265_TYPE_IDR ? 'I' : 'i') : slice->isInterP() ? 'P' : 'B'); int poc = slice->m_poc; if (!IS_REFERENCED(curFrame)) @@ -2101,13 +2103,7 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_f frameStats->list1POC[ref] = ref < slice->m_numRefIdx[1] ? 
slice->m_refPOCList[1][ref] - slice->m_lastIDR : -1; } } - #define ELAPSED_MSEC(start, end) (((double)(end) - (start)) / 1000) - - frameStats->maxLumaLevel = curFrame->m_fencPic->m_maxLumaLevel; - frameStats->minLumaLevel = curFrame->m_fencPic->m_minLumaLevel; - frameStats->avgLumaLevel = curFrame->m_fencPic->m_avgLumaLevel; - if (m_param->csvLogLevel >= 2) { frameStats->decideWaitTime = ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime); @@ -2127,6 +2123,9 @@ void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_f frameStats->avgLumaDistortion = curFrame->m_encData->m_frameStats.avgLumaDistortion; frameStats->avgPsyEnergy = curFrame->m_encData->m_frameStats.avgPsyEnergy; frameStats->avgResEnergy = curFrame->m_encData->m_frameStats.avgResEnergy; + frameStats->maxLumaLevel = curFrame->m_fencPic->m_maxLumaLevel; + frameStats->minLumaLevel = curFrame->m_fencPic->m_minLumaLevel; + frameStats->avgLumaLevel = curFrame->m_fencPic->m_avgLumaLevel; frameStats->maxChromaULevel = curFrame->m_fencPic->m_maxChromaULevel; frameStats->minChromaULevel = curFrame->m_fencPic->m_minChromaULevel; From 39ed4b6e54448bff16f28d9ba082e5fdbf512ff1 Mon Sep 17 00:00:00 2001 From: David Christenson Date: Fri, 2 Feb 2018 22:18:18 -0700 Subject: [PATCH 38/51] CMake: fix generation of version info from .hg_archival.txt When building from a commit more recent than the latest tag, version.cmake wouldn't set X265_LATEST_TAG and X265_TAG_DISTANCE resulting in their defaults (0.0 and 0) being used. Also truncate the revision ID to 12 characters to match the length used when parsing from the hg executable. 
--- source/cmake/version.cmake | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/cmake/version.cmake b/source/cmake/version.cmake index c63d9f442e..b1e995f924 100644 --- a/source/cmake/version.cmake +++ b/source/cmake/version.cmake @@ -22,12 +22,11 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg_archival.txt) set(hg_${key} ${value}) endforeach() if(DEFINED hg_tag) - set(X265_VERSION ${hg_tag}) set(X265_LATEST_TAG ${hg_tag}) - set(X265_TAG_DISTANCE "0") elseif(DEFINED hg_node) - string(SUBSTRING "${hg_node}" 0 16 hg_id) - set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}") + set(X265_LATEST_TAG ${hg_latesttag}) + set(X265_TAG_DISTANCE ${hg_latesttagdistance}) + string(SUBSTRING "${hg_node}" 0 12 X265_REVISION_ID) endif() elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg) if(EXISTS "${HG_EXECUTABLE}.bat") From 2b8f6dba0b860102f877ee710abc23c5ec8d8b75 Mon Sep 17 00:00:00 2001 From: Ma0 Date: Sat, 10 Feb 2018 06:16:45 +0100 Subject: [PATCH 39/51] fix output to pipe on Windows --- source/output/raw.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/source/output/raw.cpp b/source/output/raw.cpp index 1529908d50..fe69084829 100644 --- a/source/output/raw.cpp +++ b/source/output/raw.cpp @@ -21,18 +21,26 @@ * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
 *****************************************************************************/ - #include "raw.h" +#if _WIN32 +#include +#include +#if defined(_MSC_VER) +#pragma warning(disable: 4996) // POSIX setmode and fileno deprecated +#endif +#endif using namespace X265_NS; using namespace std; - RAWOutput::RAWOutput(const char* fname, InputFileInfo&) { b_fail = false; if (!strcmp(fname, "-")) { ofs = stdout; +#if _WIN32 + setmode(fileno(stdout), O_BINARY); +#endif return; } ofs = x265_fopen(fname, "wb"); From 4c2d4a96ea24af1eb58adaa78c01e27b0e624c89 Mon Sep 17 00:00:00 2001 From: Bhavna Hariharan Date: Wed, 14 Feb 2018 15:45:29 +0530 Subject: [PATCH 40/51] limitTU: Save intra CU's TU depth when analysis save/load is enabled This patch will cause an output mismatch between analysis save and load when limit-tu 3/4 is enabled. This change is expected as the load run will have only the best mode's TU information. For CUs where the neighbour's TU depth is unavailable, load encode will evaluate all TU depths. --- doc/reST/cli.rst | 8 +++++++- source/encoder/analysis.cpp | 5 ++--- source/test/regression-tests.txt | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/reST/cli.rst b/doc/reST/cli.rst index 83a5eb6f6b..ff70169d8a 100644 --- a/doc/reST/cli.rst +++ b/doc/reST/cli.rst @@ -1029,7 +1029,13 @@ as the residual quad-tree (RQT). Level 4 - uses the depth of the neighbouring/ co-located CUs TU depth to limit the 1st subTU depth. The 1st subTU depth is taken as the limiting depth for the other subTUs. - + + Enabling levels 3 or 4 may cause a mismatch in the output bitstreams + between option:`--analysis-save` and option:`--analysis-load` + as all neighbouring CUs TU depth may not be available in the + option:`--analysis-load` run as only the best mode's information is + available to it. + Default: 0 .. 
option:: --nr-intra , --nr-inter diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp index 3622ed51d6..a8e69c2db9 100644 --- a/source/encoder/analysis.cpp +++ b/source/encoder/analysis.cpp @@ -647,13 +647,12 @@ uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom cacheCost[cuIdx] = md.bestMode->rdCost; } - /* Save Intra CUs TU depth only when analysis mode is OFF */ - if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && (!m_param->analysisSave && !m_param->analysisLoad)) + if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4) { CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr); int8_t maxTUDepth = -1; for (uint32_t i = 0; i < cuGeom.numPartitions; i++) - maxTUDepth = X265_MAX(maxTUDepth, md.pred[PRED_INTRA].cu.m_tuDepth[i]); + maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]); ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth; } diff --git a/source/test/regression-tests.txt b/source/test/regression-tests.txt index 51a1b0f338..d5543dda1c 100644 --- a/source/test/regression-tests.txt +++ b/source/test/regression-tests.txt @@ -23,7 +23,7 @@ BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4 BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3 -BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 4::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 4 
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit" Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop From 4617e86ec94d68fab403a4e100fca6afa1d181f2 Mon Sep 17 00:00:00 2001 From: Bhavna Hariharan Date: Fri, 16 Feb 2018 11:37:45 +0530 Subject: [PATCH 41/51] dhdr: comply to HDR10+ LLC spec This patch does the following - 1) Consider Json files without Bezier Curve data as valid and inject them correctly without errors. 2) The numWindows value is now taken from the numWindows JSON value, instead of taking it from 1+ellipsesNum. Add numWindows to the dictionary. 3) Changed hardcoded applicationVersion value to 1 due to LLC requirement. 
--- source/dynamicHDR10/SeiMetadataDictionary.cpp | 1 + source/dynamicHDR10/SeiMetadataDictionary.h | 1 + source/dynamicHDR10/metadataFromJson.cpp | 57 +++++++++---------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/source/dynamicHDR10/SeiMetadataDictionary.cpp b/source/dynamicHDR10/SeiMetadataDictionary.cpp index 039a4887e2..ad95570926 100644 --- a/source/dynamicHDR10/SeiMetadataDictionary.cpp +++ b/source/dynamicHDR10/SeiMetadataDictionary.cpp @@ -28,6 +28,7 @@ using namespace SeiMetadataDictionary; const std::string JsonDataKeys::LocalParameters = std::string("LocalParameters"); const std::string JsonDataKeys::TargetDisplayLuminance = std::string("TargetedSystemDisplayMaximumLuminance"); +const std::string JsonDataKeys::NumberOfWindows = std::string("NumberOfWindows"); const std::string BezierCurveNames::TagName = std::string("BezierCurveData"); const std::string BezierCurveNames::NumberOfAnchors = std::string("NumberOfAnchors"); diff --git a/source/dynamicHDR10/SeiMetadataDictionary.h b/source/dynamicHDR10/SeiMetadataDictionary.h index 3406652936..727311cc00 100644 --- a/source/dynamicHDR10/SeiMetadataDictionary.h +++ b/source/dynamicHDR10/SeiMetadataDictionary.h @@ -37,6 +37,7 @@ namespace SeiMetadataDictionary public: static const std::string LocalParameters; static const std::string TargetDisplayLuminance; + static const std::string NumberOfWindows; }; //Bezier Curve Data diff --git a/source/dynamicHDR10/metadataFromJson.cpp b/source/dynamicHDR10/metadataFromJson.cpp index 64bf2f8b9b..1e2e1d5ceb 100644 --- a/source/dynamicHDR10/metadataFromJson.cpp +++ b/source/dynamicHDR10/metadataFromJson.cpp @@ -372,7 +372,7 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u const uint16_t terminalProviderCode = 0x003C; const uint16_t terminalProviderOrientedCode = 0x0001; const uint8_t applicationIdentifier = 4; - const uint8_t applicationVersion = 0; + const uint8_t applicationVersion = 1; mPimpl->appendBits(metadata, 
countryCode, 8); mPimpl->appendBits(metadata, terminalProviderCode, 16); @@ -384,9 +384,7 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u //Note: Validated only add up to two local selections, ignore the rest JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items(); int ellipsesNum = static_cast(jsonArray.size() > 2 ? 2 : jsonArray.size()); - - uint16_t numWindows = 1 + static_cast(ellipsesNum); - + uint16_t numWindows = (uint16_t)fileData[frame][JsonDataKeys::NumberOfWindows].int_value(); mPimpl->appendBits(metadata, numWindows, 2); for (int i = 0; i < ellipsesNum; ++i) { @@ -426,16 +424,15 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u mPimpl->appendBits(metadata, semimajorExternalAxis, 16); mPimpl->appendBits(metadata, semiminorExternalAxis, 16); - /*bool*/ uint8_t overlapProcessOption = static_cast(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value()); //1; + uint8_t overlapProcessOption = static_cast(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value()); //TODO: Uses Layering method, the value is "1" mPimpl->appendBits(metadata, overlapProcessOption, 1); } /* Targeted System Display Data */ - uint32_t TEMPmonitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value(); //500; - mPimpl->appendBits(metadata, TEMPmonitorPeak, 27); - + uint32_t monitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value(); //500; + mPimpl->appendBits(metadata, monitorPeak, 27); //NOTE: Set as false for now, as requested - /*bool*/uint8_t targetedSystemDisplayActualPeakLuminanceFlag = 0; /*false*/ + uint8_t targetedSystemDisplayActualPeakLuminanceFlag = 0; mPimpl->appendBits(metadata, targetedSystemDisplayActualPeakLuminanceFlag, 1); if (targetedSystemDisplayActualPeakLuminanceFlag) { @@ -463,7 +460,6 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u mPimpl->appendBits(metadata, 
static_cast((int)luminanceData.maxGLuminance & 0xFFFF), 16); mPimpl->appendBits(metadata, static_cast(((int)luminanceData.maxBLuminance & 0x10000) >> 16), 1); mPimpl->appendBits(metadata, static_cast((int)luminanceData.maxBLuminance & 0xFFFF), 16); - /* changed from maxRGBLuminance to average luminance to match stms implementation */ mPimpl->appendBits(metadata, static_cast(((int)luminanceData.averageLuminance & 0x10000) >> 16), 1); mPimpl->appendBits(metadata, static_cast((int)luminanceData.averageLuminance & 0xFFFF), 16); @@ -478,7 +474,7 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u uint8_t distributionMaxrgbPercentage = static_cast(percentilPercentages.at(i)); mPimpl->appendBits(metadata, distributionMaxrgbPercentage, 7); - // 17bits: 1bit then 16 + /* 17bits: 1bit then 16 */ unsigned int ithPercentile = luminanceData.percentiles.at(i); uint8_t highValue = static_cast((ithPercentile & 0x10000) >> 16); uint16_t lowValue = static_cast(ithPercentile & 0xFFFF); @@ -499,33 +495,32 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u { //TODO } - // BEZIER CURVE DATA + /* Bezier Curve Data */ for (int w = 0; w < numWindows; ++w) { - //TODO: uint8_t toneMappingFlag = 1; - mPimpl->appendBits(metadata, toneMappingFlag, 1); - if (toneMappingFlag) + /* Check if the window contains tone mapping bezier curve data and set toneMappingFlag appropriately */ + //Json bezierData = fileData[frame][BezierCurveNames::TagName]; + BezierCurveData curveData; + /* Select curve data based on global window */ + if (w == 0) { - Json bezierData = fileData[frame][BezierCurveNames::TagName]; - BezierCurveData curveData; - - /* Select curve data based on global window or local window */ - if (w == 0) + if (!mPimpl->bezierCurveFromJson(fileData[frame][BezierCurveNames::TagName], curveData)) { - if (!mPimpl->bezierCurveFromJson(bezierData, curveData)) - { - std::cout << "error parsing bezierCurve frame: " << w << std::endl; - } 
+ toneMappingFlag = 0; } - else + } + /* Select curve data based on local window */ + else + { + if (!mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData)) { - if (!mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData)) - { - std::cout << "error parsing bezierCurve ellipse: " << w - 1 << std::endl; - } + toneMappingFlag = 0; } - + } + mPimpl->appendBits(metadata, toneMappingFlag, 1); + if (toneMappingFlag) + { uint16_t kneePointX = static_cast(curveData.sPx); mPimpl->appendBits(metadata, kneePointX, 12); uint16_t kneePointY = static_cast(curveData.sPy); @@ -541,7 +536,7 @@ void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, u mPimpl->appendBits(metadata, anchor, 10); } } - } + } /* Set to false as requested */ bool colorSaturationMappingFlag = 0; mPimpl->appendBits(metadata, colorSaturationMappingFlag, 1); From 9936a4760c7470637f29bdcbf7cbd8cdcbe942d3 Mon Sep 17 00:00:00 2001 From: Bhavna Hariharan Date: Fri, 16 Feb 2018 11:40:59 +0530 Subject: [PATCH 42/51] dhdr: Skip comments in the json file while parsing it. Comments of the type /* */ OR // are ignored. 
--- source/dynamicHDR10/JsonHelper.cpp | 43 +++++++----------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/source/dynamicHDR10/JsonHelper.cpp b/source/dynamicHDR10/JsonHelper.cpp index 4257313faf..b073f3f3ff 100644 --- a/source/dynamicHDR10/JsonHelper.cpp +++ b/source/dynamicHDR10/JsonHelper.cpp @@ -139,21 +139,13 @@ JsonObject JsonHelper::readJson(string path) return JsonObject(); } - ifstream tfile; - string json_str; - string json_str2; + std::ifstream ifs(path); + const std::string json_str2((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); + string err = ""; - tfile.open(path); - while(tfile) - { - std::getline(tfile, json_str); - json_str2.append(json_str); - } - tfile.close(); - size_t beginning = json_str2.find_first_of("{"); - int fixchar = json_str2[json_str2.size() - 2] == '}' ? 1 : 0; - return Json::parse(json_str2.substr(beginning,json_str2.size() - fixchar),err).object_items(); + return Json::parse(json_str2,err, JsonParse::COMMENTS).object_items(); } JsonArray JsonHelper::readJsonArray(const string &path) @@ -174,28 +166,13 @@ JsonArray JsonHelper::readJsonArray(const string &path) return JsonArray(); } - ifstream tfile; - string json_str; - string json_str2; - string err = ""; - tfile.open(path); - while(tfile) - { - std::getline(tfile, json_str); - json_str2.append(json_str); - } + std::ifstream ifs(path); + const std::string json_str2((std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator())); - tfile.close(); + string err = ""; - vector data; - if (json_str2.size() != 0) - { - size_t beginning = json_str2.find_first_of("["); - int fixchar = json_str2[json_str2.size() - 2] == ']' ? 
1 : 0; - return Json::parse(json_str2.substr(beginning, json_str2.size() - fixchar), err).array_items(); - } - else - return data; + return Json::parse(json_str2,err, JsonParse::COMMENTS).array_items(); } bool JsonHelper::validatePathExtension(string &path) From 537f741d89c622fd2889550fc15425ec5909e72a Mon Sep 17 00:00:00 2001 From: Praveen Tiwari Date: Fri, 19 Jan 2018 12:04:23 +0530 Subject: [PATCH 43/51] x86: split ipfilter8 kernels into two different source file This patch implements infrastructure to split ipfilter8 asm source file into two different files in order to avoid longer build time. It moves interp_8tap_horizontal kernels to the newly created file. --- source/common/CMakeLists.txt | 6 +- source/common/x86/asm-primitives.cpp | 2 +- source/common/x86/h-ipfilter8.asm | 267 +++++++++++++++++++++++++++ source/common/x86/h-ipfilter8.h | 39 ++++ source/common/x86/ipfilter8.asm | 131 ------------- 5 files changed, 309 insertions(+), 136 deletions(-) create mode 100644 source/common/x86/h-ipfilter8.asm create mode 100644 source/common/x86/h-ipfilter8.h diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 65624069c4..021645337c 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -56,17 +56,15 @@ if(ENABLE_ASSEMBLY AND X86) endif() set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) source_group(Intrinsics FILES ${VEC_PRIMITIVES}) - - set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h) + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h h-ipfilter8.h loopfilter.h seaintegral.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() - set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm) + 
set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() - if(NOT X64) set(A_SRCS ${A_SRCS} pixel-32.asm) endif() diff --git a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp index 1546734c91..0e54329bc1 100644 --- a/source/common/x86/asm-primitives.cpp +++ b/source/common/x86/asm-primitives.cpp @@ -115,8 +115,8 @@ extern "C" { #include "intrapred.h" #include "dct8.h" #include "seaintegral.h" +#include "h-ipfilter8.h" } - #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \ p.cu[BLOCK_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \ p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \ diff --git a/source/common/x86/h-ipfilter8.asm b/source/common/x86/h-ipfilter8.asm new file mode 100644 index 0000000000..b5ce12c1ea --- /dev/null +++ b/source/common/x86/h-ipfilter8.asm @@ -0,0 +1,267 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Min Chen +;* Nabajit Deka +;* Praveen Kumar Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +const h_tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0 + dw -1, 4, -10, 58, 17, -5, 1, 0 + dw -1, 4, -11, 40, 40, -11, 4, -1 + dw 0, 1, -5, 17, 58, -10, 4, -1 + +SECTION .text + +cextern pw_32 +cextern pw_2000 + +%macro FILTER_H8_W8_sse2 0 + movh m1, [r0 + x - 3] + movh m4, [r0 + x - 2] + punpcklbw m1, m6 + punpcklbw m4, m6 + movh m5, [r0 + x - 1] + movh m0, [r0 + x] + punpcklbw m5, m6 + punpcklbw m0, m6 + pmaddwd m1, m3 + pmaddwd m4, m3 + pmaddwd m5, m3 + pmaddwd m0, m3 + packssdw m1, m4 + packssdw m5, m0 + pshuflw m4, m1, q2301 + pshufhw m4, m4, q2301 + pshuflw m0, m5, q2301 + pshufhw m0, m0, q2301 + paddw m1, m4 + paddw m5, m0 + psrldq m1, 2 + psrldq m5, 2 + pshufd m1, m1, q3120 + pshufd m5, m5, q3120 + punpcklqdq m1, m5 + movh m7, [r0 + x + 1] + movh m4, [r0 + x + 2] + punpcklbw m7, m6 + punpcklbw m4, m6 + movh m5, [r0 + x + 3] + movh m0, [r0 + x + 4] + punpcklbw m5, m6 + punpcklbw m0, m6 + pmaddwd m7, m3 + pmaddwd m4, m3 + pmaddwd m5, m3 + pmaddwd m0, m3 + packssdw m7, m4 + packssdw m5, m0 + pshuflw m4, m7, q2301 + pshufhw m4, m4, q2301 + pshuflw m0, m5, q2301 + pshufhw m0, m0, q2301 + paddw m7, m4 + paddw m5, m0 + psrldq m7, 2 + psrldq m5, 2 + pshufd m7, m7, q3120 + pshufd m5, m5, q3120 + punpcklqdq m7, m5 + pshuflw m4, m1, q2301 + pshufhw m4, m4, q2301 + pshuflw m0, m7, q2301 + pshufhw m0, m0, q2301 + paddw m1, m4 + paddw m7, m0 + psrldq m1, 2 + psrldq m7, 2 + pshufd m1, m1, q3120 + pshufd m7, m7, q3120 + punpcklqdq m1, m7 +%endmacro + +%macro FILTER_H8_W4_sse2 0 + movh m1, [r0 + x - 3] + movh m0, [r0 + x - 2] + punpcklbw m1, m6 + punpcklbw m0, m6 + movh m4, [r0 + x - 1] + movh m5, [r0 + x] + punpcklbw m4, m6 + punpcklbw m5, m6 + pmaddwd m1, m3 + pmaddwd m0, m3 + pmaddwd m4, m3 + pmaddwd m5, m3 + packssdw m1, m0 + packssdw m4, m5 + pshuflw m0, m1, q2301 + pshufhw m0, m0, q2301 + pshuflw m5, m4, q2301 + pshufhw 
m5, m5, q2301 + paddw m1, m0 + paddw m4, m5 + psrldq m1, 2 + psrldq m4, 2 + pshufd m1, m1, q3120 + pshufd m4, m4, q3120 + punpcklqdq m1, m4 + pshuflw m0, m1, q2301 + pshufhw m0, m0, q2301 + paddw m1, m0 + psrldq m1, 2 + pshufd m1, m1, q3120 +%endmacro + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_LUMA_sse2 3 +INIT_XMM sse2 +cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8 + mov r4d, r4m + add r4d, r4d + pxor m6, m6 + +%ifidn %3, ps + add r3d, r3d + cmp r5m, byte 0 +%endif + +%ifdef PIC + lea r5, [h_tabw_LumaCoeff] + movu m3, [r5 + r4 * 8] +%else + movu m3, [h_tabw_LumaCoeff + r4 * 8] +%endif + + mov r4d, %2 + +%ifidn %3, pp + mova m2, [pw_32] +%else + mova m2, [pw_2000] + je .loopH + lea r5, [r1 + 2 * r1] + sub r0, r5 + add r4d, 7 +%endif + +.loopH: +%assign x 0 +%rep %1 / 8 + FILTER_H8_W8_sse2 + %ifidn %3, pp + paddw m1, m2 + psraw m1, 6 + packuswb m1, m1 + movh [r2 + x], m1 + %else + psubw m1, m2 + movu [r2 + 2 * x], m1 + %endif +%assign x x+8 +%endrep + +%rep (%1 % 8) / 4 + FILTER_H8_W4_sse2 + %ifidn %3, pp + paddw m1, m2 + psraw m1, 6 + packuswb m1, m1 + movd [r2 + x], m1 + %else + psubw m1, m2 + movh [r2 + 2 * x], m1 + %endif +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET + +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA_sse2 4, 4, pp + IPFILTER_LUMA_sse2 4, 8, pp + IPFILTER_LUMA_sse2 
8, 4, pp + IPFILTER_LUMA_sse2 8, 8, pp + IPFILTER_LUMA_sse2 16, 16, pp + IPFILTER_LUMA_sse2 16, 8, pp + IPFILTER_LUMA_sse2 8, 16, pp + IPFILTER_LUMA_sse2 16, 12, pp + IPFILTER_LUMA_sse2 12, 16, pp + IPFILTER_LUMA_sse2 16, 4, pp + IPFILTER_LUMA_sse2 4, 16, pp + IPFILTER_LUMA_sse2 32, 32, pp + IPFILTER_LUMA_sse2 32, 16, pp + IPFILTER_LUMA_sse2 16, 32, pp + IPFILTER_LUMA_sse2 32, 24, pp + IPFILTER_LUMA_sse2 24, 32, pp + IPFILTER_LUMA_sse2 32, 8, pp + IPFILTER_LUMA_sse2 8, 32, pp + IPFILTER_LUMA_sse2 64, 64, pp + IPFILTER_LUMA_sse2 64, 32, pp + IPFILTER_LUMA_sse2 32, 64, pp + IPFILTER_LUMA_sse2 64, 48, pp + IPFILTER_LUMA_sse2 48, 64, pp + IPFILTER_LUMA_sse2 64, 16, pp + IPFILTER_LUMA_sse2 16, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA_sse2 4, 4, ps + IPFILTER_LUMA_sse2 8, 8, ps + IPFILTER_LUMA_sse2 8, 4, ps + IPFILTER_LUMA_sse2 4, 8, ps + IPFILTER_LUMA_sse2 16, 16, ps + IPFILTER_LUMA_sse2 16, 8, ps + IPFILTER_LUMA_sse2 8, 16, ps + IPFILTER_LUMA_sse2 16, 12, ps + IPFILTER_LUMA_sse2 12, 16, ps + IPFILTER_LUMA_sse2 16, 4, ps + IPFILTER_LUMA_sse2 4, 16, ps + IPFILTER_LUMA_sse2 32, 32, ps + IPFILTER_LUMA_sse2 32, 16, ps + IPFILTER_LUMA_sse2 16, 32, ps + IPFILTER_LUMA_sse2 32, 24, ps + IPFILTER_LUMA_sse2 24, 32, ps + IPFILTER_LUMA_sse2 32, 8, ps + IPFILTER_LUMA_sse2 8, 32, ps + IPFILTER_LUMA_sse2 64, 64, ps + IPFILTER_LUMA_sse2 64, 32, ps + IPFILTER_LUMA_sse2 32, 64, ps + IPFILTER_LUMA_sse2 64, 48, ps + IPFILTER_LUMA_sse2 48, 64, ps + IPFILTER_LUMA_sse2 64, 16, ps + IPFILTER_LUMA_sse2 16, 64, ps + diff --git a/source/common/x86/h-ipfilter8.h b/source/common/x86/h-ipfilter8.h new file mode 100644 index 
0000000000..45ea99293d --- /dev/null +++ b/source/common/x86/h-ipfilter8.h @@ -0,0 +1,39 @@ +/***************************************************************************** +* Copyright (C) 2013-2017 MulticoreWare, Inc +* +* Authors: Steve Borho +* Praveen Kuamr Tiwari +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*****************************************************************************/ + +#ifndef X265_H_IPFILTER8_H +#define X265_H_IPFILTER8_H + + +#define SETUP_H_FUNC_DEF(cpu) \ + FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ + FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); + +SETUP_H_FUNC_DEF(sse2); +SETUP_H_FUNC_DEF(ssse3); +SETUP_H_FUNC_DEF(sse3); +SETUP_H_FUNC_DEF(sse4); +SETUP_H_FUNC_DEF(avx2); + +#endif // ifndef X265_H_IPFILTER8_H diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index 36d8986b5a..dc3f04add0 100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -855,137 +855,6 @@ cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride psrldq m1, 2 pshufd m1, m1, q3120 %endmacro - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_LUMA_sse2 3 -INIT_XMM sse2 -cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8 - mov r4d, r4m - add r4d, r4d - pxor m6, m6 - -%ifidn %3, ps - add r3d, r3d - cmp r5m, byte 0 -%endif - -%ifdef PIC - lea r5, [tabw_LumaCoeff] - movu m3, [r5 + r4 * 8] -%else - movu m3, [tabw_LumaCoeff + r4 * 8] -%endif - - mov r4d, %2 - -%ifidn %3, pp - mova m2, [pw_32] -%else - mova m2, [pw_2000] - je .loopH - lea r5, [r1 + 2 * r1] - sub r0, r5 - add r4d, 7 -%endif - -.loopH: -%assign x 0 -%rep %1 / 8 - FILTER_H8_W8_sse2 - %ifidn %3, pp - paddw m1, m2 - psraw m1, 6 - packuswb m1, m1 - movh [r2 + x], m1 - %else - psubw m1, m2 - movu [r2 + 2 * x], m1 - %endif -%assign 
x x+8 -%endrep - -%rep (%1 % 8) / 4 - FILTER_H8_W4_sse2 - %ifidn %3, pp - paddw m1, m2 - psraw m1, 6 - packuswb m1, m1 - movd [r2 + x], m1 - %else - psubw m1, m2 - movh [r2 + 2 * x], m1 - %endif -%endrep - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET - -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - IPFILTER_LUMA_sse2 4, 4, pp - IPFILTER_LUMA_sse2 4, 8, pp - IPFILTER_LUMA_sse2 8, 4, pp - IPFILTER_LUMA_sse2 8, 8, pp - IPFILTER_LUMA_sse2 16, 16, pp - IPFILTER_LUMA_sse2 16, 8, pp - IPFILTER_LUMA_sse2 8, 16, pp - IPFILTER_LUMA_sse2 16, 12, pp - IPFILTER_LUMA_sse2 12, 16, pp - IPFILTER_LUMA_sse2 16, 4, pp - IPFILTER_LUMA_sse2 4, 16, pp - IPFILTER_LUMA_sse2 32, 32, pp - IPFILTER_LUMA_sse2 32, 16, pp - IPFILTER_LUMA_sse2 16, 32, pp - IPFILTER_LUMA_sse2 32, 24, pp - IPFILTER_LUMA_sse2 24, 32, pp - IPFILTER_LUMA_sse2 32, 8, pp - IPFILTER_LUMA_sse2 8, 32, pp - IPFILTER_LUMA_sse2 64, 64, pp - IPFILTER_LUMA_sse2 64, 32, pp - IPFILTER_LUMA_sse2 32, 64, pp - IPFILTER_LUMA_sse2 64, 48, pp - IPFILTER_LUMA_sse2 48, 64, pp - IPFILTER_LUMA_sse2 64, 16, pp - IPFILTER_LUMA_sse2 16, 64, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- - IPFILTER_LUMA_sse2 4, 4, ps - IPFILTER_LUMA_sse2 8, 8, ps - IPFILTER_LUMA_sse2 8, 4, ps - IPFILTER_LUMA_sse2 4, 8, ps - IPFILTER_LUMA_sse2 16, 16, ps - IPFILTER_LUMA_sse2 16, 8, ps - IPFILTER_LUMA_sse2 8, 16, ps 
- IPFILTER_LUMA_sse2 16, 12, ps - IPFILTER_LUMA_sse2 12, 16, ps - IPFILTER_LUMA_sse2 16, 4, ps - IPFILTER_LUMA_sse2 4, 16, ps - IPFILTER_LUMA_sse2 32, 32, ps - IPFILTER_LUMA_sse2 32, 16, ps - IPFILTER_LUMA_sse2 16, 32, ps - IPFILTER_LUMA_sse2 32, 24, ps - IPFILTER_LUMA_sse2 24, 32, ps - IPFILTER_LUMA_sse2 32, 8, ps - IPFILTER_LUMA_sse2 8, 32, ps - IPFILTER_LUMA_sse2 64, 64, ps - IPFILTER_LUMA_sse2 64, 32, ps - IPFILTER_LUMA_sse2 32, 64, ps - IPFILTER_LUMA_sse2 64, 48, ps - IPFILTER_LUMA_sse2 48, 64, ps - IPFILTER_LUMA_sse2 64, 16, ps - IPFILTER_LUMA_sse2 16, 64, ps - %macro PROCESS_LUMA_W4_4R_sse2 0 movd m2, [r0] movd m7, [r0 + r1] From c01c61058ff7ed31ee456688c1a8b5e0096b552f Mon Sep 17 00:00:00 2001 From: Radhakrishnan Date: Tue, 20 Feb 2018 14:59:18 +0530 Subject: [PATCH 44/51] x86: Remove header file for horizontal ipfilters Since all ipfilter kernels share the ipfilter8.h file, a separate header file isn't needed for horizontal ipfilter kernels. So this patch removes the duplicate header file --- source/common/CMakeLists.txt | 2 +- source/common/x86/asm-primitives.cpp | 1 - source/common/x86/h-ipfilter8.h | 39 ---------------------- 3 files changed, 1 insertion(+), 41 deletions(-) delete mode 100644 source/common/x86/h-ipfilter8.h diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 021645337c..ab980a223e 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -56,7 +56,7 @@ if(ENABLE_ASSEMBLY AND X86) endif() set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES}) source_group(Intrinsics FILES ${VEC_PRIMITIVES}) - set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h h-ipfilter8.h loopfilter.h seaintegral.h) + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h) set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) diff --git 
a/source/common/x86/asm-primitives.cpp b/source/common/x86/asm-primitives.cpp index 0e54329bc1..cf9c7de59b 100644 --- a/source/common/x86/asm-primitives.cpp +++ b/source/common/x86/asm-primitives.cpp @@ -115,7 +115,6 @@ extern "C" { #include "intrapred.h" #include "dct8.h" #include "seaintegral.h" -#include "h-ipfilter8.h" } #define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \ p.cu[BLOCK_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \ diff --git a/source/common/x86/h-ipfilter8.h b/source/common/x86/h-ipfilter8.h deleted file mode 100644 index 45ea99293d..0000000000 --- a/source/common/x86/h-ipfilter8.h +++ /dev/null @@ -1,39 +0,0 @@ -/***************************************************************************** -* Copyright (C) 2013-2017 MulticoreWare, Inc -* -* Authors: Steve Borho -* Praveen Kuamr Tiwari -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with this program; if not, write to the Free Software -* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. -* -* This program is also available under a commercial proprietary license. -* For more information, contact us at license @ x265.com. 
-*****************************************************************************/ - -#ifndef X265_H_IPFILTER8_H -#define X265_H_IPFILTER8_H - - -#define SETUP_H_FUNC_DEF(cpu) \ - FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \ - FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); - -SETUP_H_FUNC_DEF(sse2); -SETUP_H_FUNC_DEF(ssse3); -SETUP_H_FUNC_DEF(sse3); -SETUP_H_FUNC_DEF(sse4); -SETUP_H_FUNC_DEF(avx2); - -#endif // ifndef X265_H_IPFILTER8_H From 3f7829dd497154ea0b62dd1d47db16f3634a8556 Mon Sep 17 00:00:00 2001 From: Radhakrishnan Date: Wed, 24 Jan 2018 16:18:24 +0530 Subject: [PATCH 45/51] x86: Split ipfilter8 kernels into two different source files, part2 This patch ports the horizontal 4tap and 8tap kernels from ipfilter8.asm to a different source file h-ipfilter8.asm. It improves the build time by 20%. --- source/common/x86/h-ipfilter8.asm | 6469 ++++++ source/common/x86/ipfilter8.asm | 34217 ++++++++++++---------------- 2 files changed, 20558 insertions(+), 20128 deletions(-) diff --git a/source/common/x86/h-ipfilter8.asm b/source/common/x86/h-ipfilter8.asm index b5ce12c1ea..48024c7740 100644 --- a/source/common/x86/h-ipfilter8.asm +++ b/source/common/x86/h-ipfilter8.asm @@ -33,10 +33,459 @@ const h_tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0 dw -1, 4, -11, 40, 40, -11, 4, -1 dw 0, 1, -5, 17, 58, -10, 4, -1 +const h_tabw_ChromaCoeff, dw 0, 64, 0, 0 + dw -2, 58, 10, -2 + dw -4, 54, 16, -2 + dw -6, 46, 28, -4 + dw -4, 36, 36, -4 + dw -4, 28, 46, -6 + dw -2, 16, 54, -4 + dw -2, 10, 58, -2 + +const h_tab_ChromaCoeff, db 0, 64, 0, 0 + db -2, 58, 10, -2 + db -4, 54, 16, -2 + db -6, 46, 28, -4 + db -4, 36, 36, -4 + db -4, 28, 46, -6 + db -2, 16, 54, -4 + db -2, 10, 58, -2 + +const h_tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0 + db -1, 4, -10, 58, 17, -5, 1, 0 + db -1, 4, -11, 40, 40, -11, 4, -1 + 
db 0, 1, -5, 17, 58, -10, 4, -1 + +const h_tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 + + +const h_tab_Lm, db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 + db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 + db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 + db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 + +const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +const h_interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + +const interp4_hpp_shuf, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + +const interp4_horiz_shuf1, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + +const h_pd_526336, times 8 dd 8192*64+2048 + +const pb_LumaCoeffVer, times 16 db 0, 0 + times 16 db 0, 64 + times 16 db 0, 0 + times 16 db 0, 0 + + times 16 db -1, 4 + times 16 db -10, 58 + times 16 db 17, -5 + times 16 db 1, 0 + + times 16 db -1, 4 + times 16 db -11, 40 + times 16 db 40, -11 + times 16 db 4, -1 + + times 16 db 0, 1 + times 16 db -5, 17 + times 16 db 58, -10 + times 16 db 4, -1 + +const h_pw_LumaCoeffVer, times 8 dw 0, 0 + times 8 dw 0, 64 + times 8 dw 0, 0 + times 8 dw 0, 0 + + times 8 dw -1, 4 + times 8 dw -10, 58 + times 8 dw 17, -5 + times 8 dw 1, 0 + + times 8 dw -1, 4 + times 8 dw -11, 40 + times 8 dw 40, -11 + times 8 dw 4, -1 + + times 8 dw 0, 1 + times 8 dw -5, 17 + times 8 dw 58, -10 + times 8 dw 4, -1 + +const pb_8tap_hps_0, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10 + times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12 + times 2 db 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14 + +ALIGN 32 +interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + SECTION .text +cextern pw_1 cextern pw_32 cextern pw_2000 +cextern pw_512 
+ +%macro PROCESS_LUMA_AVX2_W8_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, 
m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,sp + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 +%ifidn %1,sp + packuswb m0, m2 + mova m5, [h_interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 
1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,sp + paddd m6, m14 + paddd m7, m14 + psrad m6, 12 + psrad m7, 12 +%else + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] + +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m3, m1, [r5 + 2 * mmsize] + paddd m11, m3 + pmaddwd m3, m1, [r5 + 1 * mmsize] + paddd m13, m3 + pmaddwd m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m2, m3, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m3, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m3, [r5 + 1 * mmsize] + 
paddd m1, m3 + movu xm2, [r7 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhwd xm3, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 + +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 + psrad m12, 12 + psrad m13, 12 + psrad m0, 12 + psrad m1, 12 +%else + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + lea r8, [r8 + r3 * 4] + +%ifidn %1,sp + packuswb m8, m10 + packuswb m12, m0 + vpermd m8, m5, m8 + vpermd m12, m5, m12 + vextracti128 xm10, m8, 1 + vextracti128 xm0, m12, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm10 + movhps [r8 + r6], xm10 + lea r8, [r8 + r3 * 4] + movq [r8], xm12 + movhps [r8 + r3], xm12 + movq [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm0 +%else + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + 
movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 +%endif +%endmacro %macro FILTER_H8_W8_sse2 0 movh m1, [r0 + x - 3] @@ -265,3 +714,6023 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8 IPFILTER_LUMA_sse2 64, 16, ps IPFILTER_LUMA_sse2 16, 64, ps +%macro FILTER_H4_w2_2_sse2 0 + pxor m3, m3 + movd m0, [srcq - 1] + movd m2, [srcq] + punpckldq m0, m2 + punpcklbw m0, m3 + movd m1, [srcq + srcstrideq - 1] + movd m2, [srcq + srcstrideq] + punpckldq m1, m2 + punpcklbw m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m4 + packssdw m0, m1 + pshuflw m1, m0, q2301 + pshufhw m1, m1, q2301 + paddw m0, m1 + psrld m0, 16 + packssdw m0, m0 + paddw m0, m5 + psraw m0, 6 + packuswb m0, m0 + movd r4d, m0 + mov [dstq], r4w + shr r4, 16 + mov [dstq + dststrideq], r4w +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_H4_W2xN_sse3 1 +INIT_XMM sse3 +cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride + mov r4d, r4m + mova m5, [pw_32] + +%ifdef PIC + lea r5, [h_tabw_ChromaCoeff] + movddup m4, [r5 + r4 * 8] +%else + movddup m4, [h_tabw_ChromaCoeff + r4 * 8] +%endif + +%assign x 1 +%rep %1/2 + FILTER_H4_w2_2_sse2 +%if x < %1/2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endif +%assign x x+1 +%endrep + + RET + +%endmacro + + FILTER_H4_W2xN_sse3 4 + FILTER_H4_W2xN_sse3 8 + FILTER_H4_W2xN_sse3 16 + +%macro FILTER_H4_w4_2_sse2 0 + pxor m5, m5 + movd m0, [srcq - 1] + movd m6, [srcq] + punpckldq m0, m6 + punpcklbw m0, m5 + movd m1, [srcq + 1] + movd m6, [srcq + 2] + punpckldq m1, m6 + punpcklbw m1, m5 + movd m2, [srcq + srcstrideq - 1] + movd m6, [srcq + srcstrideq] + punpckldq m2, m6 + punpcklbw m2, m5 + movd m3, [srcq + srcstrideq + 1] + movd m6, [srcq + srcstrideq + 2] + punpckldq m3, m6 + 
punpcklbw m3, m5 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m4 + pmaddwd m3, m4 + packssdw m0, m1 + packssdw m2, m3 + pshuflw m1, m0, q2301 + pshufhw m1, m1, q2301 + pshuflw m3, m2, q2301 + pshufhw m3, m3, q2301 + paddw m0, m1 + paddw m2, m3 + psrld m0, 16 + psrld m2, 16 + packssdw m0, m2 + paddw m0, m7 + psraw m0, 6 + packuswb m0, m2 + movd [dstq], m0 + psrldq m0, 4 + movd [dstq + dststrideq], m0 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_H4_W4xN_sse3 1 +INIT_XMM sse3 +cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride + mov r4d, r4m + mova m7, [pw_32] + +%ifdef PIC + lea r5, [h_tabw_ChromaCoeff] + movddup m4, [r5 + r4 * 8] +%else + movddup m4, [h_tabw_ChromaCoeff + r4 * 8] +%endif + +%assign x 1 +%rep %1/2 + FILTER_H4_w4_2_sse2 +%if x < %1/2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endif +%assign x x+1 +%endrep + + RET + +%endmacro + + FILTER_H4_W4xN_sse3 2 + FILTER_H4_W4xN_sse3 4 + FILTER_H4_W4xN_sse3 8 + FILTER_H4_W4xN_sse3 16 + FILTER_H4_W4xN_sse3 32 + +%macro FILTER_H4_w6_sse2 0 + pxor m4, m4 + movh m0, [srcq - 1] + movh m5, [srcq] + punpckldq m0, m5 + movhlps m2, m0 + punpcklbw m0, m4 + punpcklbw m2, m4 + movd m1, [srcq + 1] + movd m5, [srcq + 2] + punpckldq m1, m5 + punpcklbw m1, m4 + pmaddwd m0, m6 + pmaddwd m1, m6 + pmaddwd m2, m6 + packssdw m0, m1 + packssdw m2, m2 + pshuflw m1, m0, q2301 + pshufhw m1, m1, q2301 + pshuflw m3, m2, q2301 + paddw m0, m1 + paddw m2, m3 + psrld m0, 16 + psrld m2, 16 + packssdw m0, m2 + paddw m0, m7 + psraw m0, 6 + packuswb m0, m0 + movd [dstq], m0 + pextrw r4d, m0, 2 + mov [dstq + 4], r4w +%endmacro + +%macro FILH4W8_sse2 1 + movh m0, [srcq - 1 + %1] + movh m5, [srcq + %1] + punpckldq m0, m5 + movhlps m2, m0 + 
punpcklbw m0, m4 + punpcklbw m2, m4 + movh m1, [srcq + 1 + %1] + movh m5, [srcq + 2 + %1] + punpckldq m1, m5 + movhlps m3, m1 + punpcklbw m1, m4 + punpcklbw m3, m4 + pmaddwd m0, m6 + pmaddwd m1, m6 + pmaddwd m2, m6 + pmaddwd m3, m6 + packssdw m0, m1 + packssdw m2, m3 + pshuflw m1, m0, q2301 + pshufhw m1, m1, q2301 + pshuflw m3, m2, q2301 + pshufhw m3, m3, q2301 + paddw m0, m1 + paddw m2, m3 + psrld m0, 16 + psrld m2, 16 + packssdw m0, m2 + paddw m0, m7 + psraw m0, 6 + packuswb m0, m0 + movh [dstq + %1], m0 +%endmacro + +%macro FILTER_H4_w8_sse2 0 + FILH4W8_sse2 0 +%endmacro + +%macro FILTER_H4_w12_sse2 0 + FILH4W8_sse2 0 + movd m1, [srcq - 1 + 8] + movd m3, [srcq + 8] + punpckldq m1, m3 + punpcklbw m1, m4 + movd m2, [srcq + 1 + 8] + movd m3, [srcq + 2 + 8] + punpckldq m2, m3 + punpcklbw m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m6 + packssdw m1, m2 + pshuflw m2, m1, q2301 + pshufhw m2, m2, q2301 + paddw m1, m2 + psrld m1, 16 + packssdw m1, m1 + paddw m1, m7 + psraw m1, 6 + packuswb m1, m1 + movd [dstq + 8], m1 +%endmacro + +%macro FILTER_H4_w16_sse2 0 + FILH4W8_sse2 0 + FILH4W8_sse2 8 +%endmacro + +%macro FILTER_H4_w24_sse2 0 + FILH4W8_sse2 0 + FILH4W8_sse2 8 + FILH4W8_sse2 16 +%endmacro + +%macro FILTER_H4_w32_sse2 0 + FILH4W8_sse2 0 + FILH4W8_sse2 8 + FILH4W8_sse2 16 + FILH4W8_sse2 24 +%endmacro + +%macro FILTER_H4_w48_sse2 0 + FILH4W8_sse2 0 + FILH4W8_sse2 8 + FILH4W8_sse2 16 + FILH4W8_sse2 24 + FILH4W8_sse2 32 + FILH4W8_sse2 40 +%endmacro + +%macro FILTER_H4_w64_sse2 0 + FILH4W8_sse2 0 + FILH4W8_sse2 8 + FILH4W8_sse2 16 + FILH4W8_sse2 24 + FILH4W8_sse2 32 + FILH4W8_sse2 40 + FILH4W8_sse2 48 + FILH4W8_sse2 56 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_sse3 2 +INIT_XMM sse3 +cglobal 
interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride + mov r4d, r4m + mova m7, [pw_32] + pxor m4, m4 + +%ifdef PIC + lea r5, [h_tabw_ChromaCoeff] + movddup m6, [r5 + r4 * 8] +%else + movddup m6, [h_tabw_ChromaCoeff + r4 * 8] +%endif + +%assign x 1 +%rep %2 + FILTER_H4_w%1_sse2 +%if x < %2 + add srcq, srcstrideq + add dstq, dststrideq +%endif +%assign x x+1 +%endrep + + RET + +%endmacro + + IPFILTER_CHROMA_sse3 6, 8 + IPFILTER_CHROMA_sse3 8, 2 + IPFILTER_CHROMA_sse3 8, 4 + IPFILTER_CHROMA_sse3 8, 6 + IPFILTER_CHROMA_sse3 8, 8 + IPFILTER_CHROMA_sse3 8, 16 + IPFILTER_CHROMA_sse3 8, 32 + IPFILTER_CHROMA_sse3 12, 16 + + IPFILTER_CHROMA_sse3 6, 16 + IPFILTER_CHROMA_sse3 8, 12 + IPFILTER_CHROMA_sse3 8, 64 + IPFILTER_CHROMA_sse3 12, 32 + + IPFILTER_CHROMA_sse3 16, 4 + IPFILTER_CHROMA_sse3 16, 8 + IPFILTER_CHROMA_sse3 16, 12 + IPFILTER_CHROMA_sse3 16, 16 + IPFILTER_CHROMA_sse3 16, 32 + IPFILTER_CHROMA_sse3 32, 8 + IPFILTER_CHROMA_sse3 32, 16 + IPFILTER_CHROMA_sse3 32, 24 + IPFILTER_CHROMA_sse3 24, 32 + IPFILTER_CHROMA_sse3 32, 32 + + IPFILTER_CHROMA_sse3 16, 24 + IPFILTER_CHROMA_sse3 16, 64 + IPFILTER_CHROMA_sse3 32, 48 + IPFILTER_CHROMA_sse3 24, 64 + IPFILTER_CHROMA_sse3 32, 64 + + IPFILTER_CHROMA_sse3 64, 64 + IPFILTER_CHROMA_sse3 64, 32 + IPFILTER_CHROMA_sse3 64, 48 + IPFILTER_CHROMA_sse3 48, 64 + IPFILTER_CHROMA_sse3 64, 16 + +%macro FILTER_2 2 + movd m3, [srcq + %1] + movd m4, [srcq + 1 + %1] + punpckldq m3, m4 + punpcklbw m3, m0 + pmaddwd m3, m1 + packssdw m3, m3 + pshuflw m4, m3, q2301 + paddw m3, m4 + psrldq m3, 2 + psubw m3, m2 + movd [dstq + %2], m3 +%endmacro + +%macro FILTER_4 2 + movd m3, [srcq + %1] + movd m4, [srcq + 1 + %1] + punpckldq m3, m4 + punpcklbw m3, m0 + pmaddwd m3, m1 + movd m4, [srcq + 2 + %1] + movd m5, [srcq + 3 + %1] + punpckldq m4, m5 + punpcklbw m4, m0 + pmaddwd m4, m1 + packssdw m3, m4 + pshuflw m4, m3, q2301 + pshufhw m4, m4, q2301 + paddw m3, m4 + psrldq m3, 2 + pshufd m3, m3, q3120 + psubw m3, m2 + movh [dstq + %2], m3 
+%endmacro + +%macro FILTER_4TAP_HPS_sse3 2 +INIT_XMM sse3 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride + mov r4d, r4m + add dststrided, dststrided + mova m2, [pw_2000] + pxor m0, m0 + +%ifdef PIC + lea r6, [h_tabw_ChromaCoeff] + movddup m1, [r6 + r4 * 8] +%else + movddup m1, [h_tabw_ChromaCoeff + r4 * 8] +%endif + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: +%assign x -1 +%assign y 0 +%rep %1/4 + FILTER_4 x,y +%assign x x+4 +%assign y y+8 +%endrep +%rep (%1 % 4)/2 + FILTER_2 x,y +%endrep + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + RET + +%endmacro + + FILTER_4TAP_HPS_sse3 2, 4 + FILTER_4TAP_HPS_sse3 2, 8 + FILTER_4TAP_HPS_sse3 2, 16 + FILTER_4TAP_HPS_sse3 4, 2 + FILTER_4TAP_HPS_sse3 4, 4 + FILTER_4TAP_HPS_sse3 4, 8 + FILTER_4TAP_HPS_sse3 4, 16 + FILTER_4TAP_HPS_sse3 4, 32 + FILTER_4TAP_HPS_sse3 6, 8 + FILTER_4TAP_HPS_sse3 6, 16 + FILTER_4TAP_HPS_sse3 8, 2 + FILTER_4TAP_HPS_sse3 8, 4 + FILTER_4TAP_HPS_sse3 8, 6 + FILTER_4TAP_HPS_sse3 8, 8 + FILTER_4TAP_HPS_sse3 8, 12 + FILTER_4TAP_HPS_sse3 8, 16 + FILTER_4TAP_HPS_sse3 8, 32 + FILTER_4TAP_HPS_sse3 8, 64 + FILTER_4TAP_HPS_sse3 12, 16 + FILTER_4TAP_HPS_sse3 12, 32 + FILTER_4TAP_HPS_sse3 16, 4 + FILTER_4TAP_HPS_sse3 16, 8 + FILTER_4TAP_HPS_sse3 16, 12 + FILTER_4TAP_HPS_sse3 16, 16 + FILTER_4TAP_HPS_sse3 16, 24 + FILTER_4TAP_HPS_sse3 16, 32 + FILTER_4TAP_HPS_sse3 16, 64 + FILTER_4TAP_HPS_sse3 24, 32 + FILTER_4TAP_HPS_sse3 24, 64 + FILTER_4TAP_HPS_sse3 32, 8 + FILTER_4TAP_HPS_sse3 32, 16 + FILTER_4TAP_HPS_sse3 32, 24 + FILTER_4TAP_HPS_sse3 32, 32 + FILTER_4TAP_HPS_sse3 32, 48 + FILTER_4TAP_HPS_sse3 32, 64 + FILTER_4TAP_HPS_sse3 48, 64 + FILTER_4TAP_HPS_sse3 64, 16 + FILTER_4TAP_HPS_sse3 64, 32 + FILTER_4TAP_HPS_sse3 64, 48 + FILTER_4TAP_HPS_sse3 64, 64 + +%macro FILTER_H4_w2_2 3 + movh %2, [srcq - 1] + pshufb %2, %2, Tm0 + movh %1, [srcq + srcstrideq - 1] + pshufb %1, %1, Tm0 + punpcklqdq %2, %1 + pmaddubsw %2, 
coef2 + phaddw %2, %2 + pmulhrsw %2, %3 + packuswb %2, %2 + movd r4d, %2 + mov [dstq], r4w + shr r4, 16 + mov [dstq + dststrideq], r4w +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + +%rep 2 + FILTER_H4_w2_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endrep + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + +%rep 4 + FILTER_H4_w2_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endrep + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- 
+INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + + mov r5d, 16/2 + +.loop: + FILTER_H4_w2_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] + dec r5d + jnz .loop + + RET + +%macro FILTER_H4_w4_2 3 + movh %2, [srcq - 1] + pshufb %2, %2, Tm0 + pmaddubsw %2, coef2 + movh %1, [srcq + srcstrideq - 1] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movd [dstq], %2 + palignr %2, %2, 4 + movd [dstq + dststrideq], %2 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + + FILTER_H4_w4_2 t0, t1, t2 + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, 
r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + +%rep 2 + FILTER_H4_w4_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endrep + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + +%rep 4 + FILTER_H4_w4_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endrep + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + +%rep 8 + FILTER_H4_w4_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] +%endrep + + RET + +;----------------------------------------------------------------------------- +; void 
interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride +%define coef2 m4 +%define Tm0 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + + mov r5d, 32/2 + +.loop: + FILTER_H4_w4_2 t0, t1, t2 + lea srcq, [srcq + srcstrideq * 2] + lea dstq, [dstq + dststrideq * 2] + dec r5d + jnz .loop + + RET + +ALIGN 32 +const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + +%macro FILTER_H4_w6 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movd [dstq], %2 + pextrw [dstq + 4], %2, 2 +%endmacro + +%macro FILTER_H4_w8 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movh [dstq], %2 +%endmacro + +%macro FILTER_H4_w12 3 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + movu %1, [srcq - 1 + 8] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %1, %1 + pmulhrsw %1, %3 + packuswb %2, %1 + movh [dstq], %2 + pextrd [dstq + 8], %2, 2 +%endmacro + +%macro FILTER_H4_w16 4 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 +%endmacro + +%macro FILTER_H4_w24 4 + movu %1, 
[srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 + movu %1, [srcq - 1 + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + pmulhrsw %2, %3 + packuswb %2, %2 + movh [dstq + 16], %2 +%endmacro + +%macro FILTER_H4_w32 4 + movu %1, [srcq - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq], %2 + movu %1, [srcq - 1 + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq - 1 + 24] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq + 16], %2 +%endmacro + +%macro FILTER_H4_w16o 5 + movu %1, [srcq + %5 - 1] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + %5 - 1 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + pmulhrsw %2, %3 + pmulhrsw %4, %3 + packuswb %2, %4 + movu [dstq + %5], %2 +%endmacro + +%macro FILTER_H4_w48 4 + FILTER_H4_w16o %1, %2, %3, %4, 0 + FILTER_H4_w16o %1, %2, %3, %4, 16 + FILTER_H4_w16o %1, %2, %3, %4, 32 +%endmacro + +%macro FILTER_H4_w64 4 + FILTER_H4_w16o %1, %2, %3, %4, 0 + FILTER_H4_w16o %1, %2, %3, %4, 16 + FILTER_H4_w16o %1, %2, %3, %4, 32 + FILTER_H4_w16o %1, %2, %3, %4, 48 +%endmacro + +;----------------------------------------------------------------------------- +; void 
interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro IPFILTER_CHROMA 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mov r5d, %2 + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + mova Tm1, [h_tab_Tm + 16] + +.loop: + FILTER_H4_w%1 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r5d + jnz .loop + + RET +%endmacro + + + IPFILTER_CHROMA 6, 8 + IPFILTER_CHROMA 8, 2 + IPFILTER_CHROMA 8, 4 + IPFILTER_CHROMA 8, 6 + IPFILTER_CHROMA 8, 8 + IPFILTER_CHROMA 8, 16 + IPFILTER_CHROMA 8, 32 + IPFILTER_CHROMA 12, 16 + + IPFILTER_CHROMA 6, 16 + IPFILTER_CHROMA 8, 12 + IPFILTER_CHROMA 8, 64 + IPFILTER_CHROMA 12, 32 + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_W 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride +%define coef2 m6 +%define Tm0 m5 +%define Tm1 m4 +%define t3 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + movd coef2, [r5 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mov r5d, %2 + + pshufd coef2, coef2, 0 + mova t2, [pw_512] + mova Tm0, [h_tab_Tm] + mova Tm1, [h_tab_Tm + 16] + +.loop: + FILTER_H4_w%1 t0, t1, t2, t3 + add srcq, srcstrideq + add dstq, dststrideq + + dec r5d + jnz .loop + + RET +%endmacro + + IPFILTER_CHROMA_W 16, 4 
+ IPFILTER_CHROMA_W 16, 8 + IPFILTER_CHROMA_W 16, 12 + IPFILTER_CHROMA_W 16, 16 + IPFILTER_CHROMA_W 16, 32 + IPFILTER_CHROMA_W 32, 8 + IPFILTER_CHROMA_W 32, 16 + IPFILTER_CHROMA_W 32, 24 + IPFILTER_CHROMA_W 24, 32 + IPFILTER_CHROMA_W 32, 32 + + IPFILTER_CHROMA_W 16, 24 + IPFILTER_CHROMA_W 16, 64 + IPFILTER_CHROMA_W 32, 48 + IPFILTER_CHROMA_W 24, 64 + IPFILTER_CHROMA_W 32, 64 + + IPFILTER_CHROMA_W 64, 64 + IPFILTER_CHROMA_W 64, 32 + IPFILTER_CHROMA_W 64, 48 + IPFILTER_CHROMA_W 48, 64 + IPFILTER_CHROMA_W 64, 16 + +%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst + movu %1, %7 + pshufb %2, %1, [h_tab_Lm + 0] + pmaddubsw %2, %5 + pshufb %3, %1, [h_tab_Lm + 16] + pmaddubsw %3, %5 + phaddw %2, %3 + pshufb %4, %1, [h_tab_Lm + 32] + pmaddubsw %4, %5 + pshufb %1, %1, [h_tab_Lm + 48] + pmaddubsw %1, %5 + phaddw %4, %1 + phaddw %2, %4 + %if %0 == 8 + pmulhrsw %2, %6 + packuswb %2, %2 + movh %8, %2 + %endif +%endmacro + +%macro FILTER_H8_W4 2 + movu %1, [r0 - 3 + r5] + pshufb %2, %1, [h_tab_Lm] + pmaddubsw %2, m3 + pshufb m7, %1, [h_tab_Lm + 16] + pmaddubsw m7, m3 + phaddw %2, m7 + phaddw %2, %2 +%endmacro + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_LUMA 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8 + + mov r4d, r4m + +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + movh m3, [r6 + r4 * 8] +%else + movh m3, [h_tab_LumaCoeff + r4 * 8] +%endif + punpcklqdq m3, m3 + +%ifidn %3, pp + mova m2, [pw_512] +%else + mova m2, [pw_2000] +%endif + + mov r4d, %2 +%ifidn %3, ps + add r3, r3 + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + xor r5, r5 +%rep %1 / 8 + %ifidn 
%3, pp + FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5] + %else + FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5] + psubw m1, m2 + movu [r2 + 2 * r5], m1 + %endif + add r5, 8 +%endrep + +%rep (%1 % 8) / 4 + FILTER_H8_W4 m0, m1 + %ifidn %3, pp + pmulhrsw m1, m2 + packuswb m1, m1 + movd [r2 + r5], m1 + %else + psubw m1, m2 + movh [r2 + 2 * r5], m1 + %endif +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_4x4, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [h_tab_Lm] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + sub r0, 3 + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + + packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0] + + lea r0, [r3 * 3] + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + pextrd [r2+r3*2], xm3, 1 + pextrd [r2+r0], xm3, 3 + RET + +%macro FILTER_HORIZ_LUMA_AVX2_4xN 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_pp_4x%1, 4, 6, 9 + mov 
r4d, r4m + +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [h_tab_Lm] + mova m2, [pw_1] + mova m7, [h_interp8_hps_shuf] + mova m8, [pw_512] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + sub r0, 3 +%rep %1 / 8 + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + + packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + lea r0, [r0 + r1 * 4] + ; Row 4-5 + vbroadcasti128 m5, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m5, m4 ; DWORD [R5D R5C R4D R4C R5B R5A R4B R4A] + + ; Row 6-7 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m6, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m1 + pmaddubsw m6, m0 + pmaddwd m6, m2 + phaddd m4, m6 ; DWORD [R7D R7C R6D R6C R7B R7A R6B R6A] + + packssdw m5, m4 ; WORD [R7D R7C R6D R6C R5D R5C R4D R4C R7B R7A R6B R6A R5B R5A R4B R4A] + vpermd m3, m7, m3 + vpermd m5, m7, m5 + pmulhrsw m3, m8 + pmulhrsw m5, m8 + packuswb m3, m5 + vextracti128 xm5, m3, 1 + + movd [r2], xm3 + pextrd [r2 + 
r3], xm3, 1 + movd [r2 + r3 * 2], xm5 + pextrd [r2 + r5], xm5, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm3, 2 + pextrd [r2 + r3], xm3, 3 + pextrd [r2 + r3 * 2], xm5, 2 + pextrd [r2 + r5], xm5, 3 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] +%endrep + RET +%endif +%endmacro + + FILTER_HORIZ_LUMA_AVX2_4xN 8 + FILTER_HORIZ_LUMA_AVX2_4xN 16 + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [h_tab_Lm] + mova m2, [h_tab_Lm + 32] + + ; register map + ; m0 - interpolate coeff + ; m1, m2 - shuffle order table + + sub r0, 3 + lea r5, [r1 * 3] + lea r4, [r3 * 3] + + ; Row 0 + vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m2 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + + phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] + pmulhrsw m3, [pw_512] + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + ; Row 3 + vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m5, m2 + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddubsw m6, m0 + phaddw m5, m6 + + phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + punpcklwd xm5, xm3, xm4 + + movq [r2], xm5 + movhps [r2 + r3], xm5 + + punpckhwd xm5, xm3, xm4 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm5 + RET + +%macro IPFILTER_LUMA_AVX2_8xN 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + 
vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [h_tab_Lm] + mova m2, [h_tab_Lm + 32] + + ; register map + ; m0 - interpolate coeff + ; m1, m2 - shuffle order table + + sub r0, 3 + lea r5, [r1 * 3] + lea r6, [r3 * 3] + mov r4d, %2 / 4 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m2 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + + phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] + pmulhrsw m3, [pw_512] + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m2 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 + ; Row 3 + vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m5, m2 + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddubsw m6, m0 + phaddw m5, m6 + + phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + punpcklwd xm5, xm3, xm4 + + movq [r2], xm5 + movhps [r2 + r3], xm5 + + punpckhwd xm5, xm3, xm4 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_LUMA_AVX2_8xN 8, 8 + IPFILTER_LUMA_AVX2_8xN 8, 16 + IPFILTER_LUMA_AVX2_8xN 8, 32 + +%macro IPFILTER_LUMA_AVX2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 
interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, %2/2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0 + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m2, m3 + pshufb m2, [h_tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row0 + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2+r3], xm5 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro + +%macro IPFILTER_LUMA_32x_avx2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, %2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + 
pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [h_tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET +%endmacro +%macro IPFILTER_LUMA_64x_avx2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, %2 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [h_tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + 
packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + + vbroadcasti128 m4, [r0 + 32] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 40] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + vbroadcasti128 m2, [r0 + 48] + pshufb m5, m2, m3 + pshufb m2, [h_tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 56] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2 +32], xm4 + movu [r2 + 48], xm5 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET +%endmacro + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_48x64, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, 64 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 
01 00] + pmulhrsw m4, [pw_512] + + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [h_tab_Tm] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + movu [r2], xm4 + movu [r2 + 16], xm5 + + vbroadcasti128 m4, [r0 + 32] + pshufb m5, m4, m3 + pshufb m4, [h_tab_Tm] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + vbroadcasti128 m5, [r0 + 40] + pshufb m6, m5, m3 + pshufb m5, [h_tab_Tm] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + packuswb m4, m4 + vpermq m4, m4, 11011000b + pshufd xm4, xm4, 11011000b + movu [r2 + 32], xm4 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_4x4, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vpbroadcastd m2, [pw_1] + vbroadcasti128 m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + + lea r0, [r3 * 3] + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + pextrd [r2+r3*2], xm3, 1 + pextrd [r2+r0], xm3, 3 + RET + +INIT_YMM avx2 
+cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hpp_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, m1, 1 + packssdw xm1, xm2 + pmulhrsw xm1, [pw_512] + packuswb xm1, xm1 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r4], xm1, 3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m4, [interp4_hpp_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 4 + pextrw [r2 + r4], xm1, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm1, 2 + pextrw [r2 + r3], xm1, 3 + pextrw [r2 + r3 * 2], xm1, 6 + pextrw [r2 + r4], xm1, 7 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_32x32, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate 
coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 32 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + movu [r2], m3 + lea r2, [r2 + r3] + lea r0, [r0 + r1] + dec r4d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 8 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET + +;-------------------------------------------------------------------------------------------------------------- +; void 
interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA 4, 4, pp + IPFILTER_LUMA 4, 8, pp + IPFILTER_LUMA 12, 16, pp + IPFILTER_LUMA 4, 16, pp + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x8, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + movu m1, [h_tab_Tm] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + sub r0, 1 + mov r4d, 2 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + lea r0, [r0 + r1 * 2] + + ; Row 2 + vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + mova m5, [interp_4tap_8x8_horiz_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movq [r2], xm4 + movhps [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1*2] + dec r4d + jnz .loop + RET + + IPFILTER_LUMA_AVX2 16, 4 + IPFILTER_LUMA_AVX2 16, 8 + IPFILTER_LUMA_AVX2 16, 12 + IPFILTER_LUMA_AVX2 16, 16 + IPFILTER_LUMA_AVX2 16, 32 + IPFILTER_LUMA_AVX2 16, 64 + + IPFILTER_LUMA_32x_avx2 32 , 8 + IPFILTER_LUMA_32x_avx2 32 , 16 + IPFILTER_LUMA_32x_avx2 32 , 24 + IPFILTER_LUMA_32x_avx2 32 , 32 + IPFILTER_LUMA_32x_avx2 32 , 64 + + IPFILTER_LUMA_64x_avx2 64 , 64 + 
IPFILTER_LUMA_64x_avx2 64 , 48 + IPFILTER_LUMA_64x_avx2 64 , 32 + IPFILTER_LUMA_64x_avx2 64 , 16 + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x2, 4, 6, 5 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [h_tab_Tm] + mova m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + pshufd xm3, xm3, 11011000b + movq [r2], xm3 + movhps [r2 + r3], xm3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x6, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [h_tab_Tm] + mova m2, [pw_1] + mova m6, [pw_512] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + mova m5, [h_interp8_hps_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 
+ movq [r2], xm3 + movhps [r2 + r3], xm3 + movq [r2 + r3 * 2], xm4 + movhps [r2 + r5], xm4 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + ; Row 4 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 5 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + pshufd xm3, xm3, 11011000b + movq [r2], xm3 + movhps [r2 + r3], xm3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_6x8, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [h_tab_Tm] + mova m2, [pw_1] + mova m6, [pw_512] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 +%rep 2 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + movd [r2], xm3 + pextrw [r2 + 4], xm4, 0 + pextrd [r2 + r3], xm3, 1 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm3, 2 + pextrw [r2 + r3 * 2 + 4], xm4, 4 + pextrd [r2 + r5], xm3, 3 + pextrw [r2 + r5 + 4], xm4, 6 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] +%endrep + RET + 
+;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +%macro IPFILTER_CHROMA_HPS_64xN 1 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_64x%1, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, %1 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2 + 32], m3 + + vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 40] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2 + 64], m3 + + vbroadcasti128 m3, [r0 + 48] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw 
m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 56] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2 + 96], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_HPS_64xN 64 + IPFILTER_CHROMA_HPS_64xN 32 + IPFILTER_CHROMA_HPS_64xN 48 + IPFILTER_CHROMA_HPS_64xN 16 + +;----------------------------------------------------------------------------------------------------------------------------- +;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- + +%macro IPFILTER_LUMA_PS_4xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m1, [h_tab_Lm] + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + mov r5d, %1 ; loop count variable - height + jz .preloop + lea r6, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src) - 3 * srcStride + add r5d, 7 ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) + +.preloop: + lea r6, [r3 * 3] +.loop: + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 ; shuffled based on the col order tab_Lm + pmaddubsw m3, m0 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + lea r0, [r0 + r1 * 2] ;3rd row(i.e 2nd row) + vbroadcasti128 m4, [r0] ; [x x x 
x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 ; all rows and col completed. + + mova m5, [h_interp8_hps_shuf] + vpermd m3, m5, m3 + psubw m3, m2 + + vextracti128 xm4, m3, 1 + movq [r2], xm3 ;row 0 + movhps [r2 + r3], xm3 ;row 1 + movq [r2 + r3 * 2], xm4 ;row 2 + movhps [r2 + r6], xm4 ;row 3 + + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 4] ; first loop dst ->5th row(i.e 4) + sub r5d, 4 + jz .end + cmp r5d, 4 + jge .loop + + ; Row 8-9 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 10 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m4, m4 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 + + vpermd m3, m5, m3 ; m5 don't broken in above + psubw m3, m2 + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + movq [r2 + r3 * 2], xm4 +.end: + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_4xN_AVX2 4 + IPFILTER_LUMA_PS_4xN_AVX2 8 + IPFILTER_LUMA_PS_4xN_AVX2 16 + +%macro IPFILTER_LUMA_PS_8xN_AVX2 1 +; TODO: verify and enable on X86 mode +%if ARCH_X86_64 == 1 +; void filter_hps(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_8x%1, 4,7,6 + mov r5d, r5m + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r6, [pb_LumaCoeffVer] + add r6, r4 +%else + lea r6, [pb_LumaCoeffVer + r4] +%endif + add r3d, r3d + vpbroadcastd m0, [pw_2000] + sub r0, 3 + lea r4, [pb_8tap_hps_0] + vbroadcasti128 m5, [r4 + 0 * mmsize] + + ; check row count extend for interpolateHV + test r5d, 
r5d; + mov r5d, %1 + jz .enter_loop + lea r4, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r4 ; r0(src)-r8 + add r5d, 8-1-2 ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) + +.enter_loop: + lea r4, [pb_8tap_hps_0] + + ; ***** register map ***** + ; m0 - pw_2000 + ; r4 - base pointer of shuffle order table + ; r5 - count of loop + ; r6 - point to LumaCoeff +.loop: + + ; Row 0-1 + movu xm1, [r0] + movu xm2, [r0 + r1] + vinserti128 m1, m1, xm2, 1 + pshufb m2, m1, m5 ; [0 1 1 2 2 3 3 4 ...] + pshufb m3, m1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] + pshufb m4, m1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] + pshufb m1, m1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] + pmaddubsw m2, [r6 + 0 * mmsize] + pmaddubsw m3, [r6 + 1 * mmsize] + pmaddubsw m4, [r6 + 2 * mmsize] + pmaddubsw m1, [r6 + 3 * mmsize] + paddw m2, m3 + paddw m1, m4 + paddw m1, m2 + psubw m1, m0 + + vextracti128 xm2, m1, 1 + movu [r2], xm1 ; row 0 + movu [r2 + r3], xm2 ; row 1 + + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) + sub r5d, 2 + jg .loop + jz .end + + ; last row + movu xm1, [r0] + pshufb xm2, xm1, xm5 ; [0 1 1 2 2 3 3 4 ...] + pshufb xm3, xm1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] + pshufb xm4, xm1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] + pshufb xm1, xm1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] 
+ pmaddubsw xm2, [r6 + 0 * mmsize] + pmaddubsw xm3, [r6 + 1 * mmsize] + pmaddubsw xm4, [r6 + 2 * mmsize] + pmaddubsw xm1, [r6 + 3 * mmsize] + paddw xm2, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm1, xm0 + movu [r2], xm1 ;row 0 +.end: + RET +%endif +%endmacro ; IPFILTER_LUMA_PS_8xN_AVX2 + + IPFILTER_LUMA_PS_8xN_AVX2 4 + IPFILTER_LUMA_PS_8xN_AVX2 8 + IPFILTER_LUMA_PS_8xN_AVX2 16 + IPFILTER_LUMA_PS_8xN_AVX2 32 + + +%macro IPFILTER_LUMA_PS_16x_AVX2 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r9, %2 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + xor r7, r7 ; loop count variable + sub r0, 3 + test r5d, r5d + jz .label + lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r8 ; r0(src)-r8 + add r9, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) + +.label: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m6 ;row 1 (col 4 to 7) + pshufb m4, m1 ;row 1 (col 0 to 3) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 ; all rows and col completed. 
+ + mova m5, [h_interp8_hps_shuf] + vpermd m3, m5, m3 + psubw m3, m2 + + movu [r2], m3 ;row 0 + + lea r0, [r0 + r1] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3] ; first loop dst ->5th row(i.e 4) + dec r9d + jnz .label + + RET +%endif +%endmacro + + + IPFILTER_LUMA_PS_16x_AVX2 16 , 16 + IPFILTER_LUMA_PS_16x_AVX2 16 , 8 + IPFILTER_LUMA_PS_16x_AVX2 16 , 12 + IPFILTER_LUMA_PS_16x_AVX2 16 , 4 + IPFILTER_LUMA_PS_16x_AVX2 16 , 32 + IPFILTER_LUMA_PS_16x_AVX2 16 , 64 + + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_LUMA_PP_W8 2 +INIT_XMM sse4 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + movh m3, [r5 + r4 * 8] +%else + movh m3, [h_tab_LumaCoeff + r4 * 8] +%endif + pshufd m0, m3, 0 ; m0 = coeff-L + pshufd m1, m3, 0x55 ; m1 = coeff-H + lea r5, [h_tab_Tm] ; r5 = shuffle + mova m2, [pw_512] ; m2 = 512 + + mov r4d, %2 +.loopH: +%assign x 0 +%rep %1 / 8 + movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0] + pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4] + pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8] + pmaddubsw m4, m0 + pmaddubsw m6, m5, m1 + pmaddubsw m5, m0 + pmaddubsw m3, m1 + paddw m4, m6 + paddw m5, m3 + phaddw m4, m5 + pmulhrsw m4, m2 + packuswb m4, m4 + movh [r2 + x], m4 +%assign x x+8 +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + + IPFILTER_LUMA_PP_W8 8, 4 + IPFILTER_LUMA_PP_W8 8, 8 + IPFILTER_LUMA_PP_W8 8, 16 + IPFILTER_LUMA_PP_W8 8, 32 + IPFILTER_LUMA_PP_W8 16, 4 + IPFILTER_LUMA_PP_W8 16, 8 + IPFILTER_LUMA_PP_W8 16, 12 + IPFILTER_LUMA_PP_W8 16, 16 
+ IPFILTER_LUMA_PP_W8 16, 32 + IPFILTER_LUMA_PP_W8 16, 64 + IPFILTER_LUMA_PP_W8 24, 32 + IPFILTER_LUMA_PP_W8 32, 8 + IPFILTER_LUMA_PP_W8 32, 16 + IPFILTER_LUMA_PP_W8 32, 24 + IPFILTER_LUMA_PP_W8 32, 32 + IPFILTER_LUMA_PP_W8 32, 64 + IPFILTER_LUMA_PP_W8 48, 64 + IPFILTER_LUMA_PP_W8 64, 16 + IPFILTER_LUMA_PP_W8 64, 32 + IPFILTER_LUMA_PP_W8 64, 48 + IPFILTER_LUMA_PP_W8 64, 64 + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- + IPFILTER_LUMA 4, 4, ps + IPFILTER_LUMA 8, 8, ps + IPFILTER_LUMA 8, 4, ps + IPFILTER_LUMA 4, 8, ps + IPFILTER_LUMA 16, 16, ps + IPFILTER_LUMA 16, 8, ps + IPFILTER_LUMA 8, 16, ps + IPFILTER_LUMA 16, 12, ps + IPFILTER_LUMA 12, 16, ps + IPFILTER_LUMA 16, 4, ps + IPFILTER_LUMA 4, 16, ps + IPFILTER_LUMA 32, 32, ps + IPFILTER_LUMA 32, 16, ps + IPFILTER_LUMA 16, 32, ps + IPFILTER_LUMA 32, 24, ps + IPFILTER_LUMA 24, 32, ps + IPFILTER_LUMA 32, 8, ps + IPFILTER_LUMA 8, 32, ps + IPFILTER_LUMA 64, 64, ps + IPFILTER_LUMA 64, 32, ps + IPFILTER_LUMA 32, 64, ps + IPFILTER_LUMA 64, 48, ps + IPFILTER_LUMA 48, 64, ps + IPFILTER_LUMA 64, 16, ps + IPFILTER_LUMA 16, 64, ps + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_2xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + 
dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [h_tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + phaddw t0, t0 + psubw t0, t1 + movd [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_HORIZ_CHROMA_2xN 2, 4 + FILTER_HORIZ_CHROMA_2xN 2, 8 + + FILTER_HORIZ_CHROMA_2xN 2, 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_4xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [h_tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + phaddw t0, t0 + psubw t0, t1 + movlps [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + RET +%endmacro + + FILTER_HORIZ_CHROMA_4xN 4, 2 + FILTER_HORIZ_CHROMA_4xN 4, 4 + FILTER_HORIZ_CHROMA_4xN 4, 8 + FILTER_HORIZ_CHROMA_4xN 4, 16 + + FILTER_HORIZ_CHROMA_4xN 4, 32 + +%macro 
PROCESS_CHROMA_W6 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movh [dstq], %2 + pshufd %2, %2, 2 + movd [dstq + 8], %2 +%endmacro + +%macro PROCESS_CHROMA_W12 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 + movu %1, [srcq + 8] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %1, %1 + psubw %1, %3 + movh [dstq + 16], %1 +%endmacro + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [h_tab_Tm] + mova Tm1, [h_tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_HORIZ_CHROMA 6, 8 + FILTER_HORIZ_CHROMA 12, 16 + + FILTER_HORIZ_CHROMA 6, 16 + FILTER_HORIZ_CHROMA 12, 32 + +%macro PROCESS_CHROMA_W8 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 +%endmacro + 
+;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_8xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [h_tab_Tm] + mova Tm1, [h_tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W8 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_HORIZ_CHROMA_8xN 8, 2 + FILTER_HORIZ_CHROMA_8xN 8, 4 + FILTER_HORIZ_CHROMA_8xN 8, 6 + FILTER_HORIZ_CHROMA_8xN 8, 8 + FILTER_HORIZ_CHROMA_8xN 8, 16 + FILTER_HORIZ_CHROMA_8xN 8, 32 + + FILTER_HORIZ_CHROMA_8xN 8, 12 + FILTER_HORIZ_CHROMA_8xN 8, 64 + +%macro PROCESS_CHROMA_W16 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W24 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + 
psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq + 32], %2 +%endmacro + +%macro PROCESS_CHROMA_W32 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 24] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + 32], %2 + movu [dstq + 48], %4 +%endmacro + +%macro PROCESS_CHROMA_W16o 5 + movu %1, [srcq + %5] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + %5 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + %5 * 2], %2 + movu [dstq + %5 * 2 + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W48 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 +%endmacro + +%macro PROCESS_CHROMA_W64 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 +%endmacro + +;------------------------------------------------------------------------------------------------------------------------------ +; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
+;------------------------------------------------------------------------------------------------------------------------------ +%macro FILTER_HORIZ_CHROMA_WxN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride +%define coef2 m6 +%define Tm0 m5 +%define Tm1 m4 +%define t3 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [h_tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [h_tab_Tm] + mova Tm1, [h_tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2, t3 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_HORIZ_CHROMA_WxN 16, 4 + FILTER_HORIZ_CHROMA_WxN 16, 8 + FILTER_HORIZ_CHROMA_WxN 16, 12 + FILTER_HORIZ_CHROMA_WxN 16, 16 + FILTER_HORIZ_CHROMA_WxN 16, 32 + FILTER_HORIZ_CHROMA_WxN 24, 32 + FILTER_HORIZ_CHROMA_WxN 32, 8 + FILTER_HORIZ_CHROMA_WxN 32, 16 + FILTER_HORIZ_CHROMA_WxN 32, 24 + FILTER_HORIZ_CHROMA_WxN 32, 32 + + FILTER_HORIZ_CHROMA_WxN 16, 24 + FILTER_HORIZ_CHROMA_WxN 16, 64 + FILTER_HORIZ_CHROMA_WxN 24, 64 + FILTER_HORIZ_CHROMA_WxN 32, 48 + FILTER_HORIZ_CHROMA_WxN 32, 64 + + FILTER_HORIZ_CHROMA_WxN 64, 64 + FILTER_HORIZ_CHROMA_WxN 64, 32 + FILTER_HORIZ_CHROMA_WxN 64, 48 + FILTER_HORIZ_CHROMA_WxN 48, 64 + FILTER_HORIZ_CHROMA_WxN 64, 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_32x32, 4,6,8 + mov r4d, r4m + add 
r3d, r3d + dec r0 + + ; check isRowExt + cmp r5m, byte 0 + + lea r5, [h_tab_ChromaCoeff] + vpbroadcastw m0, [r5 + r4 * 4 + 0] + vpbroadcastw m1, [r5 + r4 * 4 + 2] + mova m7, [pw_2000] + + ; register map + ; m0 - interpolate coeff Low + ; m1 - interpolate coeff High + ; m7 - constant pw_2000 + mov r4d, 32 + je .loop + sub r0, r1 + add r4d, 3 + +.loop: + ; Row 0 + movu m2, [r0] + movu m3, [r0 + 1] + punpckhbw m4, m2, m3 + punpcklbw m2, m3 + pmaddubsw m4, m0 + pmaddubsw m2, m0 + + movu m3, [r0 + 2] + movu m5, [r0 + 3] + punpckhbw m6, m3, m5 + punpcklbw m3, m5 + pmaddubsw m6, m1 + pmaddubsw m3, m1 + + paddw m4, m6 + paddw m2, m3 + psubw m4, m7 + psubw m2, m7 + vperm2i128 m3, m2, m4, 0x20 + vperm2i128 m5, m2, m4, 0x31 + movu [r2], m3 + movu [r2 + mmsize], m5 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_16x16, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 16 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu 
[r2], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_16xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 32 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 12 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 8 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 4 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 24 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 64 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_32xN_AVX2 2 +INIT_YMM avx2 
+cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 24] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2 + 32], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16 + IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24 + IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8 + IPFILTER_CHROMA_PS_32xN_AVX2 32 , 64 + IPFILTER_CHROMA_PS_32xN_AVX2 32 , 48 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_4x4, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff 
+ ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + test r5d, r5d + je .label + sub r0 , r1 + +.label: + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, [pw_2000] + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 + movhps [r2 + r3], xm4 + + test r5d, r5d + jz .end + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + + ;Row 5-6 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 7 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, [pw_2000] + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end: + RET + +cglobal interp_4tap_horiz_ps_4x2, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + test r5d, r5d + je .label + sub r0 , r1 + +.label: + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + packssdw m3, m3 + psubw m3, [pw_2000] + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + + test r5d, r5d + jz .end + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + + ;Row 2-3 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 5 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 
9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, [pw_2000] + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end: + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +%macro IPFILTER_CHROMA_PS_4xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + mov r4, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + + +.loop: + sub r4d, 4 + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, [pw_2000] + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 + movhps [r2 + r3], xm4 + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + + test r4d, r4d + jnz .loop + test r5d, r5d + jz .end + + ;Row 5-6 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 7 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, [pw_2000] 
+ + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end: + RET +%endmacro + + IPFILTER_CHROMA_PS_4xN_AVX2 4 , 8 + IPFILTER_CHROMA_PS_4xN_AVX2 4 , 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_8x8, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + mov r6d, 4 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 1 + +.loop: + dec r6d + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + test r6d, r6d + jnz .loop + test r5d, r5d + je .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + packssdw m3, m3 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], xm3 +.end: + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_4x2, 4,6,4 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate 
coeff + ; m1 - shuffle order table + + ; Row 0-1 + movu xm2, [r0 - 1] + vinserti128 m2, m2, [r0 + r1 - 1], 1 + pshufb m2, m1 + pmaddubsw m2, m0 + pmaddwd m2, [pw_1] + + packssdw m2, m2 + pmulhrsw m2, [pw_512] + vextracti128 xm3, m2, 1 + packuswb xm2, xm3 + + movd [r2], xm2 + pextrd [r2+r3], xm2, 2 + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_32xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, %2 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + movu [r2], m3 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PP_32xN_AVX2 32, 16 + IPFILTER_CHROMA_PP_32xN_AVX2 32, 24 + IPFILTER_CHROMA_PP_32xN_AVX2 32, 8 + IPFILTER_CHROMA_PP_32xN_AVX2 32, 64 + IPFILTER_CHROMA_PP_32xN_AVX2 32, 48 + +;------------------------------------------------------------------------------------------------------------- +; void 
interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_8xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + movu m1, [h_tab_Tm] + vpbroadcastd m2, [pw_1] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + sub r0, 1 + mov r4d, %2 + +.loop: + sub r4d, 4 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + lea r0, [r0 + r1 * 2] + + ; Row 2 + vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, [pw_512] + + packuswb m3, m4 + mova m5, [interp_4tap_8x8_horiz_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movq [r2], xm4 + movhps [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1*2] + test r4d, r4d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16 + IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32 + IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4 + IPFILTER_CHROMA_PP_8xN_AVX2 8 , 64 + IPFILTER_CHROMA_PP_8xN_AVX2 8 , 12 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int 
coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_4xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vpbroadcastd m2, [pw_1] + vbroadcasti128 m1, [h_tab_Tm] + mov r4d, %2 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + +.loop: + sub r4d, 4 + ; Row 0-1 + movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + lea r2, [r2 + r3 * 2] + pextrd [r2], xm3, 1 + pextrd [r2+r3], xm3, 3 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + test r4d, r4d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8 + IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16 + +%macro IPFILTER_LUMA_PS_32xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_%1x%2, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r4d, %2 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_1] + mova m7, [h_interp8_hps_shuf] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_1 + + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 + add r4d, 7 + +.label: + lea r6, [pw_2000] +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 
7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 (col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + + movu [r2 + 32], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_LUMA_PS_32xN_AVX2 32 , 32 + IPFILTER_LUMA_PS_32xN_AVX2 32 , 16 + IPFILTER_LUMA_PS_32xN_AVX2 32 , 24 + IPFILTER_LUMA_PS_32xN_AVX2 32 , 8 + IPFILTER_LUMA_PS_32xN_AVX2 32 , 64 + +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r4d, 64 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + mova m7, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src)-r6 + add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) + +.label: + 
lea r6, [h_interp8_hps_shuf] +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 (col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2 + 32], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 32] + pshufb m4, m3, m6 ; row 0 (col 36 to 39) + pshufb m3, m1 ; row 0 (col 32 to 35) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 40] + pshufb m5, m4, m6 ;row 0 (col 44 to 47) + pshufb m4, m1 ;row 0 (col 40 to 43) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2 + 64], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_24x32, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, 
[h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + lea r5, [h_tab_Tm] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, 32 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [r5] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] + + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [r5] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + + packssdw m2, m2 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + + movu [r2], xm4 + movq [r2 + 16], xm5 + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_12x16, 4,6,8 + sub r0, 3 + mov r4d, r4m +%ifdef PIC + lea r5, [h_tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [h_tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [h_tab_LumaCoeff + r4 * 8 + 4] +%endif + movu m3, [h_tab_Tm + 16] + vpbroadcastd m7, [pw_1] + lea r5, [h_tab_Tm] + + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 + + mov r4d, 8 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ;first 8 element + pshufb m5, m4, m3 + pshufb m4, [r5] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 + + vbroadcasti128 m5, [r0 + 8] ; element 8 to 11 + pshufb m6, m5, m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + 
pmulhrsw m4, [pw_512] + + ;Row 1 + vbroadcasti128 m2, [r0 + r1] + pshufb m5, m2, m3 + pshufb m2, [r5] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 + + vbroadcasti128 m5, [r0 + r1 + 8] + pshufb m6, m5, m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + + movq [r2], xm4 + pextrd [r2+8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2+r3+8], xm5, 2 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_16xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, %2/2 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, 
m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8 + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32 + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12 + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4 + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 64 + IPFILTER_CHROMA_PP_16xN_AVX2 16 , 24 + +%macro IPFILTER_LUMA_PS_64xN_AVX2 1 +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_64x%1, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r4d, %1 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_1] + mova m7, [h_interp8_hps_shuf] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] + sub r0, r6 ; r0(src)-r6 + add r4d, 7 ; blkheight += N - 1 + +.label: + lea r6, [pw_2000] +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 
(col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 32], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 32] + pshufb m4, m3, m6 ; row 0 (col 36 to 39) + pshufb m3, m1 ; row 0 (col 32 to 35) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 40] + pshufb m5, m4, m6 ;row 0 (col 44 to 47) + pshufb m4, m1 ;row 0 (col 40 to 43) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 64], m3 ;row 0 + vbroadcasti128 m3, [r0 + 48] + pshufb m4, m3, m6 ; row 0 (col 52 to 55) + pshufb m3, m1 ; row 0 (col 48 to 51) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 56] + pshufb m5, m4, m6 ;row 0 (col 60 to 63) + pshufb m4, m1 ;row 0 (col 56 to 59) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 96], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_LUMA_PS_64xN_AVX2 64 + IPFILTER_LUMA_PS_64xN_AVX2 48 + IPFILTER_LUMA_PS_64xN_AVX2 32 + IPFILTER_LUMA_PS_64xN_AVX2 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 + mov r4d, r4m 
+ mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + mov r6d, %1/2 + dec r0 + test r5d, r5d + jz .loop + sub r0 , r1 + inc r6d + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r6d + jnz .loop + test r5d, r5d + jz .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], xm3 +.end: + RET +%endmacro + + IPFILTER_CHROMA_PS_8xN_AVX2 2 + IPFILTER_CHROMA_PS_8xN_AVX2 32 + IPFILTER_CHROMA_PS_8xN_AVX2 16 + IPFILTER_CHROMA_PS_8xN_AVX2 6 + IPFILTER_CHROMA_PS_8xN_AVX2 4 + IPFILTER_CHROMA_PS_8xN_AVX2 12 + IPFILTER_CHROMA_PS_8xN_AVX2 64 + +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova xm3, [pw_2000] + dec r0 + test r5d, r5d + jz .label + sub r0, r1 + +.label: + lea r6, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r6] + + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hpp_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, m1, 1 + packssdw xm1, xm2 + psubw xm1, xm3 + + lea r4, [r3 * 3] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 + + 
test r5d, r5d + jz .end + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hpp_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, m1, 1 + packssdw xm1, xm2 + psubw xm1, xm3 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 +.end: + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + vbroadcasti128 m6, [pw_2000] + test r5d, r5d + jz .label + sub r0, r1 + +.label: + mova m4, [interp4_hpp_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] ;row 0 + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + psubw m1, m6 + + lea r4, [r3 * 3] + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm1, 2 + pextrd [r2 + r3], xm1, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 + test r5d, r5d + jz .end + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + movq xm1, [r0] ;row 0 + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + vinserti128 m1, m1, xm2, 1 + pshufb m1, m4 + pmaddubsw m1, m0 + pmaddwd m1, m5 + packssdw m1, m1 + psubw m1, m6 + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 +.end: + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_12x16, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd 
m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 8 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + pextrd [r2+8], xm3, 2 + movq [r2 + r3], xm4 + pextrd [r2 + r3 + 8],xm4, 2 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_24x32, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 32 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, 
m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movq [r2 + 16], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_6x8, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + mov r6d, 8/2 + dec r0 + test r5d, r5d + jz .loop + sub r0 , r1 + inc r6d + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movq [r2], xm3 + pextrd [r2 + 8], xm3, 2 + movq [r2 + r3], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r6d + jnz .loop + test r5d, r5d + jz .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 + psubw m3, m5 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movd [r2+8], xm4 +.end: + RET + +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_12x16, 6, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + add r3d, r3d + 
vbroadcasti128 m2, [pw_2000] + mov r4d, 16 + vbroadcasti128 m7, [pw_1] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - pw_2000 + + mova m5, [h_interp8_hps_shuf] + sub r0, 3 + test r5d, r5d + jz .loop + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src)-r6 + add r4d, 7 +.loop: + + ; Row 0 + + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 + pshufb m3, m1 ; shuffled based on the col order tab_Lm + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m7 + packssdw m4, m4 + + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vpermd m3, m5, m3 + psubw m3, m2 + + vextracti128 xm4, m3, 1 + movu [r2], xm3 ;row 0 + movq [r2 + 16], xm4 ;row 1 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_24x32, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r4d, 32 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + vbroadcasti128 m7, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src)-r6 + add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) + +.label: + lea r6, [h_interp8_hps_shuf] +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 
2 1 0] + pshufb m5, m4, m6 ;row 1 (col 4 to 7) + pshufb m4, m1 ;row 1 (col 0 to 3) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m4, [r6] + vpermd m3, m4, m3 + psubw m3, m2 + movu [r2 + 32], xm3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_24x32, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 32 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 
+ psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2 + 32], xm3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------- +;macro FILTER_H8_W8_16N_AVX2 +;----------------------------------------------------------------------------------------------------------------------- +%macro FILTER_H8_W8_16N_AVX2 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m6 ;row 1 (col 4 to 7) + pshufb m4, m1 ;row 1 (col 0 to 3) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 ; all rows and col completed. 
+ + mova m5, [h_interp8_hps_shuf] + vpermd m3, m5, m3 + psubw m3, m8 + + vextracti128 xm4, m3, 1 + mova [r4], xm3 + mova [r4 + 16], xm4 + %endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_64xN_AVX2 1 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_64x%1, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, %1 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + packuswb m3, m4 + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 36] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 48] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 52] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + packuswb m3, m4 + vpermq m3, m3, 11011000b + movu [r2 + 32], m3 + + add r2, r3 + add r0, r1 + 
dec r4d + jnz .loop + RET +%endmacro + + IPFILTER_CHROMA_PP_64xN_AVX2 64 + IPFILTER_CHROMA_PP_64xN_AVX2 32 + IPFILTER_CHROMA_PP_64xN_AVX2 48 + IPFILTER_CHROMA_PP_64xN_AVX2 16 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_48x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_48x64, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 64 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, q3120 + + movu [r2], m3 + + vbroadcasti128 m3, [r0 + mmsize] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + mmsize + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + mmsize + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + mmsize + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, q3120 + movu [r2 + mmsize], xm3 + + add 
r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_48x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; + +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_48x64, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 64 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, q3120 + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, q3120 + movu [r2 + 32], m3 + + vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 40] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, q3120 + movu [r2 + 64], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET + 
+;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_24x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_24x64, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [h_tab_Tm] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 64 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, q3120 + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 + psubw m3, m5 + vpermq m3, m3, q3120 + movu [r2 + 32], xm3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d + +%ifdef PIC + lea r6, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + vbroadcasti128 m6, [pw_2000] + test r5d, r5d + jz .label + sub r0, r1 + +.label: + mova m4, [interp4_hps_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] ;row 0 + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + 
lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + psubw m1, m6 + + lea r4, [r3 * 3] + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm1, 2 + pextrd [r2 + r3], xm1, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + psubw m1, m6 + + lea r4, [r3 * 3] + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm1, 2 + pextrd [r2 + r3], xm1, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 + + test r5d, r5d + jz .end + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + vinserti128 m1, m1, xm2, 1 + pshufb m1, m4 + pmaddubsw m1, m0 + pmaddwd m1, m5 + packssdw m1, m1 + psubw m1, m6 + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 +.end: + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_6x16, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [h_tab_Tm] + mova m2, [pw_1] + mova m6, [pw_512] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + ; 
register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 +%rep 4 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + movd [r2], xm3 + pextrw [r2 + 4], xm4, 0 + pextrd [r2 + r3], xm3, 1 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm3, 2 + pextrw [r2 + r3 * 2 + 4], xm4, 4 + pextrd [r2 + r5], xm3, 3 + pextrw [r2 + r5 + 4], xm4, 6 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] +%endrep + RET + +;----------------------------------------------------------------------------- +; void interp_8tap_hv_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_hv_pp_16x16, 4, 10, 15, 0-31*32 +%define stk_buf1 rsp + mov r4d, r4m + mov r5d, r5m +%ifdef PIC + lea r6, [h_tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [h_tab_LumaCoeff + r4 * 8] +%endif + + xor r6, r6 + mov r4, rsp + mova m6, [h_tab_Lm + 32] + mova m1, [h_tab_Lm] + mov r8, 16 ;height + vbroadcasti128 m8, [pw_2000] + vbroadcasti128 m2, [pw_1] + sub r0, 3 + lea r7, [r1 * 3] ; r7 = (N / 2 - 1) * srcStride + sub r0, r7 ; r0(src)-r7 + add r8, 7 + +.loopH: + FILTER_H8_W8_16N_AVX2 + add r0, r1 + add r4, 32 + inc r6 + cmp r6, 16+7 + jnz .loopH + +; vertical phase + xor r6, r6 + xor r1, r1 
+.loopV: + +;load necessary variables + mov r4d, r5d ;coeff here for vertical is r5m + shl r4d, 7 + mov r1d, 16 + add r1d, r1d + + ; load intermedia buffer + mov r0, stk_buf1 + + ; register mapping + ; r0 - src + ; r5 - coeff + ; r6 - loop_i + +; load coeff table +%ifdef PIC + lea r5, [h_pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [h_pw_LumaCoeffVer + r4] +%endif + + lea r4, [r1*3] + mova m14, [h_pd_526336] + lea r6, [r3 * 3] + mov r9d, 16 / 8 + +.loopW: + PROCESS_LUMA_AVX2_W8_16R sp + add r2, 8 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_12x32, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 16 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + pextrd [r2+8], xm3, 2 + movq [r2 + r3], xm4 + pextrd [r2 + r3 + 8],xm4, 2 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_24x64, 4,6,7 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + 
r4 * 4] +%endif + + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + mov r4d, 64 + +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vpermq m3, m3, 11011000b + + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movq [r2 + 16], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6 + mov r4d, r4m + +%ifdef PIC + lea r5, [h_tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [h_tab_ChromaCoeff + r4 * 4] +%endif + + mova m4, [interp4_hpp_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 4 + pextrw [r2 + r4], xm1, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm1, 2 + pextrw [r2 + r3], xm1, 3 + pextrw [r2 + r3 * 2], xm1, 6 + pextrw [r2 + r4], xm1, 7 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + 
movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 4 + pextrw [r2 + r4], xm1, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm1, 2 + pextrw [r2 + r3], xm1, 3 + pextrw [r2 + r3 * 2], xm1, 6 + pextrw [r2 + r4], xm1, 7 + RET diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index dc3f04add0..223917b0f3 100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -39,11 +39,6 @@ const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 dd 2, 3, 3, 4, 4, 5, 5, 6 -const pb_8tap_hps_0, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 - times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10 - times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12 - times 2 db 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14 - const tab_Lm, db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 @@ -196,26 +191,6 @@ const pw_LumaCoeffVer, times 8 dw 0, 0 times 8 dw 58, -10 times 8 dw 4, -1 -const pb_LumaCoeffVer, times 16 db 0, 0 - times 16 db 0, 64 - times 16 db 0, 0 - times 16 db 0, 0 - - times 16 db -1, 4 - times 16 db -10, 58 - times 16 db 17, -5 - times 16 db 1, 0 - - times 16 db -1, 4 - times 16 db -11, 40 - times 16 db 40, -11 - times 16 db 4, -1 - - times 16 db 0, 1 - times 16 db -5, 17 - times 16 db 58, -10 - times 16 db 4, -1 - const tab_LumaCoeffVer, times 8 db 0, 0 times 8 db 0, 64 times 8 db 0, 0 @@ -282,18 +257,8 @@ 
const tab_ChromaCoeffVer_32, times 16 db 0, 64 const tab_c_64_n64, times 8 db 64, -64 -const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 - -const interp4_horiz_shuf1, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - -const interp4_hpp_shuf, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 - const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 -ALIGN 32 -interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 - SECTION .text cextern pb_128 @@ -303,462 +268,6 @@ cextern pw_512 cextern pw_2000 cextern pw_8192 -%macro FILTER_H4_w2_2_sse2 0 - pxor m3, m3 - movd m0, [srcq - 1] - movd m2, [srcq] - punpckldq m0, m2 - punpcklbw m0, m3 - movd m1, [srcq + srcstrideq - 1] - movd m2, [srcq + srcstrideq] - punpckldq m1, m2 - punpcklbw m1, m3 - pmaddwd m0, m4 - pmaddwd m1, m4 - packssdw m0, m1 - pshuflw m1, m0, q2301 - pshufhw m1, m1, q2301 - paddw m0, m1 - psrld m0, 16 - packssdw m0, m0 - paddw m0, m5 - psraw m0, 6 - packuswb m0, m0 - movd r4d, m0 - mov [dstq], r4w - shr r4, 16 - mov [dstq + dststrideq], r4w -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_H4_W2xN_sse3 1 -INIT_XMM sse3 -cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride - mov r4d, r4m - mova m5, [pw_32] - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movddup m4, [r5 + r4 * 8] -%else - movddup m4, [tabw_ChromaCoeff + r4 * 8] -%endif - -%assign x 1 -%rep %1/2 - FILTER_H4_w2_2_sse2 -%if x < %1/2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endif -%assign x x+1 -%endrep - - RET - -%endmacro - - FILTER_H4_W2xN_sse3 4 - FILTER_H4_W2xN_sse3 8 - FILTER_H4_W2xN_sse3 16 - -%macro 
FILTER_H4_w4_2_sse2 0 - pxor m5, m5 - movd m0, [srcq - 1] - movd m6, [srcq] - punpckldq m0, m6 - punpcklbw m0, m5 - movd m1, [srcq + 1] - movd m6, [srcq + 2] - punpckldq m1, m6 - punpcklbw m1, m5 - movd m2, [srcq + srcstrideq - 1] - movd m6, [srcq + srcstrideq] - punpckldq m2, m6 - punpcklbw m2, m5 - movd m3, [srcq + srcstrideq + 1] - movd m6, [srcq + srcstrideq + 2] - punpckldq m3, m6 - punpcklbw m3, m5 - pmaddwd m0, m4 - pmaddwd m1, m4 - pmaddwd m2, m4 - pmaddwd m3, m4 - packssdw m0, m1 - packssdw m2, m3 - pshuflw m1, m0, q2301 - pshufhw m1, m1, q2301 - pshuflw m3, m2, q2301 - pshufhw m3, m3, q2301 - paddw m0, m1 - paddw m2, m3 - psrld m0, 16 - psrld m2, 16 - packssdw m0, m2 - paddw m0, m7 - psraw m0, 6 - packuswb m0, m2 - movd [dstq], m0 - psrldq m0, 4 - movd [dstq + dststrideq], m0 -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_H4_W4xN_sse3 1 -INIT_XMM sse3 -cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride - mov r4d, r4m - mova m7, [pw_32] - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movddup m4, [r5 + r4 * 8] -%else - movddup m4, [tabw_ChromaCoeff + r4 * 8] -%endif - -%assign x 1 -%rep %1/2 - FILTER_H4_w4_2_sse2 -%if x < %1/2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endif -%assign x x+1 -%endrep - - RET - -%endmacro - - FILTER_H4_W4xN_sse3 2 - FILTER_H4_W4xN_sse3 4 - FILTER_H4_W4xN_sse3 8 - FILTER_H4_W4xN_sse3 16 - FILTER_H4_W4xN_sse3 32 - -%macro FILTER_H4_w6_sse2 0 - pxor m4, m4 - movh m0, [srcq - 1] - movh m5, [srcq] - punpckldq m0, m5 - movhlps m2, m0 - punpcklbw m0, m4 - punpcklbw m2, m4 - movd m1, [srcq + 1] - movd m5, [srcq + 2] - punpckldq m1, m5 - punpcklbw m1, m4 - pmaddwd m0, m6 - pmaddwd m1, m6 - pmaddwd m2, m6 - packssdw m0, m1 - packssdw m2, 
m2 - pshuflw m1, m0, q2301 - pshufhw m1, m1, q2301 - pshuflw m3, m2, q2301 - paddw m0, m1 - paddw m2, m3 - psrld m0, 16 - psrld m2, 16 - packssdw m0, m2 - paddw m0, m7 - psraw m0, 6 - packuswb m0, m0 - movd [dstq], m0 - pextrw r4d, m0, 2 - mov [dstq + 4], r4w -%endmacro - -%macro FILH4W8_sse2 1 - movh m0, [srcq - 1 + %1] - movh m5, [srcq + %1] - punpckldq m0, m5 - movhlps m2, m0 - punpcklbw m0, m4 - punpcklbw m2, m4 - movh m1, [srcq + 1 + %1] - movh m5, [srcq + 2 + %1] - punpckldq m1, m5 - movhlps m3, m1 - punpcklbw m1, m4 - punpcklbw m3, m4 - pmaddwd m0, m6 - pmaddwd m1, m6 - pmaddwd m2, m6 - pmaddwd m3, m6 - packssdw m0, m1 - packssdw m2, m3 - pshuflw m1, m0, q2301 - pshufhw m1, m1, q2301 - pshuflw m3, m2, q2301 - pshufhw m3, m3, q2301 - paddw m0, m1 - paddw m2, m3 - psrld m0, 16 - psrld m2, 16 - packssdw m0, m2 - paddw m0, m7 - psraw m0, 6 - packuswb m0, m0 - movh [dstq + %1], m0 -%endmacro - -%macro FILTER_H4_w8_sse2 0 - FILH4W8_sse2 0 -%endmacro - -%macro FILTER_H4_w12_sse2 0 - FILH4W8_sse2 0 - movd m1, [srcq - 1 + 8] - movd m3, [srcq + 8] - punpckldq m1, m3 - punpcklbw m1, m4 - movd m2, [srcq + 1 + 8] - movd m3, [srcq + 2 + 8] - punpckldq m2, m3 - punpcklbw m2, m4 - pmaddwd m1, m6 - pmaddwd m2, m6 - packssdw m1, m2 - pshuflw m2, m1, q2301 - pshufhw m2, m2, q2301 - paddw m1, m2 - psrld m1, 16 - packssdw m1, m1 - paddw m1, m7 - psraw m1, 6 - packuswb m1, m1 - movd [dstq + 8], m1 -%endmacro - -%macro FILTER_H4_w16_sse2 0 - FILH4W8_sse2 0 - FILH4W8_sse2 8 -%endmacro - -%macro FILTER_H4_w24_sse2 0 - FILH4W8_sse2 0 - FILH4W8_sse2 8 - FILH4W8_sse2 16 -%endmacro - -%macro FILTER_H4_w32_sse2 0 - FILH4W8_sse2 0 - FILH4W8_sse2 8 - FILH4W8_sse2 16 - FILH4W8_sse2 24 -%endmacro - -%macro FILTER_H4_w48_sse2 0 - FILH4W8_sse2 0 - FILH4W8_sse2 8 - FILH4W8_sse2 16 - FILH4W8_sse2 24 - FILH4W8_sse2 32 - FILH4W8_sse2 40 -%endmacro - -%macro FILTER_H4_w64_sse2 0 - FILH4W8_sse2 0 - FILH4W8_sse2 8 - FILH4W8_sse2 16 - FILH4W8_sse2 24 - FILH4W8_sse2 32 - FILH4W8_sse2 40 - FILH4W8_sse2 
48 - FILH4W8_sse2 56 -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_sse3 2 -INIT_XMM sse3 -cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride - mov r4d, r4m - mova m7, [pw_32] - pxor m4, m4 - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movddup m6, [r5 + r4 * 8] -%else - movddup m6, [tabw_ChromaCoeff + r4 * 8] -%endif - -%assign x 1 -%rep %2 - FILTER_H4_w%1_sse2 -%if x < %2 - add srcq, srcstrideq - add dstq, dststrideq -%endif -%assign x x+1 -%endrep - - RET - -%endmacro - - IPFILTER_CHROMA_sse3 6, 8 - IPFILTER_CHROMA_sse3 8, 2 - IPFILTER_CHROMA_sse3 8, 4 - IPFILTER_CHROMA_sse3 8, 6 - IPFILTER_CHROMA_sse3 8, 8 - IPFILTER_CHROMA_sse3 8, 16 - IPFILTER_CHROMA_sse3 8, 32 - IPFILTER_CHROMA_sse3 12, 16 - - IPFILTER_CHROMA_sse3 6, 16 - IPFILTER_CHROMA_sse3 8, 12 - IPFILTER_CHROMA_sse3 8, 64 - IPFILTER_CHROMA_sse3 12, 32 - - IPFILTER_CHROMA_sse3 16, 4 - IPFILTER_CHROMA_sse3 16, 8 - IPFILTER_CHROMA_sse3 16, 12 - IPFILTER_CHROMA_sse3 16, 16 - IPFILTER_CHROMA_sse3 16, 32 - IPFILTER_CHROMA_sse3 32, 8 - IPFILTER_CHROMA_sse3 32, 16 - IPFILTER_CHROMA_sse3 32, 24 - IPFILTER_CHROMA_sse3 24, 32 - IPFILTER_CHROMA_sse3 32, 32 - - IPFILTER_CHROMA_sse3 16, 24 - IPFILTER_CHROMA_sse3 16, 64 - IPFILTER_CHROMA_sse3 32, 48 - IPFILTER_CHROMA_sse3 24, 64 - IPFILTER_CHROMA_sse3 32, 64 - - IPFILTER_CHROMA_sse3 64, 64 - IPFILTER_CHROMA_sse3 64, 32 - IPFILTER_CHROMA_sse3 64, 48 - IPFILTER_CHROMA_sse3 48, 64 - IPFILTER_CHROMA_sse3 64, 16 - -%macro FILTER_2 2 - movd m3, [srcq + %1] - movd m4, [srcq + 1 + %1] - punpckldq m3, m4 - punpcklbw m3, m0 - pmaddwd m3, m1 - packssdw m3, m3 - pshuflw m4, m3, q2301 - paddw m3, m4 - psrldq m3, 2 - psubw m3, m2 - movd [dstq + %2], m3 -%endmacro - -%macro FILTER_4 2 - movd m3, [srcq 
+ %1] - movd m4, [srcq + 1 + %1] - punpckldq m3, m4 - punpcklbw m3, m0 - pmaddwd m3, m1 - movd m4, [srcq + 2 + %1] - movd m5, [srcq + 3 + %1] - punpckldq m4, m5 - punpcklbw m4, m0 - pmaddwd m4, m1 - packssdw m3, m4 - pshuflw m4, m3, q2301 - pshufhw m4, m4, q2301 - paddw m3, m4 - psrldq m3, 2 - pshufd m3, m3, q3120 - psubw m3, m2 - movh [dstq + %2], m3 -%endmacro - -%macro FILTER_4TAP_HPS_sse3 2 -INIT_XMM sse3 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride - mov r4d, r4m - add dststrided, dststrided - mova m2, [pw_2000] - pxor m0, m0 - -%ifdef PIC - lea r6, [tabw_ChromaCoeff] - movddup m1, [r6 + r4 * 8] -%else - movddup m1, [tabw_ChromaCoeff + r4 * 8] -%endif - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: -%assign x -1 -%assign y 0 -%rep %1/4 - FILTER_4 x,y -%assign x x+4 -%assign y y+8 -%endrep -%rep (%1 % 4)/2 - FILTER_2 x,y -%endrep - add srcq, srcstrideq - add dstq, dststrideq - - dec r4d - jnz .loopH - RET - -%endmacro - - FILTER_4TAP_HPS_sse3 2, 4 - FILTER_4TAP_HPS_sse3 2, 8 - FILTER_4TAP_HPS_sse3 2, 16 - FILTER_4TAP_HPS_sse3 4, 2 - FILTER_4TAP_HPS_sse3 4, 4 - FILTER_4TAP_HPS_sse3 4, 8 - FILTER_4TAP_HPS_sse3 4, 16 - FILTER_4TAP_HPS_sse3 4, 32 - FILTER_4TAP_HPS_sse3 6, 8 - FILTER_4TAP_HPS_sse3 6, 16 - FILTER_4TAP_HPS_sse3 8, 2 - FILTER_4TAP_HPS_sse3 8, 4 - FILTER_4TAP_HPS_sse3 8, 6 - FILTER_4TAP_HPS_sse3 8, 8 - FILTER_4TAP_HPS_sse3 8, 12 - FILTER_4TAP_HPS_sse3 8, 16 - FILTER_4TAP_HPS_sse3 8, 32 - FILTER_4TAP_HPS_sse3 8, 64 - FILTER_4TAP_HPS_sse3 12, 16 - FILTER_4TAP_HPS_sse3 12, 32 - FILTER_4TAP_HPS_sse3 16, 4 - FILTER_4TAP_HPS_sse3 16, 8 - FILTER_4TAP_HPS_sse3 16, 12 - FILTER_4TAP_HPS_sse3 16, 16 - FILTER_4TAP_HPS_sse3 16, 24 - FILTER_4TAP_HPS_sse3 16, 32 - FILTER_4TAP_HPS_sse3 16, 64 - FILTER_4TAP_HPS_sse3 24, 32 - FILTER_4TAP_HPS_sse3 24, 64 - FILTER_4TAP_HPS_sse3 32, 8 - FILTER_4TAP_HPS_sse3 32, 16 - FILTER_4TAP_HPS_sse3 32, 24 - FILTER_4TAP_HPS_sse3 32, 32 - FILTER_4TAP_HPS_sse3 
32, 48 - FILTER_4TAP_HPS_sse3 32, 64 - FILTER_4TAP_HPS_sse3 48, 64 - FILTER_4TAP_HPS_sse3 64, 16 - FILTER_4TAP_HPS_sse3 64, 32 - FILTER_4TAP_HPS_sse3 64, 48 - FILTER_4TAP_HPS_sse3 64, 64 - %macro FILTER_H8_W8_sse2 0 movh m1, [r0 + x - 3] movh m4, [r0 + x - 2] @@ -3330,2543 +2839,1125 @@ RET FILTER_PIX_TO_SHORT_sse2 64, 48 FILTER_PIX_TO_SHORT_sse2 64, 64 -%macro FILTER_H4_w2_2 3 - movh %2, [srcq - 1] - pshufb %2, %2, Tm0 - movh %1, [srcq + srcstrideq - 1] - pshufb %1, %1, Tm0 - punpcklqdq %2, %1 - pmaddubsw %2, coef2 - phaddw %2, %2 - pmulhrsw %2, %3 +%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst + movu %1, %7 + pshufb %2, %1, [tab_Lm + 0] + pmaddubsw %2, %5 + pshufb %3, %1, [tab_Lm + 16] + pmaddubsw %3, %5 + phaddw %2, %3 + pshufb %4, %1, [tab_Lm + 32] + pmaddubsw %4, %5 + pshufb %1, %1, [tab_Lm + 48] + pmaddubsw %1, %5 + phaddw %4, %1 + phaddw %2, %4 + %if %0 == 8 + pmulhrsw %2, %6 packuswb %2, %2 - movd r4d, %2 - mov [dstq], r4w - shr r4, 16 - mov [dstq + dststrideq], r4w + movh %8, %2 + %endif %endmacro - ;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; Interpolate HV ;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif +%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] + mova %5, [r0 + (%6 + 0) * 16] + mova %1, [r0 + (%6 + 1) * 16] + mova %2, [r0 + (%6 + 2) * 16] + punpcklwd %3, %5, %1 + punpckhwd %5, %1 + pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 + pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] + punpcklwd %4, %1, %2 + 
punpckhwd %1, %2 + pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 + pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] +%endmacro ; FILTER_HV8_START - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] +%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] + mova %8, [r0 + (%9 + 0) * 16] + mova %1, [r0 + (%9 + 1) * 16] + punpcklwd %7, %2, %8 + punpckhwd %2, %8 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd %2, [r5 + %10 * 16] + paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 + paddd %5, %2 ; R0 = H[0+1+2+3] + punpcklwd %7, %8, %1 + punpckhwd %8, %1 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd %8, [r5 + %10 * 16] + paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 + paddd %6, %8 ; R1 = H[1+2+3+4] +%endmacro ; FILTER_HV8_MID -%rep 2 - FILTER_H4_w2_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endrep +; Round and Saturate +%macro FILTER_HV8_END 4 ; output in [1, 3] + paddd %1, [pd_526336] + paddd %2, [pd_526336] + paddd %3, [pd_526336] + paddd %4, [pd_526336] + psrad %1, 12 + psrad %2, 12 + psrad %3, 12 + psrad %4, 12 + packssdw %1, %2 + packssdw %3, %4 - RET + ; TODO: is merge better? 
I think this way is short dependency link + packuswb %1, %3 +%endmacro ; FILTER_HV8_END ;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) ;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 +INIT_XMM ssse3 +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 +%define coef m7 +%define stk_buf rsp mov r4d, r4m + mov r5d, r5m %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] + lea r6, [tab_LumaCoeff] + movh coef, [r6 + r4 * 8] %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + movh coef, [tab_LumaCoeff + r4 * 8] %endif + punpcklqdq coef, coef - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] + ; move to row -3 + lea r6, [r1 + r1 * 2] + sub r0, r6 -%rep 4 - FILTER_H4_w2_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endrep + xor r6, r6 + mov r4, rsp - RET +.loopH: + FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3] + psubw m1, [pw_2000] + mova [r4], m1 -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 + add r0, r1 + add r4, 16 + inc r6 + cmp r6, 8+7 + jnz .loopH - mov r4d, r4m + ; ready to phase V + ; Here all of mN is free -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd 
coef2, [r5 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif + ; load coeff table + shl r5, 6 + lea r6, [tab_LumaCoeffV] + lea r5, [r5 + r6] - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] + ; load intermedia buffer + mov r0, stk_buf - mov r5d, 16/2 + ; register mapping + ; r0 - src + ; r5 - coeff + ; r6 - loop_i -.loop: - FILTER_H4_w2_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] - dec r5d - jnz .loop + ; let's go + xor r6, r6 - RET + ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache +.loopV: -%macro FILTER_H4_w4_2 3 - movh %2, [srcq - 1] - pshufb %2, %2, Tm0 - pmaddubsw %2, coef2 - movh %1, [srcq + srcstrideq - 1] - pshufb %1, %1, Tm0 - pmaddubsw %1, coef2 - phaddw %2, %1 - pmulhrsw %2, %3 - packuswb %2, %2 - movd [dstq], %2 - palignr %2, %2, 4 - movd [dstq + dststrideq], %2 -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - mov r4d, r4m + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 + FILTER_HV8_END m3, m0, m4, m1 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif + movh [r2], m3 + movhps [r2 + r3], m3 - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] + lea r0, [r0 + 16 * 2] + lea r2, [r2 + r3 * 2] - FILTER_H4_w4_2 t0, t1, t2 + inc r6 + cmp r6, 8/2 + jnz .loopV RET 
;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) ;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - +INIT_XMM sse3 +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 mov r4d, r4m + mov r5d, r5m + add r4d, r4d + pxor m6, m6 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] + lea r6, [tabw_LumaCoeff] + mova m3, [r6 + r4 * 8] %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + mova m3, [tabw_LumaCoeff + r4 * 8] %endif - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] - -%rep 2 - FILTER_H4_w4_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endrep - - RET + ; move to row -3 + lea r6, [r1 + r1 * 2] + sub r0, r6 -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 + mov r4, rsp - mov r4d, r4m +%assign x 0 ;needed for FILTER_H8_W8_sse2 macro +%assign y 1 +%rep 15 + FILTER_H8_W8_sse2 + psubw m1, [pw_2000] + mova [r4], m1 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] +%if y < 15 + add r0, r1 + add r4, 16 %endif - - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] - -%rep 4 - FILTER_H4_w4_2 t0, t1, 
t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] +%assign y y+1 %endrep - RET + ; ready to phase V + ; Here all of mN is free -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 + ; load coeff table + shl r5, 6 + lea r6, [tab_LumaCoeffV] + lea r5, [r5 + r6] - mov r4d, r4m + ; load intermedia buffer + mov r0, rsp -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif + ; register mapping + ; r0 - src + ; r5 - coeff - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] + ; let's go +%assign y 1 +%rep 4 + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 + FILTER_HV8_END m3, m0, m4, m1 -%rep 8 - FILTER_H4_w4_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] -%endrep + movh [r2], m3 + movhps [r2 + r3], m3 +%if y < 4 + lea r0, [r0 + 16 * 2] + lea r2, [r2 + r3 * 2] +%endif +%assign y y+1 +%endrep RET ;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride -%define coef2 m4 -%define Tm0 m3 
-%define t2 m2 -%define t1 m1 -%define t0 m0 +cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 - mov r4d, r4m + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif + lea r4, [r1 * 3] + lea r5, [r0 + 4 * r1] + pshufb m0, [tab_Cm] + mova m1, [pw_512] - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] - mov r5d, 32/2 + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 -.loop: - FILTER_H4_w4_2 t0, t1, t2 - lea srcq, [srcq + srcstrideq * 2] - lea dstq, [dstq + dststrideq * 2] - dec r5d - jnz .loop + pmaddubsw m2, m0 - RET + movd m6, [r5] -ALIGN 32 -const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + pmaddubsw m3, m0 -%macro FILTER_H4_w6 3 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - pmulhrsw %2, %3 - packuswb %2, %2 - movd [dstq], %2 - pextrw [dstq + 4], %2, 2 -%endmacro + phaddw m2, m3 -%macro FILTER_H4_w8 3 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - pmulhrsw %2, %3 - packuswb %2, %2 - movh [dstq], %2 -%endmacro + pmulhrsw m2, m1 -%macro FILTER_H4_w12 3 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - pmulhrsw %2, %3 - movu %1, [srcq - 1 + 8] - pshufb %1, %1, Tm0 - pmaddubsw %1, coef2 - phaddw %1, %1 - pmulhrsw %1, %3 - packuswb %2, %1 - movh [dstq], %2 - pextrd [dstq + 8], %2, 2 -%endmacro + movd m7, [r5 + r1] -%macro FILTER_H4_w16 4 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq - 1 + 8] - 
pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - pmulhrsw %2, %3 - pmulhrsw %4, %3 - packuswb %2, %4 - movu [dstq], %2 -%endmacro + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 -%macro FILTER_H4_w24 4 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq - 1 + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - pmulhrsw %2, %3 - pmulhrsw %4, %3 - packuswb %2, %4 - movu [dstq], %2 - movu %1, [srcq - 1 + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - pmulhrsw %2, %3 - packuswb %2, %2 - movh [dstq + 16], %2 -%endmacro + pmaddubsw m4, m0 -%macro FILTER_H4_w32 4 - movu %1, [srcq - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq - 1 + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - pmulhrsw %2, %3 - pmulhrsw %4, %3 - packuswb %2, %4 - movu [dstq], %2 - movu %1, [srcq - 1 + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq - 1 + 24] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - pmulhrsw %2, %3 - pmulhrsw %4, %3 - packuswb %2, %4 - movu [dstq + 16], %2 -%endmacro + movd m3, [r5 + 2 * r1] -%macro FILTER_H4_w16o 5 - movu %1, [srcq + %5 - 1] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + %5 - 1 + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - pmulhrsw %2, %3 - pmulhrsw %4, %3 - packuswb %2, %4 - movu [dstq + %5], %2 -%endmacro + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 -%macro FILTER_H4_w48 4 - 
FILTER_H4_w16o %1, %2, %3, %4, 0 - FILTER_H4_w16o %1, %2, %3, %4, 16 - FILTER_H4_w16o %1, %2, %3, %4, 32 -%endmacro + pmaddubsw m5, m0 -%macro FILTER_H4_w64 4 - FILTER_H4_w16o %1, %2, %3, %4, 0 - FILTER_H4_w16o %1, %2, %3, %4, 16 - FILTER_H4_w16o %1, %2, %3, %4, 32 - FILTER_H4_w16o %1, %2, %3, %4, 48 -%endmacro + phaddw m4, m5 -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro IPFILTER_CHROMA 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride -%define coef2 m5 -%define Tm0 m4 -%define Tm1 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 + pmulhrsw m4, m1 + packuswb m2, m4 - mov r4d, r4m + pextrw [r2], m2, 0 + pextrw [r2 + r3], m2, 2 + lea r2, [r2 + 2 * r3] + pextrw [r2], m2, 4 + pextrw [r2 + r3], m2, 6 + + RET + +%macro FILTER_VER_CHROMA_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 + mov r4d, r4m + shl r4d, 5 + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff_V] + add r5, r4 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeff_V + r4] %endif - mov r5d, %2 - - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - -.loop: - FILTER_H4_w%1 t0, t1, t2 - add srcq, srcstrideq - add dstq, dststrideq + lea r4, [r1 * 3] - dec r5d - jnz .loop + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pshufb xm0, xm1, [interp_vert_shuf] + pshufb xm1, [interp_vert_shuf + 32] + vinserti128 m0, m0, xm1, 1 + pmaddubsw m0, [r5] + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +%ifidn %1,pp + pmulhrsw xm0, [pw_512] + packuswb xm0, xm0 
+ lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 +%else + add r3d, r3d + lea r4, [r3 * 3] + psubw xm0, [pw_2000] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 +%endif RET %endmacro + FILTER_VER_CHROMA_AVX2_2x4 pp + FILTER_VER_CHROMA_AVX2_2x4 ps - IPFILTER_CHROMA 6, 8 - IPFILTER_CHROMA 8, 2 - IPFILTER_CHROMA 8, 4 - IPFILTER_CHROMA 8, 6 - IPFILTER_CHROMA 8, 8 - IPFILTER_CHROMA 8, 16 - IPFILTER_CHROMA 8, 32 - IPFILTER_CHROMA 12, 16 - - IPFILTER_CHROMA 6, 16 - IPFILTER_CHROMA 8, 12 - IPFILTER_CHROMA 8, 64 - IPFILTER_CHROMA 12, 32 - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_W 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride -%define coef2 m6 -%define Tm0 m5 -%define Tm1 m4 -%define t3 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - mov r4d, r4m +%macro FILTER_VER_CHROMA_AVX2_2x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd coef2, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - mov r5d, %2 - - pshufd coef2, coef2, 0 - mova t2, [pw_512] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - -.loop: - FILTER_H4_w%1 t0, t1, t2, t3 - add srcq, srcstrideq - add dstq, dststrideq + lea r4, [r1 * 3] - dec r5d - jnz .loop + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + 
r4], 7 + movhlps xm0, xm1 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + vinserti128 m1, m1, xm0, 1 - RET -%endmacro - - IPFILTER_CHROMA_W 16, 4 - IPFILTER_CHROMA_W 16, 8 - IPFILTER_CHROMA_W 16, 12 - IPFILTER_CHROMA_W 16, 16 - IPFILTER_CHROMA_W 16, 32 - IPFILTER_CHROMA_W 32, 8 - IPFILTER_CHROMA_W 32, 16 - IPFILTER_CHROMA_W 32, 24 - IPFILTER_CHROMA_W 24, 32 - IPFILTER_CHROMA_W 32, 32 - - IPFILTER_CHROMA_W 16, 24 - IPFILTER_CHROMA_W 16, 64 - IPFILTER_CHROMA_W 32, 48 - IPFILTER_CHROMA_W 24, 64 - IPFILTER_CHROMA_W 32, 64 - - IPFILTER_CHROMA_W 64, 64 - IPFILTER_CHROMA_W 64, 32 - IPFILTER_CHROMA_W 64, 48 - IPFILTER_CHROMA_W 48, 64 - IPFILTER_CHROMA_W 64, 16 - - -%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst - movu %1, %7 - pshufb %2, %1, [tab_Lm + 0] - pmaddubsw %2, %5 - pshufb %3, %1, [tab_Lm + 16] - pmaddubsw %3, %5 - phaddw %2, %3 - pshufb %4, %1, [tab_Lm + 32] - pmaddubsw %4, %5 - pshufb %1, %1, [tab_Lm + 48] - pmaddubsw %1, %5 - phaddw %4, %1 - phaddw %2, %4 - %if %0 == 8 - pmulhrsw %2, %6 - packuswb %2, %2 - movh %8, %2 - %endif -%endmacro - -%macro FILTER_H8_W4 2 - movu %1, [r0 - 3 + r5] - pshufb %2, %1, [tab_Lm] - pmaddubsw %2, m3 - pshufb m7, %1, [tab_Lm + 16] - pmaddubsw m7, m3 - phaddw %2, m7 - phaddw %2, %2 -%endmacro - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_LUMA 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8 - - mov r4d, r4m - -%ifdef PIC - lea r6, [tab_LumaCoeff] - movh m3, [r6 + r4 * 8] -%else - movh m3, [tab_LumaCoeff + r4 * 8] -%endif - punpcklqdq m3, m3 - -%ifidn %3, pp - mova m2, [pw_512] + pshufb m0, m1, 
[interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m0, m1 +%ifidn %1,pp + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + r3 * 2], xm0, 6 + pextrw [r2 + r4], xm0, 7 %else - mova m2, [pw_2000] -%endif - - mov r4d, %2 -%ifidn %3, ps - add r3, r3 - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 + add r3d, r3d + lea r4, [r3 * 3] + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 %endif - -.loopH: - xor r5, r5 -%rep %1 / 8 - %ifidn %3, pp - FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5] - %else - FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5] - psubw m1, m2 - movu [r2 + 2 * r5], m1 - %endif - add r5, 8 -%endrep - -%rep (%1 % 8) / 4 - FILTER_H8_W4 m0, m1 - %ifidn %3, pp - pmulhrsw m1, m2 - packuswb m1, m1 - movd [r2 + r5], m1 - %else - psubw m1, m2 - movh [r2 + 2 * r5], m1 - %endif -%endrep - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH RET %endmacro + FILTER_VER_CHROMA_AVX2_2x8 pp + FILTER_VER_CHROMA_AVX2_2x8 ps +%macro FILTER_VER_CHROMA_AVX2_2x16 1 INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_4x4, 4,6,6 +cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 mov r4d, r4m + shl r4d, 6 + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - mova m1, [tab_Lm] - vpbroadcastd m2, [pw_1] + lea r4, [r1 * 3] - ; register map - ; m0 - interpolate 
coeff - ; m1 - shuffle order table - ; m2 - constant word 1 + movd xm1, [r0] + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + pinsrw xm0, [r0 + r4], 7 + punpckhqdq xm0, xm1, xm0 + vinserti128 m1, m1, xm0, 1 - sub r0, 3 - ; Row 0-1 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 2-3 - lea r0, [r0 + r1 * 2] - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - - packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] - pmulhrsw m3, [pw_512] - vextracti128 xm4, m3, 1 - packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] - pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0] - - lea r0, [r3 * 3] - movd [r2], xm3 - pextrd [r2+r3], xm3, 2 - pextrd [r2+r3*2], xm3, 1 - pextrd [r2+r0], xm3, 3 - RET + pshufb m2, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m2, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 -%macro FILTER_HORIZ_LUMA_AVX2_4xN 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_pp_4x%1, 4, 6, 9 - mov r4d, r4m + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + punpckhqdq xm1, xm0, xm1 + lea r0, [r0 + r1 * 4] + 
pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + punpckhqdq xm0, xm1, xm0 + vinserti128 m1, m1, xm0, 1 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] + pshufb m0, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m0, m1 +%ifidn %1,pp + mova m1, [pw_512] + pmulhrsw m2, m1 + pmulhrsw m0, m1 + packuswb m2, m0 + lea r4, [r3 * 3] + pextrw [r2], xm2, 0 + pextrw [r2 + r3], xm2, 1 + pextrw [r2 + r3 * 2], xm2, 2 + pextrw [r2 + r4], xm2, 3 + vextracti128 xm0, m2, 1 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm2, 4 + pextrw [r2 + r3], xm2, 5 + pextrw [r2 + r3 * 2], xm2, 6 + pextrw [r2 + r4], xm2, 7 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + r3 * 2], xm0, 6 + pextrw [r2 + r4], xm0, 7 %else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - - mova m1, [tab_Lm] - mova m2, [pw_1] - mova m7, [interp8_hps_shuf] - mova m8, [pw_512] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - lea r4, [r1 * 3] - lea r5, [r3 * 3] - sub r0, 3 -%rep %1 / 8 - ; Row 0-1 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 2-3 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - - packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B 
R0A] - lea r0, [r0 + r1 * 4] - ; Row 4-5 - vbroadcasti128 m5, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - phaddd m5, m4 ; DWORD [R5D R5C R4D R4C R5B R5A R4B R4A] - - ; Row 6-7 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m6, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m6, m1 - pmaddubsw m6, m0 - pmaddwd m6, m2 - phaddd m4, m6 ; DWORD [R7D R7C R6D R6C R7B R7A R6B R6A] - - packssdw m5, m4 ; WORD [R7D R7C R6D R6C R5D R5C R4D R4C R7B R7A R6B R6A R5B R5A R4B R4A] - vpermd m3, m7, m3 - vpermd m5, m7, m5 - pmulhrsw m3, m8 - pmulhrsw m5, m8 - packuswb m3, m5 - vextracti128 xm5, m3, 1 - - movd [r2], xm3 - pextrd [r2 + r3], xm3, 1 - movd [r2 + r3 * 2], xm5 - pextrd [r2 + r5], xm5, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm3, 2 - pextrd [r2 + r3], xm3, 3 - pextrd [r2 + r3 * 2], xm5, 2 - pextrd [r2 + r5], xm5, 3 - lea r0, [r0 + r1 * 4] + add r3d, r3d + lea r4, [r3 * 3] + vbroadcasti128 m1, [pw_2000] + psubw m2, m1 + psubw m0, m1 + vextracti128 xm1, m2, 1 + movd [r2], xm2 + pextrd [r2 + r3], xm2, 1 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 lea r2, [r2 + r3 * 4] -%endrep - RET + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 + vextracti128 xm1, m0, 1 + lea r2, [r2 + r3 * 4] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 %endif + RET %endmacro - FILTER_HORIZ_LUMA_AVX2_4xN 8 - FILTER_HORIZ_LUMA_AVX2_4xN 16 + FILTER_VER_CHROMA_AVX2_2x16 pp + FILTER_VER_CHROMA_AVX2_2x16 ps -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7 - mov r4d, r4m 
+;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W2_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - mova m1, [tab_Lm] - mova m2, [tab_Lm + 32] - - ; register map - ; m0 - interpolate coeff - ; m1, m2 - shuffle order table + pshufb m0, [tab_Cm] - sub r0, 3 - lea r5, [r1 * 3] - lea r4, [r3 * 3] + mova m1, [pw_512] - ; Row 0 - vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m2 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddubsw m4, m0 - phaddw m3, m4 - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m2 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddubsw m5, m0 - phaddw m4, m5 - - phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] - pmulhrsw m3, [pw_512] - - ; Row 2 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m2 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddubsw m5, m0 - phaddw m4, m5 - ; Row 3 - vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m6, m5, m2 - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddubsw m6, m0 - phaddw m5, m6 - - phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] - pmulhrsw m4, [pw_512] + mov r4d, %2 + lea r5, [3 * r1] - packuswb m3, m4 - vextracti128 xm4, m3, 1 - punpcklwd xm5, xm3, xm4 +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] - movq [r2], xm5 - movhps [r2 + r3], xm5 + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + 
punpcklbw m2, m6 - punpckhwd xm5, xm3, xm4 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm5 - RET + pmaddubsw m2, m0 -%macro IPFILTER_LUMA_AVX2_8xN 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7 - mov r4d, r4m + lea r0, [r0 + 4 * r1] + movd m6, [r0] -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 - mova m1, [tab_Lm] - mova m2, [tab_Lm + 32] + pmaddubsw m3, m0 - ; register map - ; m0 - interpolate coeff - ; m1, m2 - shuffle order table + phaddw m2, m3 - sub r0, 3 - lea r5, [r1 * 3] - lea r6, [r3 * 3] - mov r4d, %2 / 4 -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m2 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddubsw m4, m0 - phaddw m3, m4 - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m2 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddubsw m5, m0 - phaddw m4, m5 - - phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] - pmulhrsw m3, [pw_512] - - ; Row 2 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m2 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddubsw m5, m0 - phaddw m4, m5 - ; Row 3 - vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m6, m5, m2 - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddubsw m6, m0 - phaddw m5, m6 - - phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] - pmulhrsw m4, [pw_512] + pmulhrsw m2, m1 - packuswb m3, m4 - vextracti128 xm4, m3, 1 - punpcklwd xm5, xm3, xm4 + movd m7, [r0 + r1] - movq [r2], xm5 - movhps [r2 + r3], xm5 + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 - punpckhwd xm5, xm3, xm4 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 + pmaddubsw m4, m0 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - dec r4d - jnz .loop - RET -%endmacro + movd m3, [r0 + 
2 * r1] - IPFILTER_LUMA_AVX2_8xN 8, 8 - IPFILTER_LUMA_AVX2_8xN 8, 16 - IPFILTER_LUMA_AVX2_8xN 8, 32 + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 -%macro IPFILTER_LUMA_AVX2 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 - sub r0, 3 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] -%else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] -%endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] + pmaddubsw m5, m0 - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 shuffle order table - ; m7 - pw_1 + phaddw m4, m5 - mov r4d, %2/2 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0 - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m2, m3 - pshufb m2, [tab_Tm] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row0 - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - movu [r2], xm4 - movu [r2+r3], xm5 - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop - RET -%endmacro + pmulhrsw m4, m1 + packuswb m2, m4 -%macro IPFILTER_LUMA_32x_avx2 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 - sub r0, 3 - mov r4d, r4m -%ifdef 
PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] -%else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] -%endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] + pextrw [r2], m2, 0 + pextrw [r2 + r3], m2, 2 + lea r2, [r2 + 2 * r3] + pextrw [r2], m2, 4 + pextrw [r2 + r3], m2, 6 - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 shuffle order table - ; m7 - pw_1 + lea r2, [r2 + 2 * r3] - mov r4d, %2 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 8] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - vbroadcasti128 m2, [r0 + 16] - pshufb m5, m2, m3 - pshufb m2, [tab_Tm] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m2, m5 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - movu [r2], xm4 - movu [r2 + 16], xm5 - lea r0, [r0 + r1] - lea r2, [r2 + r3] - dec r4d - jnz .loop + sub r4, 4 + jnz .loop RET %endmacro -%macro IPFILTER_LUMA_64x_avx2 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 - sub r0, 3 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] -%else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] -%endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] + FILTER_V4_W2_H4 2, 8 - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 
shuffle order table - ; m7 - pw_1 + FILTER_V4_W2_H4 2, 16 - mov r4d, %2 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 8] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - vbroadcasti128 m2, [r0 + 16] - pshufb m5, m2, m3 - pshufb m2, [tab_Tm] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m2, m5 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - movu [r2], xm4 - movu [r2 + 16], xm5 - - vbroadcasti128 m4, [r0 + 32] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 40] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 - pmulhrsw m4, [pw_512] - vbroadcasti128 m2, [r0 + 48] - pshufb m5, m2, m3 - pshufb m2, [tab_Tm] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - vbroadcasti128 m5, [r0 + 56] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m2, m5 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - movu [r2 +32], xm4 - movu [r2 + 48], xm5 - - lea r0, [r0 + r1] - lea r2, [r2 + r3] - dec r4d - jnz .loop - RET -%endmacro +;----------------------------------------------------------------------------- +; void 
interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 + + mov r4d, r4m + sub r0, r1 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_48x64, 4,6,8 - sub r0, 3 - mov r4d, r4m %ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 shuffle order table - ; m7 - pw_1 + pshufb m0, [tab_Cm] + lea r5, [r0 + 2 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r5] + movd m5, [r5 + r1] - mov r4d, 64 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 8] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - - vbroadcasti128 m2, [r0 + 16] - pshufb m5, m2, m3 - pshufb m2, [tab_Tm] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m2, m5 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - movu [r2], xm4 - movu [r2 + 16], xm5 - - vbroadcasti128 m4, [r0 + 32] - pshufb m5, m4, m3 - pshufb m4, [tab_Tm] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - 
pmaddwd m4, m7 - vbroadcasti128 m5, [r0 + 40] - pshufb m6, m5, m3 - pshufb m5, [tab_Tm] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 - pmulhrsw m4, [pw_512] - packuswb m4, m4 - vpermq m4, m4, 11011000b - pshufd xm4, xm4, 11011000b - movu [r2 + 32], xm4 - - lea r0, [r0 + r1] - lea r2, [r2 + r3] - dec r4d - jnz .loop - RET + punpcklbw m2, m3 + punpcklbw m1, m4, m5 + punpcklbw m2, m1 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_4x4, 4,6,6 - mov r4d, r4m + pmaddubsw m2, m0 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - vpbroadcastd m2, [pw_1] - vbroadcasti128 m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - - ; Row 0-1 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 2-3 - lea r0, [r0 + r1 * 2] - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m4, m4, [r0 + r1], 1 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - pmulhrsw m3, [pw_512] - vextracti128 xm4, m3, 1 - packuswb xm3, xm4 - - lea r0, [r3 * 3] - movd [r2], xm3 - pextrd [r2+r3], xm3, 2 - pextrd [r2+r3*2], xm3, 1 - pextrd [r2+r0], xm3, 3 - RET + movd m1, [r0 + 4 * r1] -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3 - mov r4d, r4m + punpcklbw m3, m4 + punpcklbw m5, m1 + punpcklbw m3, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - dec r0 - lea r4, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - pshufb m1, [interp4_hpp_shuf] - pmaddubsw m1, m0 - pmaddwd m1, [pw_1] - vextracti128 xm2, m1, 1 - packssdw xm1, xm2 - pmulhrsw xm1, [pw_512] - packuswb xm1, xm1 - - lea r4, [r3 
* 3] - pextrw [r2], xm1, 0 - pextrw [r2 + r3], xm1, 1 - pextrw [r2 + r3 * 2], xm1, 2 - pextrw [r2 + r4], xm1, 3 - RET + pmaddubsw m3, m0 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6 - mov r4d, r4m + phaddw m2, m3 + + pmulhrsw m2, [pw_512] + packuswb m2, m2 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - mova m4, [interp4_hpp_shuf] - mova m5, [pw_1] - dec r0 - lea r4, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 - - lea r4, [r3 * 3] - pextrw [r2], xm1, 0 - pextrw [r2 + r3], xm1, 1 - pextrw [r2 + r3 * 2], xm1, 4 - pextrw [r2 + r4], xm1, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm1, 2 - pextrw [r2 + r3], xm1, 3 - pextrw [r2 + r3 * 2], xm1, 6 - pextrw [r2 + r4], xm1, 7 RET +%macro FILTER_VER_CHROMA_AVX2_4x2 1 INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_32x32, 4,6,7 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 mov r4d, r4m + shl r4d, 5 + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff_V] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeff_V + r4] %endif - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 + lea r4, [r1 * 3] - dec r0 - mov r4d, 32 + movd xm1, [r0] + movd xm2, [r0 + r1] + punpcklbw xm1, xm2 + movd xm3, [r0 + r1 * 2] + punpcklbw xm2, xm3 + movlhps xm1, xm2 + movd xm0, [r0 + r4] + 
punpcklbw xm3, xm0 + movd xm2, [r0 + r1 * 4] + punpcklbw xm0, xm2 + movlhps xm3, xm0 + vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - movu [r2], m3 - lea r2, [r2 + r3] - lea r0, [r0 + r1] - dec r4d - jnz .loop + pmaddubsw m1, [r5] + vextracti128 xm3, m1, 1 + paddw xm1, xm3 +%ifidn %1,pp + pmulhrsw xm1, [pw_512] + packuswb xm1, xm1 + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 +%else + add r3d, r3d + psubw xm1, [pw_2000] + movq [r2], xm1 + movhps [r2 + r3], xm1 +%endif RET +%endmacro + FILTER_VER_CHROMA_AVX2_4x2 pp + FILTER_VER_CHROMA_AVX2_4x2 ps -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7 - mov r4d, r4m +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - mova m6, [pw_512] - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 8 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 
0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movu [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r4d - jnz .loop - RET -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - IPFILTER_LUMA 4, 4, pp - IPFILTER_LUMA 4, 8, pp - IPFILTER_LUMA 12, 16, pp - IPFILTER_LUMA 4, 16, pp + pshufb m0, [tab_Cm] + mova m1, [pw_512] + lea r5, [r0 + 4 * r1] + lea r4, [r1 * 3] -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x8, 4,6,6 - mov r4d, r4m + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 - movu m1, [tab_Tm] - vpbroadcastd m2, [pw_1] + pmaddubsw m2, m0 - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 + movd m6, [r5] - sub r0, 1 - mov r4d, 2 + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 
+ r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, [pw_512] - lea r0, [r0 + r1 * 2] - - ; Row 2 - vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - ; Row 3 - vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, [pw_512] - - packuswb m3, m4 - mova m5, [interp_4tap_8x8_horiz_shuf] - vpermd m3, m5, m3 - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movhps [r2 + r3], xm3 - lea r2, [r2 + r3 * 2] - movq [r2], xm4 - movhps [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1*2] - dec r4d - jnz .loop - RET + pmaddubsw m3, m0 - IPFILTER_LUMA_AVX2 16, 4 - IPFILTER_LUMA_AVX2 16, 8 - IPFILTER_LUMA_AVX2 16, 12 - IPFILTER_LUMA_AVX2 16, 16 - IPFILTER_LUMA_AVX2 16, 32 - IPFILTER_LUMA_AVX2 16, 64 + phaddw m2, m3 - IPFILTER_LUMA_32x_avx2 32 , 8 - IPFILTER_LUMA_32x_avx2 32 , 16 - IPFILTER_LUMA_32x_avx2 32 , 24 - IPFILTER_LUMA_32x_avx2 32 , 32 - IPFILTER_LUMA_32x_avx2 32 , 64 + pmulhrsw m2, m1 - IPFILTER_LUMA_64x_avx2 64 , 64 - IPFILTER_LUMA_64x_avx2 64 , 48 - IPFILTER_LUMA_64x_avx2 64 , 32 - IPFILTER_LUMA_64x_avx2 64 , 16 + movd m7, [r5 + r1] -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x2, 4, 6, 5 - mov r4d, r4m + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - mova m1, [tab_Tm] - mova m2, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, [pw_512] - 
vextracti128 xm4, m3, 1 - packuswb xm3, xm4 - pshufd xm3, xm3, 11011000b - movq [r2], xm3 - movhps [r2 + r3], xm3 - RET + pmaddubsw m4, m0 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x6, 4, 6, 7 - mov r4d, r4m + movd m3, [r5 + 2 * r1] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - mova m1, [tab_Tm] - mova m2, [pw_1] - mova m6, [pw_512] - lea r4, [r1 * 3] - lea r5, [r3 * 3] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 2 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - ; Row 3 - vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movhps [r2 + r3], xm3 - movq [r2 + r3 * 2], xm4 - movhps [r2 + r5], xm4 - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - ; Row 4 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 5 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - vextracti128 xm4, m3, 1 - packuswb xm3, xm4 - pshufd xm3, xm3, 11011000b - movq [r2], xm3 - movhps [r2 + r3], xm3 - RET + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_6x8, 4, 6, 7 - mov r4d, r4m + pmaddubsw m5, m0 -%ifdef PIC - lea r5, 
[tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif + phaddw m4, m5 - mova m1, [tab_Tm] - mova m2, [pw_1] - mova m6, [pw_512] - lea r4, [r1 * 3] - lea r5, [r3 * 3] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 + pmulhrsw m4, m1 - dec r0 -%rep 2 - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 2 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - ; Row 3 - vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vextracti128 xm4, m3, 1 - movd [r2], xm3 - pextrw [r2 + 4], xm4, 0 - pextrd [r2 + r3], xm3, 1 - pextrw [r2 + r3 + 4], xm4, 2 - pextrd [r2 + r3 * 2], xm3, 2 - pextrw [r2 + r3 * 2 + 4], xm4, 4 - pextrd [r2 + r5], xm3, 3 - pextrw [r2 + r5 + 4], xm4, 6 - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] -%endrep + packuswb m2, m4 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m2, 2 + pextrd [r2 + r3], m2, 3 RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; -%macro IPFILTER_CHROMA_HPS_64xN 1 +%macro FILTER_VER_CHROMA_AVX2_4x4 1 INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_64x%1, 4,7,6 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 3 mov r4d, r4m - mov r5d, r5m 
- add r3d, r3d + shl r4d, 6 + sub r0, r1 %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] + lea r4, [r1 * 3] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, %1 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] + mova m2, [interp4_vpp_shuf1] + vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] + mova m2, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2 + 32], m3 - - vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 40] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2 + 64], m3 - - vbroadcasti128 m3, [r0 + 
48] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 56] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2 + 96], m3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop + mova m2, [interp4_vpp_shuf] + pshufb m0, m0, m2 + pshufb m1, m1, m2 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] +%ifidn %1,pp + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r5], xm0, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 +%endif RET %endmacro + FILTER_VER_CHROMA_AVX2_4x4 pp + FILTER_VER_CHROMA_AVX2_4x4 ps - IPFILTER_CHROMA_HPS_64xN 64 - IPFILTER_CHROMA_HPS_64xN 32 - IPFILTER_CHROMA_HPS_64xN 48 - IPFILTER_CHROMA_HPS_64xN 16 - -;----------------------------------------------------------------------------------------------------------------------------- -;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- - -%macro IPFILTER_LUMA_PS_4xN_AVX2 1 +%macro FILTER_VER_CHROMA_AVX2_4x8 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 - mov r5d, r5m - mov r4d, r4m +cglobal interp_4tap_vert_%1_4x8, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + %ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m1, [tab_Lm] - add r3d, r3d - vbroadcasti128 m2, [pw_2000] - - ; register map - ; m0 - interpolate coeff - ; m1 - 
shuffle order table - ; m2 - pw_2000 - - sub r0, 3 - test r5d, r5d - mov r5d, %1 ; loop count variable - height - jz .preloop - lea r6, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src) - 3 * srcStride - add r5d, 7 ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) - -.preloop: - lea r6, [r3 * 3] -.loop: - ; Row 0-1 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 ; shuffled based on the col order tab_Lm - pmaddubsw m3, m0 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 2-3 - lea r0, [r0 + r1 * 2] ;3rd row(i.e 2nd row) - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - phaddw m3, m4 ; all rows and col completed. 
- - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - psubw m3, m2 - - vextracti128 xm4, m3, 1 - movq [r2], xm3 ;row 0 - movhps [r2 + r3], xm3 ;row 1 - movq [r2 + r3 * 2], xm4 ;row 2 - movhps [r2 + r6], xm4 ;row 3 - - lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) - lea r2, [r2 + r3 * 4] ; first loop dst ->5th row(i.e 4) - sub r5d, 4 - jz .end - cmp r5d, 4 - jge .loop - - ; Row 8-9 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 10 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - phaddw m4, m4 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - phaddw m3, m4 - - vpermd m3, m5, m3 ; m5 don't broken in above - psubw m3, m2 - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movhps [r2 + r3], xm3 - movq [r2 + r3 * 2], xm4 -.end: - RET + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif -%endmacro - IPFILTER_LUMA_PS_4xN_AVX2 4 - IPFILTER_LUMA_PS_4xN_AVX2 8 - IPFILTER_LUMA_PS_4xN_AVX2 16 + lea r4, [r1 * 3] -%macro IPFILTER_LUMA_PS_8xN_AVX2 1 -; TODO: verify and enable on X86 mode -%if ARCH_X86_64 == 1 -; void filter_hps(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_8x%1, 4,7,6 - mov r5d, r5m - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r6, [pb_LumaCoeffVer] - add r6, r4 -%else - lea r6, [pb_LumaCoeffVer + r4] -%endif - add r3d, r3d - vpbroadcastd m0, [pw_2000] - sub r0, 3 - lea r4, [pb_8tap_hps_0] - vbroadcasti128 m5, [r4 + 0 * mmsize] - - ; check row count extend for interpolateHV - test r5d, r5d; - mov r5d, %1 - jz .enter_loop - lea r4, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride - sub r0, r4 ; r0(src)-r8 - add r5d, 8-1-2 ; blkheight += N - 1 (7 - 3 = 4 ; 
since the last three rows not in loop) - -.enter_loop: - lea r4, [pb_8tap_hps_0] - - ; ***** register map ***** - ; m0 - pw_2000 - ; r4 - base pointer of shuffle order table - ; r5 - count of loop - ; r6 - point to LumaCoeff -.loop: + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - ; Row 0-1 - movu xm1, [r0] - movu xm2, [r0 + r1] - vinserti128 m1, m1, xm2, 1 - pshufb m2, m1, m5 ; [0 1 1 2 2 3 3 4 ...] - pshufb m3, m1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] - pshufb m4, m1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] - pshufb m1, m1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] - pmaddubsw m2, [r6 + 0 * mmsize] - pmaddubsw m3, [r6 + 1 * mmsize] - pmaddubsw m4, [r6 + 2 * mmsize] - pmaddubsw m1, [r6 + 3 * mmsize] - paddw m2, m3 - paddw m1, m4 - paddw m1, m2 - psubw m1, m0 - - vextracti128 xm2, m1, 1 - movu [r2], xm1 ; row 0 - movu [r2 + r3], xm2 ; row 1 - - lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) - lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) - sub r5d, 2 - jg .loop - jz .end - - ; last row - movu xm1, [r0] - pshufb xm2, xm1, xm5 ; [0 1 1 2 2 3 3 4 ...] - pshufb xm3, xm1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] - pshufb xm4, xm1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] - pshufb xm1, xm1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] 
- pmaddubsw xm2, [r6 + 0 * mmsize] - pmaddubsw xm3, [r6 + 1 * mmsize] - pmaddubsw xm4, [r6 + 2 * mmsize] - pmaddubsw xm1, [r6 + 3 * mmsize] - paddw xm2, xm3 - paddw xm1, xm4 - paddw xm1, xm2 - psubw xm1, xm0 - movu [r2], xm1 ;row 0 -.end: - RET + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m2, m2, m3 + pshufb m4, m4, m3 + pmaddubsw m0, [r5] + pmaddubsw m4, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m2, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] +%ifidn %1,pp + pmulhrsw m0, [pw_512] + pmulhrsw m4, [pw_512] + packuswb m0, m4 + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + psubw m4, [pw_2000] + vextracti128 xm1, m0, 1 + vextracti128 xm2, m4, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 %endif -%endmacro ; IPFILTER_LUMA_PS_8xN_AVX2 - - IPFILTER_LUMA_PS_8xN_AVX2 4 - IPFILTER_LUMA_PS_8xN_AVX2 8 - IPFILTER_LUMA_PS_8xN_AVX2 16 - IPFILTER_LUMA_PS_8xN_AVX2 32 + RET +%endmacro + FILTER_VER_CHROMA_AVX2_4x8 pp + FILTER_VER_CHROMA_AVX2_4x8 ps -%macro IPFILTER_LUMA_PS_16x_AVX2 2 -INIT_YMM avx2 +%macro FILTER_VER_CHROMA_AVX2_4xN 2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7 - mov r5d, r5m - mov r4d, r4m +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + %ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r9, %2 ;height - add r3d, r3d - vbroadcasti128 
m2, [pw_2000] - - ; register map - ; m0 - interpolate coeff - ; m1 , m6 - shuffle order table - ; m2 - pw_2000 - - xor r7, r7 ; loop count variable - sub r0, 3 - test r5d, r5d - jz .label - lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride - sub r0, r8 ; r0(src)-r8 - add r9, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) - -.label: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m6 ;row 1 (col 4 to 7) - pshufb m4, m1 ;row 1 (col 0 to 3) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - phaddw m3, m4 ; all rows and col completed. - - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - psubw m3, m2 - - movu [r2], m3 ;row 0 - - lea r0, [r0 + r1] ; first loop src ->5th row(i.e 4) - lea r2, [r2 + r3] ; first loop dst ->5th row(i.e 4) - dec r9d - jnz .label + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + mova m10, [r5] + mova m11, [r5 + mmsize] +%ifidn %1,pp + mova m9, [pw_512] +%else + add r3d, r3d + mova m9, [pw_2000] +%endif + lea r5, [r3 * 3] +%rep %2 / 16 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd 
xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 + pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] + lea r0, [r0 + r1 * 4] + movd xm5, [r0] + pinsrd xm5, [r0 + r1], 1 + pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] + vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] + mova m5, [interp4_vpp_shuf1] + vpermd m0, m5, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] + vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] + vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] + mova m5, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] + vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] + mova m5, [interp4_vpp_shuf] + pshufb m0, m0, m5 + pshufb m1, m1, m5 + pshufb m2, m2, m5 + pshufb m4, m4, m5 + pshufb m3, m3, m5 + pshufb m6, m6, m5 + pshufb m7, m7, m5 + pshufb m8, m8, m5 + pmaddubsw m0, m10 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + pmaddubsw m8, m10 + pmaddubsw m1, m11 + pmaddubsw m2, m11 + pmaddubsw m3, m11 + pmaddubsw m4, m11 + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] + paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] + paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] +%ifidn %1,pp + pmulhrsw m0, m9 + pmulhrsw m6, m9 + pmulhrsw m7, m9 + pmulhrsw m8, m9 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m7, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm7 + pextrd [r2 + r3], xm7, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r5], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm7, 2 + pextrd [r2 + r3], xm7, 3 + pextrd [r2 + r3 * 2], xm2, 2 + 
pextrd [r2 + r5], xm2, 3 +%else + psubw m0, m9 + psubw m6, m9 + psubw m7, m9 + psubw m8, m9 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m6, 1 + vextracti128 xm3, m7, 1 + vextracti128 xm4, m8, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movhps [r2 + r3], xm6 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm7 + movhps [r2 + r3], xm7 + movq [r2 + r3 * 2], xm3 + movhps [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm4 + movhps [r2 + r5], xm4 +%endif + lea r2, [r2 + r3 * 4] +%endrep RET %endif %endmacro + FILTER_VER_CHROMA_AVX2_4xN pp, 16 + FILTER_VER_CHROMA_AVX2_4xN ps, 16 + FILTER_VER_CHROMA_AVX2_4xN pp, 32 + FILTER_VER_CHROMA_AVX2_4xN ps, 32 - IPFILTER_LUMA_PS_16x_AVX2 16 , 16 - IPFILTER_LUMA_PS_16x_AVX2 16 , 8 - IPFILTER_LUMA_PS_16x_AVX2 16 , 12 - IPFILTER_LUMA_PS_16x_AVX2 16 , 4 - IPFILTER_LUMA_PS_16x_AVX2 16 , 32 - IPFILTER_LUMA_PS_16x_AVX2 16 , 64 - - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_LUMA_PP_W8 2 +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W4_H4 2 INIT_XMM sse4 -cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7 - mov r4d, r4m +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeff] - movh m3, [r5 + r4 * 8] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - movh 
m3, [tab_LumaCoeff + r4 * 8] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufd m0, m3, 0 ; m0 = coeff-L - pshufd m1, m3, 0x55 ; m1 = coeff-H - lea r5, [tab_Tm] ; r5 = shuffle - mova m2, [pw_512] ; m2 = 512 - mov r4d, %2 -.loopH: -%assign x 0 -%rep %1 / 8 - movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0] - pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4] - pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8] - pmaddubsw m4, m0 - pmaddubsw m6, m5, m1 - pmaddubsw m5, m0 - pmaddubsw m3, m1 - paddw m4, m6 - paddw m5, m3 - phaddw m4, m5 - pmulhrsw m4, m2 - packuswb m4, m4 - movh [r2 + x], m4 -%assign x x+8 -%endrep - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - - IPFILTER_LUMA_PP_W8 8, 4 - IPFILTER_LUMA_PP_W8 8, 8 - IPFILTER_LUMA_PP_W8 8, 16 - IPFILTER_LUMA_PP_W8 8, 32 - IPFILTER_LUMA_PP_W8 16, 4 - IPFILTER_LUMA_PP_W8 16, 8 - IPFILTER_LUMA_PP_W8 16, 12 - IPFILTER_LUMA_PP_W8 16, 16 - IPFILTER_LUMA_PP_W8 16, 32 - IPFILTER_LUMA_PP_W8 16, 64 - IPFILTER_LUMA_PP_W8 24, 32 - IPFILTER_LUMA_PP_W8 32, 8 - IPFILTER_LUMA_PP_W8 32, 16 - IPFILTER_LUMA_PP_W8 32, 24 - IPFILTER_LUMA_PP_W8 32, 32 - IPFILTER_LUMA_PP_W8 32, 64 - IPFILTER_LUMA_PP_W8 48, 64 - IPFILTER_LUMA_PP_W8 64, 16 - IPFILTER_LUMA_PP_W8 64, 32 - IPFILTER_LUMA_PP_W8 64, 48 - IPFILTER_LUMA_PP_W8 64, 64 - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- - IPFILTER_LUMA 4, 4, ps - IPFILTER_LUMA 8, 8, ps - IPFILTER_LUMA 8, 4, ps - IPFILTER_LUMA 4, 8, ps - IPFILTER_LUMA 16, 16, ps - IPFILTER_LUMA 16, 8, ps - IPFILTER_LUMA 8, 16, ps - IPFILTER_LUMA 16, 12, ps - 
IPFILTER_LUMA 12, 16, ps - IPFILTER_LUMA 16, 4, ps - IPFILTER_LUMA 4, 16, ps - IPFILTER_LUMA 32, 32, ps - IPFILTER_LUMA 32, 16, ps - IPFILTER_LUMA 16, 32, ps - IPFILTER_LUMA 32, 24, ps - IPFILTER_LUMA 24, 32, ps - IPFILTER_LUMA 32, 8, ps - IPFILTER_LUMA 8, 32, ps - IPFILTER_LUMA 64, 64, ps - IPFILTER_LUMA 64, 32, ps - IPFILTER_LUMA 32, 64, ps - IPFILTER_LUMA 64, 48, ps - IPFILTER_LUMA 48, 64, ps - IPFILTER_LUMA 64, 16, ps - IPFILTER_LUMA 16, 64, ps - -;----------------------------------------------------------------------------- -; Interpolate HV -;----------------------------------------------------------------------------- -%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] - mova %5, [r0 + (%6 + 0) * 16] - mova %1, [r0 + (%6 + 1) * 16] - mova %2, [r0 + (%6 + 2) * 16] - punpcklwd %3, %5, %1 - punpckhwd %5, %1 - pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 - pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] - punpcklwd %4, %1, %2 - punpckhwd %1, %2 - pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 - pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] -%endmacro ; FILTER_HV8_START - -%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] - mova %8, [r0 + (%9 + 0) * 16] - mova %1, [r0 + (%9 + 1) * 16] - punpcklwd %7, %2, %8 - punpckhwd %2, %8 - pmaddwd %7, [r5 + %10 * 16] - pmaddwd %2, [r5 + %10 * 16] - paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 - paddd %5, %2 ; R0 = H[0+1+2+3] - punpcklwd %7, %8, %1 - punpckhwd %8, %1 - pmaddwd %7, [r5 + %10 * 16] - pmaddwd %8, [r5 + %10 * 16] - paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 - paddd %6, %8 ; R1 = H[1+2+3+4] -%endmacro ; FILTER_HV8_MID - -; Round and Saturate -%macro FILTER_HV8_END 4 ; output in [1, 3] - paddd %1, [pd_526336] - paddd %2, [pd_526336] - paddd %3, [pd_526336] - paddd %4, [pd_526336] - psrad %1, 12 - psrad %2, 12 - psrad %3, 12 - psrad %4, 12 - packssdw %1, %2 - packssdw %3, %4 - - ; TODO: is merge better? 
I think this way is short dependency link - packuswb %1, %3 -%endmacro ; FILTER_HV8_END - -;----------------------------------------------------------------------------- -; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 -%define coef m7 -%define stk_buf rsp - - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_LumaCoeff] - movh coef, [r6 + r4 * 8] -%else - movh coef, [tab_LumaCoeff + r4 * 8] -%endif - punpcklqdq coef, coef - - ; move to row -3 - lea r6, [r1 + r1 * 2] - sub r0, r6 - - xor r6, r6 - mov r4, rsp - -.loopH: - FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3] - psubw m1, [pw_2000] - mova [r4], m1 - - add r0, r1 - add r4, 16 - inc r6 - cmp r6, 8+7 - jnz .loopH - - ; ready to phase V - ; Here all of mN is free - - ; load coeff table - shl r5, 6 - lea r6, [tab_LumaCoeffV] - lea r5, [r5 + r6] - - ; load intermedia buffer - mov r0, stk_buf - - ; register mapping - ; r0 - src - ; r5 - coeff - ; r6 - loop_i - - ; let's go - xor r6, r6 - - ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache -.loopV: - - FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 - FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 - FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 - FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 - FILTER_HV8_END m3, m0, m4, m1 - - movh [r2], m3 - movhps [r2 + r3], m3 - - lea r0, [r0 + 16 * 2] - lea r2, [r2 + r3 * 2] - - inc r6 - cmp r6, 8/2 - jnz .loopV - - RET - -;----------------------------------------------------------------------------- -; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) -;----------------------------------------------------------------------------- -INIT_XMM sse3 -cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 
0-15*16 - mov r4d, r4m - mov r5d, r5m - add r4d, r4d - pxor m6, m6 - -%ifdef PIC - lea r6, [tabw_LumaCoeff] - mova m3, [r6 + r4 * 8] -%else - mova m3, [tabw_LumaCoeff + r4 * 8] -%endif - - ; move to row -3 - lea r6, [r1 + r1 * 2] - sub r0, r6 - - mov r4, rsp - -%assign x 0 ;needed for FILTER_H8_W8_sse2 macro -%assign y 1 -%rep 15 - FILTER_H8_W8_sse2 - psubw m1, [pw_2000] - mova [r4], m1 - -%if y < 15 - add r0, r1 - add r4, 16 -%endif -%assign y y+1 -%endrep - - ; ready to phase V - ; Here all of mN is free - - ; load coeff table - shl r5, 6 - lea r6, [tab_LumaCoeffV] - lea r5, [r5 + r6] - - ; load intermedia buffer - mov r0, rsp - - ; register mapping - ; r0 - src - ; r5 - coeff - - ; let's go -%assign y 1 -%rep 4 - FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 - FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 - FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 - FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 - FILTER_HV8_END m3, m0, m4, m1 - - movh [r2], m3 - movhps [r2 + r3], m3 - -%if y < 4 - lea r0, [r0 + 16 * 2] - lea r2, [r2 + r3 * 2] -%endif -%assign y y+1 -%endrep - RET + pshufb m0, [tab_Cm] -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 + mova m1, [pw_512] - mov r4d, r4m - sub r0, r1 + mov r4d, %2 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - lea r4, [r1 * 3] - lea r5, [r0 + 4 * r1] - pshufb m0, [tab_Cm] - mova m1, [pw_512] + lea r5, [3 * r1] +.loop: movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] + movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 @@ -5874,7 +3965,8 @@ cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 pmaddubsw m2, m0 - movd m6, [r5] + lea r0, [r0 + 4 * r1] + 
movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 @@ -5886,7 +3978,7 @@ cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 pmulhrsw m2, m1 - movd m7, [r5 + r1] + movd m7, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 @@ -5894,7 +3986,7 @@ cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 pmaddubsw m4, m0 - movd m3, [r5 + 2 * r1] + movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 @@ -5906,1113 +3998,834 @@ cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 pmulhrsw m4, m1 packuswb m2, m4 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m2, 2 + pextrd [r2 + r3], m2, 3 - pextrw [r2], m2, 0 - pextrw [r2 + r3], m2, 2 lea r2, [r2 + 2 * r3] - pextrw [r2], m2, 4 - pextrw [r2 + r3], m2, 6 + sub r4, 4 + jnz .loop RET +%endmacro -%macro FILTER_VER_CHROMA_AVX2_2x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 - mov r4d, r4m - shl r4d, 5 - sub r0, r1 + FILTER_V4_W4_H4 4, 8 + FILTER_V4_W4_H4 4, 16 -%ifdef PIC - lea r5, [tab_ChromaCoeff_V] - add r5, r4 -%else - lea r5, [tab_ChromaCoeff_V + r4] -%endif + FILTER_V4_W4_H4 4, 32 - lea r4, [r1 * 3] +%macro FILTER_V4_W8_H2 0 + punpcklbw m1, m2 + punpcklbw m7, m3, m0 - pinsrw xm1, [r0], 0 - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 + pmaddubsw m1, m6 + pmaddubsw m7, m5 - pshufb xm0, xm1, [interp_vert_shuf] - pshufb xm1, [interp_vert_shuf + 32] - vinserti128 m0, m0, xm1, 1 - pmaddubsw m0, [r5] - vextracti128 xm1, m0, 1 - paddw xm0, xm1 -%ifidn %1,pp - pmulhrsw xm0, [pw_512] - packuswb xm0, xm0 - lea r4, [r3 * 3] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 -%else - add r3d, r3d - lea r4, [r3 * 3] - psubw xm0, [pw_2000] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 -%endif - RET + paddw m1, m7 + + pmulhrsw m1, m4 + packuswb m1, m1 %endmacro - 
FILTER_VER_CHROMA_AVX2_2x4 pp - FILTER_VER_CHROMA_AVX2_2x4 ps +%macro FILTER_V4_W8_H3 0 + punpcklbw m2, m3 + punpcklbw m7, m0, m1 -%macro FILTER_VER_CHROMA_AVX2_2x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + pmaddubsw m2, m6 + pmaddubsw m7, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + paddw m2, m7 - lea r4, [r1 * 3] + pmulhrsw m2, m4 + packuswb m2, m2 +%endmacro - pinsrw xm1, [r0], 0 - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - movhlps xm0, xm1 - lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - vinserti128 m1, m1, xm0, 1 +%macro FILTER_V4_W8_H4 0 + punpcklbw m3, m0 + punpcklbw m7, m1, m2 - pshufb m0, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m0, m1 -%ifidn %1,pp - pmulhrsw m0, [pw_512] - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - lea r4, [r3 * 3] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + r3 * 2], xm0, 6 - pextrw [r2 + r4], xm0, 7 -%else - add r3d, r3d - lea r4, [r3 * 3] - psubw m0, [pw_2000] - vextracti128 xm1, m0, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 -%endif - RET + pmaddubsw m3, m6 + pmaddubsw m7, m5 + + paddw m3, m7 + + pmulhrsw m3, m4 + packuswb m3, m3 %endmacro - FILTER_VER_CHROMA_AVX2_2x8 pp - FILTER_VER_CHROMA_AVX2_2x8 ps +%macro FILTER_V4_W8_H5 0 + punpcklbw m0, m1 + punpcklbw m7, 
m2, m3 -%macro FILTER_VER_CHROMA_AVX2_2x16 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + pmaddubsw m0, m6 + pmaddubsw m7, m5 + + paddw m0, m7 + + pmulhrsw m0, m4 + packuswb m0, m0 +%endmacro + +%macro FILTER_V4_W8_8x2 2 + FILTER_V4_W8 %1, %2 + movq m0, [r0 + 4 * r1] + + FILTER_V4_W8_H2 + + movh [r2 + r3], m1 +%endmacro + +%macro FILTER_V4_W8_8x4 2 + FILTER_V4_W8_8x2 %1, %2 +;8x3 + lea r6, [r0 + 4 * r1] + movq m1, [r6 + r1] + + FILTER_V4_W8_H3 + + movh [r2 + 2 * r3], m2 + +;8x4 + movq m2, [r6 + 2 * r1] + + FILTER_V4_W8_H4 + + lea r5, [r2 + 2 * r3] + movh [r5 + r3], m3 +%endmacro + +%macro FILTER_V4_W8_8x6 2 + FILTER_V4_W8_8x4 %1, %2 +;8x5 + lea r6, [r6 + 2 * r1] + movq m3, [r6 + r1] + + FILTER_V4_W8_H5 + + movh [r2 + 4 * r3], m0 + +;8x6 + movq m0, [r0 + 8 * r1] + + FILTER_V4_W8_H2 + + lea r5, [r2 + 4 * r3] + movh [r5 + r3], m1 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 + + mov r4d, r4m + + sub r0, r1 + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + lea r5, [r0 + 2 * r1] + movq m3, [r5 + r1] + + punpcklbw m0, m1 + punpcklbw m4, m2, m3 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + lea r6, [tab_ChromaCoeff] + movd m5, [r6 + r4 * 4] %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + movd m5, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] + pshufb m6, m5, [tab_Vm] + pmaddubsw m0, m6 - movd xm1, [r0] - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw 
xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - pinsrw xm0, [r0 + r4], 7 - punpckhqdq xm0, xm1, xm0 - vinserti128 m1, m1, xm0, 1 + pshufb m5, [tab_Vm + 16] + pmaddubsw m4, m5 - pshufb m2, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m2, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 + paddw m0, m4 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - punpckhqdq xm1, xm0, xm1 - lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - punpckhqdq xm0, xm1, xm0 - vinserti128 m1, m1, xm0, 1 + mova m4, [pw_512] - pshufb m0, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m0, m1 -%ifidn %1,pp - mova m1, [pw_512] - pmulhrsw m2, m1 - pmulhrsw m0, m1 - packuswb m2, m0 - lea r4, [r3 * 3] - pextrw [r2], xm2, 0 - pextrw [r2 + r3], xm2, 1 - pextrw [r2 + r3 * 2], xm2, 2 - pextrw [r2 + r4], xm2, 3 - vextracti128 xm0, m2, 1 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm2, 4 - pextrw [r2 + r3], xm2, 5 - pextrw [r2 + r3 * 2], xm2, 6 - pextrw [r2 + r4], xm2, 7 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + r3 * 2], xm0, 6 - pextrw [r2 + r4], xm0, 7 -%else - add r3d, r3d - lea r4, [r3 * 3] - vbroadcasti128 m1, [pw_2000] - psubw m2, m1 - psubw m0, m1 - vextracti128 xm1, m2, 1 - movd [r2], xm2 - pextrd [r2 + r3], xm2, 1 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r4], xm2, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 - vextracti128 xm1, m0, 1 - lea r2, [r2 + r3 * 4] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 
- pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 -%endif - RET + pmulhrsw m0, m4 + packuswb m0, m0 + movh [r2], m0 %endmacro - FILTER_VER_CHROMA_AVX2_2x16 pp - FILTER_VER_CHROMA_AVX2_2x16 ps +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + FILTER_V4_W8_8x2 8, 2 + + RET ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W2_H4 2 + FILTER_V4_W8_8x4 8, 4 + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + FILTER_V4_W8_8x6 8, 6 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 +cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 - mov r4d, r4m - sub r0, r1 + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, 
[tab_Cm] + pshufb m0, [tab_Cm] - mova m1, [pw_512] + movd m2, [r0] + movd m3, [r0 + r1] + lea r5, [r0 + 2 * r1] + movd m4, [r5] + movd m5, [r5 + r1] - mov r4d, %2 - lea r5, [3 * r1] + punpcklbw m2, m3 + punpcklbw m1, m4, m5 + punpcklbw m2, m1 -.loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + pmaddubsw m2, m0 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movd m1, [r0 + 4 * r1] - pmaddubsw m2, m0 + punpcklbw m3, m4 + punpcklbw m5, m1 + punpcklbw m3, m5 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + pmaddubsw m3, m0 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + phaddw m2, m3 - pmaddubsw m3, m0 + psubw m2, [pw_2000] + movh [r2], m2 + movhps [r2 + r3], m2 - phaddw m2, m3 + RET - pmulhrsw m2, m1 +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 - movd m7, [r0 + r1] + mov r4d, r4m + sub r0, r1 + add r3d, r3d - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif - pmaddubsw m4, m0 + pshufb m0, [tab_Cm] - movd m3, [r0 + 2 * r1] + lea r4, [r1 * 3] + lea r5, [r0 + 4 * r1] - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] - pmaddubsw m5, m0 + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 - phaddw m4, m5 + pmaddubsw m2, m0 - pmulhrsw m4, m1 - packuswb m2, m4 + movd m6, [r5] - pextrw [r2], m2, 0 - pextrw [r2 + r3], m2, 2 - lea r2, [r2 + 2 * r3] - pextrw [r2], m2, 4 - pextrw [r2 + r3], m2, 6 + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, 
m1 - lea r2, [r2 + 2 * r3] + pmaddubsw m3, m0 - sub r4, 4 - jnz .loop - RET -%endmacro + phaddw m2, m3 - FILTER_V4_W2_H4 2, 8 + mova m1, [pw_2000] - FILTER_V4_W2_H4 2, 16 + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 + movd m2, [r5 + r1] - mov r4d, r4m - sub r0, r1 + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + pmaddubsw m4, m0 - pshufb m0, [tab_Cm] - lea r5, [r0 + 2 * r1] + movd m3, [r5 + 2 * r1] - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r5] - movd m5, [r5 + r1] + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 - punpcklbw m2, m3 - punpcklbw m1, m4, m5 - punpcklbw m2, m1 + pmaddubsw m5, m0 - pmaddubsw m2, m0 + phaddw m4, m5 - movd m1, [r0 + 4 * r1] - - punpcklbw m3, m4 - punpcklbw m5, m1 - punpcklbw m3, m5 - - pmaddubsw m3, m0 - - phaddw m2, m3 - - pmulhrsw m2, [pw_512] - packuswb m2, m2 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 + psubw m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 RET -%macro FILTER_VER_CHROMA_AVX2_4x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 - mov r4d, r4m - shl r4d, 5 - sub r0, r1 +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W4_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, 
r1 + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeff_V] - add r5, r4 + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - lea r5, [tab_ChromaCoeff_V + r4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] + pshufb m0, [tab_Cm] - movd xm1, [r0] - movd xm2, [r0 + r1] - punpcklbw xm1, xm2 - movd xm3, [r0 + r1 * 2] - punpcklbw xm2, xm3 - movlhps xm1, xm2 - movd xm0, [r0 + r4] - punpcklbw xm3, xm0 - movd xm2, [r0 + r1 * 4] - punpcklbw xm0, xm2 - movlhps xm3, xm0 - vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] + mova m1, [pw_2000] - pmaddubsw m1, [r5] - vextracti128 xm3, m1, 1 - paddw xm1, xm3 -%ifidn %1,pp - pmulhrsw xm1, [pw_512] - packuswb xm1, xm1 - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 -%else - add r3d, r3d - psubw xm1, [pw_2000] - movq [r2], xm1 - movhps [r2 + r3], xm1 -%endif + mov r4d, %2/4 + lea r5, [3 * r1] + +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop RET %endmacro - FILTER_VER_CHROMA_AVX2_4x2 pp - FILTER_VER_CHROMA_AVX2_4x2 ps + FILTER_V_PS_W4_H4 4, 8 + FILTER_V_PS_W4_H4 4, 16 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- 
+ FILTER_V_PS_W4_H4 4, 32 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H2 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 - mov r4d, r4m - sub r0, r1 + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + movd m5, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, [tab_Cm] - mova m1, [pw_512] - lea r5, [r0 + 4 * r1] - lea r4, [r1 * 3] + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] + mov r4d, %2/2 + lea r5, [3 * r1] - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 +.loopH: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] - pmaddubsw m2, m0 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 - movd m6, [r5] + pmaddubsw m0, m6 + pmaddubsw m2, m5 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + paddw m0, m2 - pmaddubsw m3, m0 + psubw m0, m4 + movu [r2], m0 - phaddw m2, m3 + movq m0, [r0 + 4 * r1] - pmulhrsw m2, m1 + punpcklbw m3, m0 - movd m7, [r5 + r1] + pmaddubsw m1, m6 + pmaddubsw m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 + paddw m1, m3 + psubw m1, m4 - pmaddubsw m4, m0 + movu [r2 + r3], m1 - movd m3, [r5 + 2 * r1] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 + dec r4d + jnz .loopH - pmaddubsw m5, m0 + RET +%endmacro - phaddw m4, m5 + FILTER_V_PS_W8_H8_H16_H2 8, 2 + FILTER_V_PS_W8_H8_H16_H2 8, 
4 + FILTER_V_PS_W8_H8_H16_H2 8, 6 - pmulhrsw m4, m1 + FILTER_V_PS_W8_H8_H16_H2 8, 12 + FILTER_V_PS_W8_H8_H16_H2 8, 64 - packuswb m2, m4 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m2, 2 - pextrd [r2 + r3], m2, 3 - RET -%macro FILTER_VER_CHROMA_AVX2_4x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x4, 4, 6, 3 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + movd m5, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] - mova m2, [interp4_vpp_shuf1] - vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] - mova m2, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] + mov r4d, %2/4 + lea r5, [3 * r1] - mova m2, [interp4_vpp_shuf] - pshufb m0, m0, m2 - pshufb m1, m1, m2 - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + mmsize] - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] -%ifidn %1,pp - pmulhrsw m0, [pw_512] - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - lea r5, [r3 * 3] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 
2 - pextrd [r2 + r5], xm0, 3 -%else - add r3d, r3d - psubw m0, [pw_2000] - vextracti128 xm1, m0, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 -%endif - RET -%endmacro - FILTER_VER_CHROMA_AVX2_4x4 pp - FILTER_VER_CHROMA_AVX2_4x4 ps +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] -%macro FILTER_VER_CHROMA_AVX2_4x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x8, 4, 6, 5 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 - lea r4, [r1 * 3] + paddw m0, m7 - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] - mova m3, [interp4_vpp_shuf1] - vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] - mova m3, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] + psubw m0, m4 + movu [r2], m0 - mova m3, [interp4_vpp_shuf] - pshufb m0, m0, m3 - pshufb m1, m1, m3 - pshufb m2, m2, m3 - pshufb m4, m4, m3 - pmaddubsw m0, [r5] - pmaddubsw m4, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m2, [r5 + mmsize] - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] - paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] -%ifidn %1,pp - pmulhrsw m0, [pw_512] - pmulhrsw m4, [pw_512] - packuswb m0, m4 - vextracti128 xm1, m0, 1 - lea r5, [r3 * 3] - movd [r2], 
xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r5], xm1, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r5], xm1, 3 -%else - add r3d, r3d - psubw m0, [pw_2000] - psubw m4, [pw_2000] - vextracti128 xm1, m0, 1 - vextracti128 xm2, m4, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 -%endif + lea r0, [r0 + 4 * r1] + movq m0, [r0] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + + psubw m1, m4 + movu [r2 + r3], m1 + + movq m1, [r0 + r1] + + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + + psubw m2, m4 + lea r2, [r2 + 2 * r3] + movu [r2], m2 + + movq m2, [r0 + 2 * r1] + + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + psubw m3, m4 + + movu [r2 + r3], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop RET %endmacro - FILTER_VER_CHROMA_AVX2_4x8 pp - FILTER_VER_CHROMA_AVX2_4x8 ps + FILTER_V_PS_W8_H8_H16_H32 8, 8 + FILTER_V_PS_W8_H8_H16_H32 8, 16 + FILTER_V_PS_W8_H8_H16_H32 8, 32 -%macro FILTER_VER_CHROMA_AVX2_4xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_V_PS_W6 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else - lea 
r5, [tab_ChromaCoeffVer_32 + r4] + movd m5, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] - mova m10, [r5] - mova m11, [r5 + mmsize] -%ifidn %1,pp - mova m9, [pw_512] -%else - add r3d, r3d - mova m9, [pw_2000] -%endif - lea r5, [r3 * 3] -%rep %2 / 16 - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 - pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] - lea r0, [r0 + r1 * 4] - movd xm4, [r0] - pinsrd xm4, [r0 + r1], 1 - pinsrd xm4, [r0 + r1 * 2], 2 - pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] - vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] - lea r0, [r0 + r1 * 4] - movd xm5, [r0] - pinsrd xm5, [r0 + r1], 1 - pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] - vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] - mova m5, [interp4_vpp_shuf1] - vpermd m0, m5, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] - vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] - vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] - mova m5, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] - vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] - vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 - mova m5, [interp4_vpp_shuf] - pshufb m0, m0, m5 - pshufb m1, m1, m5 - pshufb m2, m2, m5 - pshufb m4, m4, m5 - pshufb m3, m3, m5 - pshufb m6, m6, m5 - pshufb m7, m7, m5 - pshufb m8, m8, m5 - pmaddubsw m0, m10 - 
pmaddubsw m6, m10 - pmaddubsw m7, m10 - pmaddubsw m8, m10 - pmaddubsw m1, m11 - pmaddubsw m2, m11 - pmaddubsw m3, m11 - pmaddubsw m4, m11 - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] - paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] - paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] - paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] -%ifidn %1,pp - pmulhrsw m0, m9 - pmulhrsw m6, m9 - pmulhrsw m7, m9 - pmulhrsw m8, m9 - packuswb m0, m6 - packuswb m7, m8 - vextracti128 xm1, m0, 1 - vextracti128 xm2, m7, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r5], xm1, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r5], xm1, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm7 - pextrd [r2 + r3], xm7, 1 - movd [r2 + r3 * 2], xm2 - pextrd [r2 + r5], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm7, 2 - pextrd [r2 + r3], xm7, 3 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r5], xm2, 3 -%else - psubw m0, m9 - psubw m6, m9 - psubw m7, m9 - psubw m8, m9 - vextracti128 xm1, m0, 1 - vextracti128 xm2, m6, 1 - vextracti128 xm3, m7, 1 - vextracti128 xm4, m8, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 - lea r2, [r2 + r3 * 4] - movq [r2], xm6 - movhps [r2 + r3], xm6 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm7 - movhps [r2 + r3], xm7 - movq [r2 + r3 * 2], xm3 - movhps [r2 + r5], xm3 - lea r2, [r2 + r3 * 4] - movq [r2], xm8 - movhps [r2 + r3], xm8 - movq [r2 + r3 * 2], xm4 - movhps [r2 + r5], xm4 -%endif - lea r2, [r2 + r3 * 4] -%endrep - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_4xN pp, 16 - FILTER_VER_CHROMA_AVX2_4xN ps, 16 - FILTER_VER_CHROMA_AVX2_4xN pp, 32 - FILTER_VER_CHROMA_AVX2_4xN ps, 32 +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, 
intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W4_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 - mov r4d, r4m - sub r0, r1 + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + paddw m0, m7 + psubw m0, m4 - pshufb m0, [tab_Cm] + movh [r2], m0 + pshufd m0, m0, 2 + movd [r2 + 8], m0 - mova m1, [pw_512] + lea r0, [r0 + 4 * r1] + movq m0, [r0] + punpcklbw m3, m0 - mov r4d, %2 + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 - lea r5, [3 * r1] + paddw m1, m7 + psubw m1, m4 -.loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + movh [r2 + r3], m1 + pshufd m1, m1, 2 + movd [r2 + r3 + 8], m1 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movq m1, [r0 + r1] + punpcklbw m0, m1 - pmaddubsw m2, m0 + pmaddubsw m2, m6 + pmaddubsw m0, m5 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + paddw m2, m0 + psubw m2, m4 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + lea r2,[r2 + 2 * r3] + movh [r2], m2 + pshufd m2, m2, 2 + movd [r2 + 8], m2 - pmaddubsw m3, m0 + movq m2,[r0 + 2 * r1] + punpcklbw m1, m2 - phaddw m2, m3 + pmaddubsw m3, m6 + pmaddubsw m1, m5 - pmulhrsw m2, m1 + paddw m3, m1 + psubw m3, m4 - movd m7, [r0 + r1] + movh [r2 + r3], m3 + pshufd m3, m3, 2 + movd [r2 + r3 + 8], m3 - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 + lea r2, [r2 + 2 * r3] - pmaddubsw m4, m0 + dec r4d + jnz .loop + RET +%endmacro - movd m3, [r0 + 2 * r1] + FILTER_V_PS_W6 6, 8 + FILTER_V_PS_W6 6, 16 - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t 
*dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W12 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 - pmaddubsw m5, m0 + mov r4d, r4m + sub r0, r1 + add r3d, r3d - phaddw m4, m5 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif - pmulhrsw m4, m1 - packuswb m2, m4 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m2, 2 - pextrd [r2 + r3], m2, 3 + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] - lea r2, [r2 + 2 * r3] + mov r4d, %2/2 - sub r4, 4 - jnz .loop - RET -%endmacro +.loop: + movu m2, [r0] + movu m3, [r0 + r1] - FILTER_V4_W4_H4 4, 8 - FILTER_V4_W4_H4 4, 16 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 - FILTER_V4_W4_H4 4, 32 + pmaddubsw m4, m1 + pmaddubsw m2, m1 -%macro FILTER_V4_W8_H2 0 - punpcklbw m1, m2 - punpcklbw m7, m3, m0 + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] - pmaddubsw m1, m6 - pmaddubsw m7, m5 + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 - paddw m1, m7 + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 - pmulhrsw m1, m4 - packuswb m1, m1 -%endmacro + mova m6, [pw_2000] -%macro FILTER_V4_W8_H3 0 - punpcklbw m2, m3 - punpcklbw m7, m0, m1 + psubw m4, m6 + psubw m2, m6 - pmaddubsw m2, m6 - pmaddubsw m7, m5 + movu [r2], m4 + movh [r2 + 16], m2 - paddw m2, m7 + punpcklbw m4, m3, m5 + punpckhbw m3, m5 - pmulhrsw m2, m4 - packuswb m2, m2 -%endmacro + pmaddubsw m4, m1 + pmaddubsw m3, m1 -%macro FILTER_V4_W8_H4 0 - punpcklbw m3, m0 - punpcklbw m7, m1, m2 + movu m2, [r0 + 2 * r1] - pmaddubsw m3, m6 - pmaddubsw m7, m5 - - paddw m3, m7 - - pmulhrsw m3, m4 - packuswb m3, m3 -%endmacro - -%macro FILTER_V4_W8_H5 0 - punpcklbw m0, m1 - punpcklbw m7, m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m5 - - paddw m0, m7 - - pmulhrsw m0, m4 - packuswb m0, m0 -%endmacro - -%macro FILTER_V4_W8_8x2 2 - 
FILTER_V4_W8 %1, %2 - movq m0, [r0 + 4 * r1] - - FILTER_V4_W8_H2 - - movh [r2 + r3], m1 -%endmacro + punpcklbw m5, m7, m2 + punpckhbw m7, m2 -%macro FILTER_V4_W8_8x4 2 - FILTER_V4_W8_8x2 %1, %2 -;8x3 - lea r6, [r0 + 4 * r1] - movq m1, [r6 + r1] + pmaddubsw m5, m0 + pmaddubsw m7, m0 - FILTER_V4_W8_H3 + paddw m4, m5 + paddw m3, m7 - movh [r2 + 2 * r3], m2 + psubw m4, m6 + psubw m3, m6 -;8x4 - movq m2, [r6 + 2 * r1] + movu [r2 + r3], m4 + movh [r2 + r3 + 16], m3 - FILTER_V4_W8_H4 + lea r2, [r2 + 2 * r3] - lea r5, [r2 + 2 * r3] - movh [r5 + r3], m3 + dec r4d + jnz .loop + RET %endmacro -%macro FILTER_V4_W8_8x6 2 - FILTER_V4_W8_8x4 %1, %2 -;8x5 - lea r6, [r6 + 2 * r1] - movq m3, [r6 + r1] - - FILTER_V4_W8_H5 - - movh [r2 + 4 * r3], m0 - -;8x6 - movq m0, [r0 + 8 * r1] - - FILTER_V4_W8_H2 - - lea r5, [r2 + 4 * r3] - movh [r5 + r3], m1 -%endmacro + FILTER_V_PS_W12 12, 16 + FILTER_V_PS_W12 12, 32 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W8 2 +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 - - mov r4d, r4m - - sub r0, r1 - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - punpcklbw m0, m1 - punpcklbw m4, m2, m3 + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd m5, [r6 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + 
r4 * 4] %else - movd m5, [tab_ChromaCoeff + r4 * 4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m6, m5, [tab_Vm] - pmaddubsw m0, m6 - - pshufb m5, [tab_Vm + 16] - pmaddubsw m4, m5 - - paddw m0, m4 + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 - mova m4, [pw_512] +.loop: + movu m2, [r0] + movu m3, [r0 + r1] - pmulhrsw m0, m4 - packuswb m0, m0 - movh [r2], m0 -%endmacro + punpcklbw m4, m2, m3 + punpckhbw m2, m3 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - FILTER_V4_W8_8x2 8, 2 + pmaddubsw m4, m1 + pmaddubsw m2, m1 - RET + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - FILTER_V4_W8_8x4 8, 4 + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 - RET + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - FILTER_V4_W8_8x6 8, 6 + mova m6, [pw_2000] - RET + psubw m4, m6 + psubw m2, m6 -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_4x2, 4, 
6, 6 + movu [r2], m4 + movu [r2 + 16], m2 - mov r4d, r4m - sub r0, r1 - add r3d, r3d + punpcklbw m4, m3, m5 + punpckhbw m3, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + pmaddubsw m4, m1 + pmaddubsw m3, m1 - pshufb m0, [tab_Cm] + movu m5, [r0 + 2 * r1] - movd m2, [r0] - movd m3, [r0 + r1] - lea r5, [r0 + 2 * r1] - movd m4, [r5] - movd m5, [r5 + r1] + punpcklbw m2, m7, m5 + punpckhbw m7, m5 - punpcklbw m2, m3 - punpcklbw m1, m4, m5 - punpcklbw m2, m1 + pmaddubsw m2, m0 + pmaddubsw m7, m0 - pmaddubsw m2, m0 + paddw m4, m2 + paddw m3, m7 - movd m1, [r0 + 4 * r1] + psubw m4, m6 + psubw m3, m6 - punpcklbw m3, m4 - punpcklbw m5, m1 - punpcklbw m3, m5 + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 - pmaddubsw m3, m0 + lea r2, [r2 + 2 * r3] - phaddw m2, m3 + dec r4d + jnz .loop + RET +%endmacro - psubw m2, [pw_2000] - movh [r2], m2 - movhps [r2 + r3], m2 + FILTER_V_PS_W16 16, 4 + FILTER_V_PS_W16 16, 8 + FILTER_V_PS_W16 16, 12 + FILTER_V_PS_W16 16, 16 + FILTER_V_PS_W16 16, 32 - RET + FILTER_V_PS_W16 16, 24 + FILTER_V_PS_W16 16, 64 -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- +;-------------------------------------------------------------------------------------------------------------- +;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V4_PS_W24 2 INIT_XMM sse4 -cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 +cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -7025,67 +4838,110 @@ cglobal interp_4tap_vert_ps_4x4, 
4, 6, 7 movd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, [tab_Cm] + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] - lea r4, [r1 * 3] - lea r5, [r0 + 4 * r1] + mov r4d, %2/2 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] +.loop: + movu m2, [r0] + movu m3, [r0 + r1] - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 - pmaddubsw m2, m0 + pmaddubsw m4, m1 + pmaddubsw m2, m1 - movd m6, [r5] + lea r5, [r0 + 2 * r1] - punpcklbw m3, m4 - punpcklbw m1, m5, m6 - punpcklbw m3, m1 + movu m5, [r5] + movu m7, [r5 + r1] - pmaddubsw m3, m0 + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 - phaddw m2, m3 + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 - mova m1, [pw_2000] + mova m6, [pw_2000] - psubw m2, m1 - movh [r2], m2 - movhps [r2 + r3], m2 + psubw m4, m6 + psubw m2, m6 - movd m2, [r5 + r1] + movu [r2], m4 + movu [r2 + 16], m2 - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 + punpcklbw m4, m3, m5 + punpckhbw m3, m5 - pmaddubsw m4, m0 + pmaddubsw m4, m1 + pmaddubsw m3, m1 - movd m3, [r5 + 2 * r1] + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] - punpcklbw m5, m6 punpcklbw m2, m3 + punpcklbw m7, m4, m5 + + pmaddubsw m2, m1 + pmaddubsw m7, m0 + + paddw m2, m7 + psubw m2, m6 + + movu [r2 + 32], m2 + + movq m2, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 punpcklbw m5, m2 + pmaddubsw m3, m1 pmaddubsw m5, m0 - phaddw m4, m5 + paddw m3, m5 + psubw m3, m6 - psubw m4, m1 + movu [r2 + r3 + 32], m3 + + mov r0, r5 lea r2, [r2 + 2 * r3] - movh [r2], m4 - movhps [r2 + r3], m4 + dec r4d + jnz .loop RET +%endmacro + + FILTER_V4_PS_W24 24, 32 + + FILTER_V4_PS_W24 24, 64 
;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W4_H4 2 +%macro FILTER_V_PS_W32 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 @@ -7100,731 +4956,942 @@ cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 movd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, [tab_Cm] + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] - mova m1, [pw_2000] + mova m7, [pw_2000] - mov r4d, %2/4 - lea r5, [3 * r1] + mov r4d, %2 .loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + movu m2, [r0] + movu m3, [r0 + r1] - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 - pmaddubsw m2, m0 + pmaddubsw m4, m1 + pmaddubsw m2, m1 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + pmaddubsw m6, m0 pmaddubsw m3, m0 - phaddw m2, m3 + paddw m4, m6 + paddw m2, m3 - psubw m2, m1 - movh [r2], m2 - movhps [r2 + r3], m2 + psubw m4, m7 + psubw m2, m7 - movd m2, [r0 + r1] + movu [r2], m4 + movu [r2 + 16], m2 - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] - pmaddubsw m4, m0 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 - movd m3, [r0 + 2 * r1] + pmaddubsw m4, m1 + pmaddubsw m2, m1 - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] - pmaddubsw m5, m0 + punpcklbw m6, m3, m5 + punpckhbw m3, m5 - phaddw m4, m5 + pmaddubsw m6, m0 + 
pmaddubsw m3, m0 - psubw m4, m1 - lea r2, [r2 + 2 * r3] - movh [r2], m4 - movhps [r2 + r3], m4 + paddw m4, m6 + paddw m2, m3 - lea r2, [r2 + 2 * r3] + psubw m4, m7 + psubw m2, m7 + + movu [r2 + 32], m4 + movu [r2 + 48], m2 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro - FILTER_V_PS_W4_H4 4, 8 - FILTER_V_PS_W4_H4 4, 16 + FILTER_V_PS_W32 32, 8 + FILTER_V_PS_W32 32, 16 + FILTER_V_PS_W32 32, 24 + FILTER_V_PS_W32 32, 32 - FILTER_V_PS_W4_H4 4, 32 + FILTER_V_PS_W32 32, 48 + FILTER_V_PS_W32 32, 64 -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W8_H8_H16_H2 2 +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8_H8_H16_H32 2 INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 - mov r4d, r4m - sub r0, r1 - add r3d, r3d + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] %else - movd m5, [tab_ChromaCoeff + r4 * 4] + movd m5, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_2000] + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_512] + lea r5, [r1 * 3] - mov r4d, %2/2 - lea r5, [3 * r1] + mov r4d, %2 -.loopH: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] - punpcklbw m0, m1 
- punpcklbw m1, m2 - punpcklbw m2, m3 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 - pmaddubsw m0, m6 - pmaddubsw m2, m5 + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 - paddw m0, m2 + paddw m0, m7 - psubw m0, m4 - movu [r2], m0 + pmulhrsw m0, m4 + packuswb m0, m0 + movh [r2], m0 - movq m0, [r0 + 4 * r1] + lea r0, [r0 + 4 * r1] + movq m0, [r0] - punpcklbw m3, m0 + punpcklbw m3, m0 - pmaddubsw m1, m6 - pmaddubsw m3, m5 + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 - paddw m1, m3 - psubw m1, m4 + paddw m1, m7 - movu [r2 + r3], m1 + pmulhrsw m1, m4 + packuswb m1, m1 + movh [r2 + r3], m1 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + movq m1, [r0 + r1] - dec r4d - jnz .loopH + punpcklbw m0, m1 - RET -%endmacro + pmaddubsw m2, m6 + pmaddubsw m0, m5 - FILTER_V_PS_W8_H8_H16_H2 8, 2 - FILTER_V_PS_W8_H8_H16_H2 8, 4 - FILTER_V_PS_W8_H8_H16_H2 8, 6 - - FILTER_V_PS_W8_H8_H16_H2 8, 12 - FILTER_V_PS_W8_H8_H16_H2 8, 64 - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W8_H8_H16_H32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_2000] - - mov r4d, %2/4 - lea r5, [3 * r1] - -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 - - paddw m0, m7 - - psubw m0, m4 - movu [r2], m0 - - lea r0, [r0 + 4 * r1] - movq m0, [r0] - - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 - - 
paddw m1, m7 - - psubw m1, m4 - movu [r2 + r3], m1 - - movq m1, [r0 + r1] - - punpcklbw m0, m1 - - pmaddubsw m2, m6 - pmaddubsw m0, m5 - - paddw m2, m0 + paddw m2, m0 - psubw m2, m4 - lea r2, [r2 + 2 * r3] - movu [r2], m2 + pmulhrsw m2, m4 - movq m2, [r0 + 2 * r1] + movq m7, [r0 + 2 * r1] + punpcklbw m1, m7 - punpcklbw m1, m2 + pmaddubsw m3, m6 + pmaddubsw m1, m5 - pmaddubsw m3, m6 - pmaddubsw m1, m5 + paddw m3, m1 - paddw m3, m1 - psubw m3, m4 + pmulhrsw m3, m4 + packuswb m2, m3 - movu [r2 + r3], m3 + lea r2, [r2 + 2 * r3] + movh [r2], m2 + movhps [r2 + r3], m2 - lea r2, [r2 + 2 * r3] + lea r2, [r2 + 2 * r3] - dec r4d + sub r4, 4 jnz .loop RET %endmacro - FILTER_V_PS_W8_H8_H16_H32 8, 8 - FILTER_V_PS_W8_H8_H16_H32 8, 16 - FILTER_V_PS_W8_H8_H16_H32 8, 32 + FILTER_V4_W8_H8_H16_H32 8, 8 + FILTER_V4_W8_H8_H16_H32 8, 16 + FILTER_V4_W8_H8_H16_H32 8, 32 -;------------------------------------------------------------------------------------------------------------ -;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -%macro FILTER_V_PS_W6 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 + FILTER_V4_W8_H8_H16_H32 8, 12 + FILTER_V4_W8_H8_H16_H32 8, 64 - mov r4d, r4m - sub r0, r1 - add r3d, r3d +%macro PROCESS_CHROMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = 
row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 +%endmacro + +%macro FILTER_VER_CHROMA_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - movd m5, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 
16] - mova m4, [pw_2000] - lea r5, [3 * r1] - mov r4d, %2/4 - -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif + RET +%endmacro - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 + FILTER_VER_CHROMA_AVX2_8x8 pp + FILTER_VER_CHROMA_AVX2_8x8 ps - paddw m0, m7 - psubw m0, m4 +%macro FILTER_VER_CHROMA_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 - movh [r2], m0 - pshufd m0, m0, 2 - movd [r2 + 8], m0 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif - lea r0, [r0 + 4 * r1] - movq m0, [r0] - punpcklbw m3, m0 + lea r4, [r1 * 3] + sub r0, r1 - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 - - paddw m1, m7 - psubw m1, m4 - - movh [r2 + r3], m1 - pshufd m1, m1, 2 - movd 
[r2 + r3 + 8], m1 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - - pmaddubsw m2, m6 - pmaddubsw m0, m5 - - paddw m2, m0 - psubw m2, m4 - - lea r2,[r2 + 2 * r3] - movh [r2], m2 - pshufd m2, m2, 2 - movd [r2 + 8], m2 - - movq m2,[r0 + 2 * r1] - punpcklbw m1, m2 - - pmaddubsw m3, m6 - pmaddubsw m1, m5 - - paddw m3, m1 - psubw m3, m4 - - movh [r2 + r3], m3 - pshufd m3, m3, 2 - movd [r2 + r3 + 8], m3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V_PS_W6 6, 8 - FILTER_V_PS_W6 6, 16 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W12 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 
46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m4, [r5 + 1 * mmsize] + paddw m1, m4 +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + packuswb m5, m2 + packuswb m1, m1 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d + mova m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + vextracti128 xm4, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm4 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 %endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - 
paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movh [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m2, [r0 + 2 * r1] - - punpcklbw m5, m7, m2 - punpckhbw m7, m2 - - pmaddubsw m5, m0 - pmaddubsw m7, m0 - - paddw m4, m5 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movh [r2 + r3 + 16], m3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop RET %endmacro - FILTER_V_PS_W12 12, 16 - FILTER_V_PS_W12 12, 32 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W16 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 + FILTER_VER_CHROMA_AVX2_8x6 pp + FILTER_VER_CHROMA_AVX2_8x6 ps - pmaddubsw m2, m0 - pmaddubsw m7, m0 - - paddw m4, m2 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 +%macro 
PROCESS_CHROMA_AVX2_W8_16R 1 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm5, m1, 1 + vextracti128 xm3, m4, 
1 + movu [r2], xm1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm3 +%endif + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r0, [r0 + r1 * 4] + movq xm5, [r0] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r0 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] + movq xm3, [r0 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, [r5 + 1 * mmsize] + paddw m5, m3 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 18 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 + lea r2, [r2 + r3 * 4] +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r2], xm0 + movq [r2 + r3], xm6 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm6 + lea r2, [r2 + r3 * 4] + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m5, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm5 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif +%endmacro - 
movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 +%macro FILTER_VER_CHROMA_AVX2_8x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 - lea r2, [r2 + 2 * r3] +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif - dec r4d - jnz .loop + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + PROCESS_CHROMA_AVX2_W8_16R %1 RET %endmacro - FILTER_V_PS_W16 16, 4 - FILTER_V_PS_W16 16, 8 - FILTER_V_PS_W16 16, 12 - FILTER_V_PS_W16 16, 16 - FILTER_V_PS_W16 16, 32 - - FILTER_V_PS_W16 16, 24 - FILTER_V_PS_W16 16, 64 - -;-------------------------------------------------------------------------------------------------------------- -;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V4_PS_W24 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 + FILTER_VER_CHROMA_AVX2_8x16 pp + FILTER_VER_CHROMA_AVX2_8x16 ps - mov r4d, r4m - sub r0, r1 - add r3d, r3d +%macro FILTER_VER_CHROMA_AVX2_8x12 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - - movu m5, [r5] - movu m7, [r5 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu 
[r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m2, [r5 + 2 * r1] - - punpcklbw m5, m7, m2 - punpckhbw m7, m2 - - pmaddubsw m5, m0 - pmaddubsw m7, m0 - - paddw m4, m5 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 - - movq m2, [r0 + 16] - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - - punpcklbw m2, m3 - punpcklbw m7, m4, m5 - - pmaddubsw m2, m1 - pmaddubsw m7, m0 - - paddw m2, m7 - psubw m2, m6 - - movu [r2 + 32], m2 - - movq m2, [r5 + 2 * r1 + 16] - - punpcklbw m3, m4 - punpcklbw m5, m2 - - pmaddubsw m3, m1 - pmaddubsw m5, m0 + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1, pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] +%ifidn %1, pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, 
m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm5, m1, 1 + vextracti128 xm3, m4, 1 + movu [r2], xm1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm3 +%endif + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r0, [r0 + r1 * 4] + movq xm5, [r0] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r0 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m6, m3 + lea r2, [r2 + r3 * 4] +%ifidn %1, pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + packuswb m0, m6 + vextracti128 xm6, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm6 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm6 +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm3 +%endif + RET +%endmacro - paddw m3, m5 - psubw m3, m6 + FILTER_VER_CHROMA_AVX2_8x12 pp + FILTER_VER_CHROMA_AVX2_8x12 ps - movu [r2 + r3 + 32], m3 +%macro 
FILTER_VER_CHROMA_AVX2_8xN 2 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 - mov r0, r5 - lea r2, [r2 + 2 * r3] +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif - dec r4d - jnz .loop + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + PROCESS_CHROMA_AVX2_W8_16R %1 + lea r2, [r2 + r3 * 4] +%endrep RET %endmacro - FILTER_V4_PS_W24 24, 32 - - FILTER_V4_PS_W24 24, 64 + FILTER_VER_CHROMA_AVX2_8xN pp, 32 + FILTER_VER_CHROMA_AVX2_8xN ps, 32 + FILTER_VER_CHROMA_AVX2_8xN pp, 64 + FILTER_VER_CHROMA_AVX2_8xN ps, 64 -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 +%macro PROCESS_CHROMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m0, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m4, m2, 
[r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 +%endmacro - mov r4d, r4m - sub r0, r1 - add r3d, r3d +%macro FILTER_VER_CHROMA_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mova m7, [pw_2000] - - mov r4d, %2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m3, [r5] - movu m5, [r5 + r1] - - punpcklbw m6, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m6, m0 - pmaddubsw m3, m0 - - paddw m4, m6 - paddw m2, m3 - - psubw m4, m7 - psubw m2, m7 - - movu [r2], m4 - movu [r2 + 16], m2 - - movu m2, [r0 + 16] - movu m3, [r0 + r1 + 16] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_4R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m0, m3 ; m0 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m0, m3 ; m0 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + vextracti128 xm1, m0, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + 
movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm4 +%endif + RET +%endmacro - pmaddubsw m4, m1 - pmaddubsw m2, m1 + FILTER_VER_CHROMA_AVX2_8x4 pp + FILTER_VER_CHROMA_AVX2_8x4 ps - movu m3, [r5 + 16] - movu m5, [r5 + r1 + 16] +%macro FILTER_VER_CHROMA_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 + mov r4d, r4m + shl r4d, 6 - punpcklbw m6, m3, m5 - punpckhbw m3, m5 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif - pmaddubsw m6, m0 - pmaddubsw m3, m0 + lea r4, [r1 * 3] + sub r0, r1 - paddw m4, m6 - paddw m2, m3 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m1, [r5] + movq xm2, [r0 + r4] ; m2 = row 3 + punpcklbw xm3, xm2 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + movq xm0, [r0 + r1 * 4] ; m0 = row 4 + punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 +%ifidn %1,pp + pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 + packuswb m1, m1 + vextracti128 xm0, m1, 1 + movq [r2], xm1 + movq [r2 + r3], xm0 +%else + add r3d, r3d + psubw m1, [pw_2000] ; m1 = word: row 0, row 1 + vextracti128 xm0, m1, 1 + movu [r2], xm1 + movu [r2 + r3], xm0 +%endif + RET +%endmacro - psubw m4, m7 - psubw m2, m7 + FILTER_VER_CHROMA_AVX2_8x2 pp + FILTER_VER_CHROMA_AVX2_8x2 ps - movu [r2 + 32], m4 - movu [r2 + 48], m2 +%macro FILTER_VER_CHROMA_AVX2_6x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 - 
lea r0, [r0 + r1] - lea r2, [r2 + r3] +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif - dec r4d - jnz .loop + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movd [r2], xm5 + pextrw [r2 + 4], xm5, 2 + movd [r2 + r3], xm2 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm5, 2 + pextrw [r2 + r3 * 2 + 4], xm5, 6 + pextrd [r2 + r4], xm2, 2 + pextrw [r2 + r4 + 4], xm2, 6 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrw [r2 + 4], xm1, 2 + movd [r2 + r3], xm4 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r3 * 2 + 4], xm1, 6 + pextrd [r2 + r4], xm4, 2 + pextrw [r2 + r4 + 4], xm4, 6 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movq [r2], xm5 + pextrd [r2 + 8], xm5, 2 + movq [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm6, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r4], xm3 + pextrd [r2 + r4 + 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + pextrd [r2 + 8], xm1, 2 + movq [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm4 + pextrd [r2 + r3 * 2 + 8], xm4, 2 + vextracti128 xm4, m4, 1 + movq [r2 + r4], xm4 + pextrd [r2 + r4 + 8], xm4, 2 +%endif RET %endmacro - FILTER_V_PS_W32 32, 8 - FILTER_V_PS_W32 32, 16 - FILTER_V_PS_W32 32, 24 - FILTER_V_PS_W32 32, 32 - - FILTER_V_PS_W32 32, 48 - FILTER_V_PS_W32 32, 64 + FILTER_VER_CHROMA_AVX2_6x8 
pp + FILTER_VER_CHROMA_AVX2_6x8 ps ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W8_H8_H16_H32 2 +%macro FILTER_V4_W6_H4 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 +cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -7839,9 +5906,9 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_512] - lea r5, [r1 * 3] mov r4d, %2 + lea r5, [3 * r1] .loop: movq m0, [r0] @@ -7860,11 +5927,12 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 pmulhrsw m0, m4 packuswb m0, m0 - movh [r2], m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 lea r0, [r0 + 4 * r1] - movq m0, [r0] + movq m0, [r0] punpcklbw m3, m0 pmaddubsw m1, m6 @@ -7874,21 +5942,25 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 pmulhrsw m1, m4 packuswb m1, m1 - movh [r2 + r3], m1 + movd [r2 + r3], m1 + pextrw [r2 + r3 + 4], m1, 2 movq m1, [r0 + r1] - - punpcklbw m0, m1 + punpcklbw m7, m0, m1 pmaddubsw m2, m6 - pmaddubsw m0, m5 + pmaddubsw m7, m5 - paddw m2, m0 + paddw m2, m7 pmulhrsw m2, m4 + packuswb m2, m2 + lea r2, [r2 + 2 * r3] + movd [r2], m2 + pextrw [r2 + 4], m2, 2 - movq m7, [r0 + 2 * r1] - punpcklbw m1, m7 + movq m2, [r0 + 2 * r1] + punpcklbw m1, m2 pmaddubsw m3, m6 pmaddubsw m1, m5 @@ -7896,11 +5968,10 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 paddw m3, m1 pmulhrsw m3, m4 - packuswb m2, m3 + packuswb m3, m3 - lea r2, [r2 + 2 * r3] - movh [r2], m2 - movhps [r2 + r3], m2 + movd [r2 + r3], m3 + pextrw [r2 + r3 + 4], m3, 2 lea r2, [r2 + 2 * r3] @@ -7909,59 +5980,197 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 RET %endmacro - FILTER_V4_W8_H8_H16_H32 8, 8 - FILTER_V4_W8_H8_H16_H32 8, 16 
- FILTER_V4_W8_H8_H16_H32 8, 32 + FILTER_V4_W6_H4 6, 8 - FILTER_V4_W8_H8_H16_H32 8, 12 - FILTER_V4_W8_H8_H16_H32 8, 64 + FILTER_V4_W6_H4 6, 16 -%macro PROCESS_CHROMA_AVX2_W8_8R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - 
punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m0, [r5 + 1 * mmsize] - paddw m4, m0 +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W12_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_512] + + pmulhrsw m4, m6 + pmulhrsw m2, m6 + + packuswb m4, m2 + + movh [r2], m4 + pextrd [r2 + 8], m4, 2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + pmulhrsw m4, m6 + pmulhrsw m3, m6 + + packuswb m4, m3 + + movh [r2 + r3], m4 + pextrd [r2 + r3 + 8], m4, 2 + + lea r2, [r2 + 2 * r3] + + sub r4, 2 + jnz .loop + RET %endmacro -%macro FILTER_VER_CHROMA_AVX2_8x8 1 + FILTER_V4_W12_H2 12, 16 + + FILTER_V4_W12_H2 12, 32 + +;----------------------------------------------------------------------------- +; void 
interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m6, [r0 + r1] + + punpckhbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m2, m7 + + punpcklbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m4, m7 + + mova m7, [pw_512] + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2], m4 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m6, m5 + punpckhbw m6, m5 + + pmaddubsw m2, m0 + pmaddubsw m6, m0 + + paddw m4, m2 + paddw m3, m6 + + pmulhrsw m4, m7 + pmulhrsw m3, m7 + + packuswb m4, m3 + + movu [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V4_W16_H2 16, 4 + FILTER_V4_W16_H2 16, 8 + FILTER_V4_W16_H2 16, 12 + FILTER_V4_W16_H2 16, 16 + FILTER_V4_W16_H2 16, 32 + + FILTER_V4_W16_H2 16, 24 + FILTER_V4_W16_H2 16, 64 + +%macro FILTER_VER_CHROMA_AVX2_16x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 mov r4d, r4m shl r4d, 6 @@ -7972,60 +6181,255 @@ cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif + mova m12, [r5] + mova m13, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 - PROCESS_CHROMA_AVX2_W8_8R %ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 
- pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r4], xm4 + mova m14, [pw_512] %else add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 + vbroadcasti128 m14, [pw_2000] +%endif + lea r5, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m12 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m12 + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, m13 + paddw m4, m8 + pmaddubsw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + 
vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, m13 + paddw m5, m9 + pmaddubsw m7, m12 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, m13 + paddw m6, m10 + pmaddubsw m8, m12 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, m13 + paddw m7, m11 + pmaddubsw m9, m12 + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 + movu [r2 + r5], xm3 lea r2, [r2 + r3 * 4] - movu [r2], xm1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r4], xm4 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r5], xm7 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 + movu [r2 + r3 * 2], m6 + movu [r2 + r5], m7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw 
xm6, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm6, 1 + pmaddubsw m6, m10, m13 + paddw m8, m6 + pmaddubsw m10, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm7, xm11, xm6 + punpcklbw xm11, xm6 + vinserti128 m11, m11, xm7, 1 + pmaddubsw m7, m11, m13 + paddw m9, m7 + pmaddubsw m11, m12 + + movu xm7, [r0 + r1] ; m7 = row 13 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m10, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m11, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, m13 + paddw m6, m2 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m13 + paddw m7, m3 + pmaddubsw m1, m12 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, m13 + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m3, m13 + paddw m1, m3 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m6, m14 ; m6 = word: row 12 + pmulhrsw m7, m14 ; m7 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m6, m7 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m6, m6, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + 
movu [r2 + r3 * 2], xm10 + movu [r2 + r5], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m6, m14 ; m6 = word: row 12 + psubw m7, m14 ; m7 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r5], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 %endif RET +%endif %endmacro - FILTER_VER_CHROMA_AVX2_8x8 pp - FILTER_VER_CHROMA_AVX2_8x8 ps - -%macro FILTER_VER_CHROMA_AVX2_8x6 1 + FILTER_VER_CHROMA_AVX2_16x16 pp + FILTER_VER_CHROMA_AVX2_16x16 ps +%macro FILTER_VER_CHROMA_AVX2_16x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 +cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 mov r4d, r4m shl r4d, 6 @@ -8038,264 +6442,151 @@ cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 lea r4, [r1 * 3] sub r0, r1 +%ifidn %1,pp + mova m6, [pw_512] +%else + add r3d, r3d + mova m6, [pw_2000] +%endif + lea r6, [r3 * 3] - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 
21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m4, [r5 + 1 * mmsize] - paddw m1, m4 + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] %ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - packuswb m5, m2 - packuswb m1, m1 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq 
[r2 + r3], xm4 + pmulhrsw m0, m6 ; m0 = word: row 0 + pmulhrsw m1, m6 ; m1 = word: row 1 + packuswb m0, m1 + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 %else - add r3d, r3d - mova m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - vextracti128 xm4, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm4 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm1 - movu [r2 + r3], xm0 + psubw m0, m6 ; m0 = word: row 0 + psubw m1, m6 ; m1 = word: row 1 + movu [r2], m0 + movu [r2 + r3], m1 %endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_8x6 pp - FILTER_VER_CHROMA_AVX2_8x6 ps -%macro PROCESS_CHROMA_AVX2_W8_16R 1 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 + movu xm0, [r0 + r1] ; m0 = row 5 + punpckhbw xm1, xm4, xm0 + punpcklbw xm4, xm0 + vinserti128 m4, m4, xm1, 1 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m2, m1 pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, 
xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - paddw m4, m3 + movu xm1, [r0 + r1 * 2] ; m1 = row 6 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + mmsize] + paddw m3, m5 pmaddubsw m0, [r5] %ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r6], xm4 -%else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 + pmulhrsw m2, m6 ; m2 = word: row 2 + pmulhrsw m3, m6 ; m3 = word: row 3 + packuswb m2, m3 + vpermq m2, m2, 11011000b vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - vextracti128 xm5, m1, 1 - vextracti128 xm3, m4, 1 - movu [r2], xm1 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm3 +%else + psubw m2, m6 ; m2 = word: row 2 + psubw m3, m6 ; m3 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 %endif - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 + + movu xm2, [r0 + r4] ; m2 = row 7 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m1, [r5] lea r0, [r0 + r1 * 4] - movq xm5, [r0] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r0 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, 
xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] - movq xm3, [r0 + r4] ; m3 = row 15 + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm5, xm2, xm3 punpcklbw xm2, xm3 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 16 - punpcklbw xm3, xm1 - vinserti128 m2, m2, xm3, 1 - pmaddubsw m3, m2, [r5 + 1 * mmsize] - paddw m5, m3 + vinserti128 m2, m2, xm5, 1 + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 17 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 18 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 lea r2, [r2 + r3 * 4] %ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - pmulhrsw m5, m7 ; m5 = word: row 12, row 13 - pmulhrsw m2, m7 ; m2 = word: row 14, row 15 - packuswb m0, m6 - packuswb m5, m2 - vextracti128 xm6, m0, 1 - vextracti128 xm2, m5, 1 - movq [r2], xm0 - movq [r2 + r3], xm6 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm6 - lea r2, [r2 + r3 * 4] - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 -%else - psubw m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - psubw m5, m7 ; m5 = word: row 12, row 13 - psubw m2, m7 ; m2 = word: row 14, row 15 - vextracti128 xm1, m0, 1 - vextracti128 xm3, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - vextracti128 xm1, m5, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm5 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_AVX2_8x16 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + pmulhrsw m4, m6 ; m4 = word: row 4 + pmulhrsw m0, m6 ; m0 = word: row 5 + packuswb m4, m0 + vpermq m4, m4, 11011000b + vextracti128 
xm0, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm0 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + psubw m4, m6 ; m4 = word: row 4 + psubw m0, m6 ; m0 = word: row 5 + movu [r2], m4 + movu [r2 + r3], m0 %endif - lea r4, [r1 * 3] - sub r0, r1 + movu xm5, [r0 + r1] ; m5 = row 9 + punpckhbw xm4, xm3, xm5 + punpcklbw xm3, xm5 + vinserti128 m3, m3, xm4, 1 + pmaddubsw m3, [r5 + mmsize] + paddw m1, m3 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhbw xm0, xm5, xm4 + punpcklbw xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m5, [r5 + mmsize] + paddw m2, m5 %ifidn %1,pp - mova m7, [pw_512] + pmulhrsw m1, m6 ; m1 = word: row 6 + pmulhrsw m2, m6 ; m2 = word: row 7 + packuswb m1, m2 + vpermq m1, m1, 11011000b + vextracti128 xm2, m1, 1 + movu [r2 + r3 * 2], xm1 + movu [r2 + r6], xm2 %else - add r3d, r3d - mova m7, [pw_2000] + psubw m1, m6 ; m1 = word: row 6 + psubw m2, m6 ; m2 = word: row 7 + movu [r2 + r3 * 2], m1 + movu [r2 + r6], m2 %endif - lea r6, [r3 * 3] - PROCESS_CHROMA_AVX2_W8_16R %1 RET %endmacro - FILTER_VER_CHROMA_AVX2_8x16 pp - FILTER_VER_CHROMA_AVX2_8x16 ps + FILTER_VER_CHROMA_AVX2_16x8 pp + FILTER_VER_CHROMA_AVX2_16x8 ps -%macro FILTER_VER_CHROMA_AVX2_8x12 1 +%macro FILTER_VER_CHROMA_AVX2_16x12 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 mov r4d, r4m shl r4d, 6 @@ -8306,293 +6597,224 @@ cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif + mova m8, [r5] + mova m9, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 -%ifidn %1, pp +%ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d - mova m7, [pw_2000] + vbroadcasti128 m7, [pw_2000] %endif - lea r6, [r3 * 3] - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - 
punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 1 * mmsize] + lea r5, [r3 * 3] + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, m8 + pmaddubsw m3, m2, m9 paddw m4, m3 - pmaddubsw m0, [r5] -%ifidn %1, pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r6], xm4 -%else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - vextracti128 xm5, m1, 1 - vextracti128 xm3, m4, 1 - movu [r2], xm1 - movu [r2 + r3], xm5 - movu [r2 + 
r3 * 2], xm4 - movu [r2 + r6], xm3 -%endif - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 + pmaddubsw m2, m8 + + vextracti128 xm0, m0, 1 lea r0, [r0 + r1 * 4] - movq xm5, [r0] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r0 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] + vinserti128 m0, m0, [r0], 1 + + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, m8 + pmaddubsw m3, m5, m9 paddw m6, m3 - lea r2, [r2 + r3 * 4] -%ifidn %1, pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - packuswb m0, m6 - vextracti128 xm6, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm6 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm6 + pmaddubsw m5, m8 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 %else - psubw m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - vextracti128 xm1, m0, 1 - vextracti128 xm3, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm3 + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 %endif - RET -%endmacro - FILTER_VER_CHROMA_AVX2_8x12 pp - FILTER_VER_CHROMA_AVX2_8x12 ps + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 -%macro FILTER_VER_CHROMA_AVX2_8xN 2 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, m9 + paddw m5, m1 + 
pmaddubsw m0, m8 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,pp - mova m7, [pw_512] + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm5 %else - add r3d, r3d - mova m7, [pw_2000] + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m5 %endif - lea r6, [r3 * 3] -%rep %2 / 16 - PROCESS_CHROMA_AVX2_W8_16R %1 lea r2, [r2 + r3 * 4] -%endrep - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_8xN pp, 32 - FILTER_VER_CHROMA_AVX2_8xN ps, 32 - FILTER_VER_CHROMA_AVX2_8xN pp, 64 - FILTER_VER_CHROMA_AVX2_8xN ps, 64 -%macro PROCESS_CHROMA_AVX2_W8_4R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m0, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + movu xm1, [r0 + r4] lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 
64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 -%endmacro + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 -%macro FILTER_VER_CHROMA_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 - mov r4d, r4m - shl r4d, 6 + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, m9 + paddw m6, m5 + pmaddubsw m3, m8 + pmaddubsw m5, m2, m9 + paddw m0, m5 + pmaddubsw m2, m8 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 %endif - lea r4, [r1 * 3] - sub r0, r1 - PROCESS_CHROMA_AVX2_W8_4R + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, m9 + paddw m2, m4 + pmaddubsw m0, m8 + pmaddubsw m4, m5, m9 + paddw m3, m4 + pmaddubsw m5, m8 + %ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m0, m3 ; m0 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - packuswb m0, m2 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r5], xm2 %else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m0, m3 ; 
m0 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - vextracti128 xm1, m0, 1 - vextracti128 xm4, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm4 + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r5], m2 %endif - RET -%endmacro + lea r2, [r2 + r3 * 4] - FILTER_VER_CHROMA_AVX2_8x4 pp - FILTER_VER_CHROMA_AVX2_8x4 ps + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 -%macro FILTER_VER_CHROMA_AVX2_8x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 - mov r4d, r4m - shl r4d, 6 + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m5, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m0, m1 + pmaddubsw m2, m8 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 %endif - lea r4, [r1 * 3] - sub r0, r1 + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m1, m0, m9 + paddw m4, m1 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 
03 12 02 11 01 10 00] - pmaddubsw m1, [r5] - movq xm2, [r0 + r4] ; m2 = row 3 - punpcklbw xm3, xm2 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - movq xm0, [r0 + r1 * 4] ; m0 = row 4 - punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 %ifidn %1,pp - pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 - packuswb m1, m1 - vextracti128 xm0, m1, 1 - movq [r2], xm1 - movq [r2 + r3], xm0 + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r5], xm2 %else - add r3d, r3d - psubw m1, [pw_2000] ; m1 = word: row 0, row 1 - vextracti128 xm0, m1, 1 - movu [r2], xm1 - movu [r2 + r3], xm0 + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r5], m2 %endif RET +%endif %endmacro - FILTER_VER_CHROMA_AVX2_8x2 pp - FILTER_VER_CHROMA_AVX2_8x2 ps + FILTER_VER_CHROMA_AVX2_16x12 pp + FILTER_VER_CHROMA_AVX2_16x12 ps -%macro FILTER_VER_CHROMA_AVX2_6x8 1 +%macro FILTER_VER_CHROMA_AVX2_16xN 2 +%if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 +cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 mov r4d, r4m shl r4d, 6 @@ -8605,377 +6827,310 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 lea r4, [r1 * 3] sub r0, r1 - PROCESS_CHROMA_AVX2_W8_8R %ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movd [r2], xm5 - pextrw [r2 + 4], xm5, 2 - movd [r2 + r3], xm2 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm5, 2 
- pextrw [r2 + r3 * 2 + 4], xm5, 6 - pextrd [r2 + r4], xm2, 2 - pextrw [r2 + r4 + 4], xm2, 6 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrw [r2 + 4], xm1, 2 - movd [r2 + r3], xm4 - pextrw [r2 + r3 + 4], xm4, 2 - pextrd [r2 + r3 * 2], xm1, 2 - pextrw [r2 + r3 * 2 + 4], xm1, 6 - pextrd [r2 + r4], xm4, 2 - pextrw [r2 + r4 + 4], xm4, 6 + mova m7, [pw_512] %else add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movq [r2], xm5 - pextrd [r2 + 8], xm5, 2 - movq [r2 + r3], xm6 - pextrd [r2 + r3 + 8], xm6, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 - movq [r2 + r4], xm3 - pextrd [r2 + r4 + 8], xm3, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - pextrd [r2 + 8], xm1, 2 - movq [r2 + r3], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movq [r2 + r3 * 2], xm4 - pextrd [r2 + r3 * 2 + 8], xm4, 2 - vextracti128 xm4, m4, 1 - movq [r2 + r4], xm4 - pextrd [r2 + r4 + 8], xm4, 2 + mova m7, [pw_2000] %endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_6x8 pp - FILTER_VER_CHROMA_AVX2_6x8 ps + lea r6, [r3 * 3] + mov r7d, %2 / 16 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m2, [r5] - mov r4d, r4m - sub r0, r1 + 
vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 %else - movd m5, [tab_ChromaCoeff + r4 * 4] + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 %endif - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_512] - - mov r4d, %2 - lea r5, [3 * r1] - -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m5 +%endif + lea r2, [r2 + r3 * 4] - paddw m0, m7 + movu xm1, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 - pmulhrsw m0, m4 - packuswb m0, m0 - movd [r2], m0 - pextrw [r2 + 4], m0, 2 + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + 
vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m6, m5 + pmaddubsw m3, [r5] + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 + pmaddubsw m2, [r5] - lea r0, [r0 + 4 * r1] +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 +%endif - movq m0, [r0] - punpcklbw m3, m0 + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, [r5 + mmsize] + paddw m2, m4 + pmaddubsw m0, [r5] + pmaddubsw m4, m5, [r5 + mmsize] + paddw m3, m4 + pmaddubsw m5, [r5] - paddw m1, m7 +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r6], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] - pmulhrsw m1, m4 - packuswb m1, m1 - movd [r2 + r3], m1 - pextrw [r2 + r3 + 4], m1, 2 + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 - movq m1, [r0 + r1] - punpcklbw m7, m0, m1 + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m2, [r5] - pmaddubsw m2, m6 - pmaddubsw m7, m5 +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, 
m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 +%endif - paddw m2, m7 + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 - pmulhrsw m2, m4 - packuswb m2, m2 - lea r2, [r2 + 2 * r3] - movd [r2], m2 - pextrw [r2 + 4], m2, 2 - - movq m2, [r0 + 2 * r1] - punpcklbw m1, m2 - - pmaddubsw m3, m6 - pmaddubsw m1, m5 - - paddw m3, m1 - - pmulhrsw m3, m4 - packuswb m3, m3 - - movd [r2 + r3], m3 - pextrw [r2 + r3 + 4], m3, 2 - - lea r2, [r2 + 2 * r3] - - sub r4, 4 - jnz .loop - RET -%endmacro - - FILTER_V4_W6_H4 6, 8 - - FILTER_V4_W6_H4 6, 16 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W12_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + pmaddubsw m1, m0, [r5 + mmsize] + paddw m4, m1 + pmaddubsw m0, [r5] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r6], m2 %endif + lea r2, [r2 + r3 * 4] - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - 
pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_512] - - pmulhrsw m4, m6 - pmulhrsw m2, m6 - - packuswb m4, m2 - - movh [r2], m4 - pextrd [r2 + 8], m4, 2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - pmaddubsw m2, m0 - pmaddubsw m7, m0 - - paddw m4, m2 - paddw m3, m7 + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m5, m5, xm3, 1 - pmulhrsw m4, m6 - pmulhrsw m3, m6 + punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m6, m1 + pmaddubsw m2, [r5] - packuswb m4, m3 +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm6 +%else + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r2], m0 + movu [r2 + r3], m6 +%endif - movh [r2 + r3], m4 - pextrd [r2 + r3 + 8], m4, 2 + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 - lea r2, [r2 + 2 * r3] + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 + pmaddubsw m0, [r5 + mmsize] + paddw m4, m0 - sub r4, 2 - jnz .loop +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 14 + pmulhrsw m2, m7 ; m2 = word: row 15 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 14 + psubw m2, m7 ; m2 = word: row 15 + movu [r2 + r3 * 2], m4 + 
movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] + dec r7d + jnz .loopH RET +%endif %endmacro - FILTER_V4_W12_H2 12, 16 - - FILTER_V4_W12_H2 12, 32 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W16_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 + FILTER_VER_CHROMA_AVX2_16xN pp, 32 + FILTER_VER_CHROMA_AVX2_16xN ps, 32 + FILTER_VER_CHROMA_AVX2_16xN pp, 64 + FILTER_VER_CHROMA_AVX2_16xN ps, 64 - mov r4d, r4m - sub r0, r1 +%macro FILTER_VER_CHROMA_AVX2_16x24 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 + mov r4d, r4m + shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m6, [r0 + r1] - - punpckhbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m2, m7 - - punpcklbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m4, m7 - - mova m7, [pw_512] - - pmulhrsw m4, m7 - pmulhrsw m2, m7 - - packuswb m4, m2 - - movu [r2], m4 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m6, m5 - punpckhbw m6, m5 - - pmaddubsw m2, m0 - pmaddubsw m6, m0 - - paddw m4, m2 - paddw m3, m6 - - pmulhrsw m4, m7 - pmulhrsw m3, m7 - - packuswb m4, m3 - - movu [r2 + r3], m4 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V4_W16_H2 16, 4 - FILTER_V4_W16_H2 16, 8 - FILTER_V4_W16_H2 16, 12 - FILTER_V4_W16_H2 16, 16 - 
FILTER_V4_W16_H2 16, 32 - - FILTER_V4_W16_H2 16, 24 - FILTER_V4_W16_H2 16, 64 - -%macro FILTER_VER_CHROMA_AVX2_16x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - mova m12, [r5] - mova m13, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r5, [r3 * 3] + mova m12, [r5] + mova m13, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r5, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 @@ -9060,10 +7215,10 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + vpermq m6, m6, q3120 vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 @@ -9147,14 +7302,16 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m2, m13 - paddw m0, m2 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m3, m13 - paddw m1, m3 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 @@ -9169,10 +7326,10 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 packuswb m10, m11 packuswb m6, m7 packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m6, m6, 11011000b - vpermq m0, m0, 11011000b + vpermq m8, m8, q3120 + vpermq m10, m10, q3120 + vpermq m6, m6, q3120 + vpermq m0, m0, q3120 vextracti128 xm9, 
m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm7, m6, 1 @@ -9204,16 +7361,126 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 movu [r2 + r3], m7 movu [r2 + r3 * 2], m0 movu [r2 + r5], m1 +%endif + lea r2, [r2 + r3 * 4] + + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m4, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 22 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m5, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 23 + punpckhbw xm8, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm8, 1 + pmaddubsw m8, m0, m13 + paddw m6, m8 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 24 + punpckhbw xm9, xm1, xm8 + punpcklbw xm1, xm8 + vinserti128 m1, m1, xm9, 1 + pmaddubsw m9, m1, m13 + paddw m7, m9 + pmaddubsw m1, m12 + movu xm9, [r0 + r1] ; m9 = row 25 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m8, m13 + paddw m0, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 26 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m9, m13 + paddw m1, m9 + +%ifidn %1,pp + pmulhrsw m2, m14 ; m2 = word: row 16 + pmulhrsw m3, m14 ; m3 = word: row 17 + pmulhrsw m4, m14 ; m4 = word: row 18 + pmulhrsw m5, m14 ; m5 = word: row 19 + pmulhrsw m6, m14 ; m6 = word: row 20 + pmulhrsw m7, m14 ; m7 = word: row 21 + pmulhrsw m0, m14 ; m0 = word: row 22 + pmulhrsw m1, m14 ; m1 = word: row 23 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + packuswb m0, m1 + 
vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + vpermq m6, m6, q3120 + vpermq m0, m0, q3120 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm2 + movu [r2 + r3], xm3 + movu [r2 + r3 * 2], xm4 + movu [r2 + r5], xm5 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 +%else + psubw m2, m14 ; m2 = word: row 16 + psubw m3, m14 ; m3 = word: row 17 + psubw m4, m14 ; m4 = word: row 18 + psubw m5, m14 ; m5 = word: row 19 + psubw m6, m14 ; m6 = word: row 20 + psubw m7, m14 ; m7 = word: row 21 + psubw m0, m14 ; m0 = word: row 22 + psubw m1, m14 ; m1 = word: row 23 + movu [r2], m2 + movu [r2 + r3], m3 + movu [r2 + r3 * 2], m4 + movu [r2 + r5], m5 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 %endif RET %endif %endmacro - FILTER_VER_CHROMA_AVX2_16x16 pp - FILTER_VER_CHROMA_AVX2_16x16 ps -%macro FILTER_VER_CHROMA_AVX2_16x8 1 + FILTER_VER_CHROMA_AVX2_16x24 pp + FILTER_VER_CHROMA_AVX2_16x24 ps + +%macro FILTER_VER_CHROMA_AVX2_24x32 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 mov r4d, r4m shl r4d, 6 @@ -9224,179 +7491,23 @@ cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif + mova m8, [r5] + mova m9, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp - mova m6, [pw_512] + mova m7, [pw_512] %else add r3d, r3d - mova m6, [pw_2000] + vbroadcasti128 m7, [pw_2000] %endif lea r6, [r3 * 3] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, 
xm4, 1 - pmaddubsw m4, m2, [r5 + mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] -%ifidn %1,pp - pmulhrsw m0, m6 ; m0 = word: row 0 - pmulhrsw m1, m6 ; m1 = word: row 1 - packuswb m0, m1 - vpermq m0, m0, 11011000b - vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 -%else - psubw m0, m6 ; m0 = word: row 0 - psubw m1, m6 ; m1 = word: row 1 - movu [r2], m0 - movu [r2 + r3], m1 -%endif - - movu xm0, [r0 + r1] ; m0 = row 5 - punpckhbw xm1, xm4, xm0 - punpcklbw xm4, xm0 - vinserti128 m4, m4, xm1, 1 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m2, m1 - pmaddubsw m4, [r5] - movu xm1, [r0 + r1 * 2] ; m1 = row 6 - punpckhbw xm5, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm5, 1 - pmaddubsw m5, m0, [r5 + mmsize] - paddw m3, m5 - pmaddubsw m0, [r5] -%ifidn %1,pp - pmulhrsw m2, m6 ; m2 = word: row 2 - pmulhrsw m3, m6 ; m3 = word: row 3 - packuswb m2, m3 - vpermq m2, m2, 11011000b - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%else - psubw m2, m6 ; m2 = word: row 2 - psubw m3, m6 ; m3 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 -%endif - - movu xm2, [r0 + r4] ; m2 = row 7 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m1, [r5] - lea r0, [r0 + r1 * 4] - movu xm3, [r0] ; m3 = row 8 - punpckhbw xm5, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm5, 1 - pmaddubsw m5, m2, [r5 + mmsize] - paddw m0, m5 - pmaddubsw m2, [r5] - lea r2, [r2 + r3 * 4] -%ifidn %1,pp - pmulhrsw m4, m6 ; m4 = word: row 4 - pmulhrsw m0, m6 ; m0 = word: row 5 - packuswb m4, m0 - vpermq m4, m4, 11011000b - vextracti128 xm0, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm0 -%else - psubw m4, m6 ; m4 = word: row 4 - psubw m0, m6 ; m0 = word: row 5 - movu [r2], m4 - 
movu [r2 + r3], m0 -%endif - - movu xm5, [r0 + r1] ; m5 = row 9 - punpckhbw xm4, xm3, xm5 - punpcklbw xm3, xm5 - vinserti128 m3, m3, xm4, 1 - pmaddubsw m3, [r5 + mmsize] - paddw m1, m3 - movu xm4, [r0 + r1 * 2] ; m4 = row 10 - punpckhbw xm0, xm5, xm4 - punpcklbw xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddubsw m5, [r5 + mmsize] - paddw m2, m5 -%ifidn %1,pp - pmulhrsw m1, m6 ; m1 = word: row 6 - pmulhrsw m2, m6 ; m2 = word: row 7 - packuswb m1, m2 - vpermq m1, m1, 11011000b - vextracti128 xm2, m1, 1 - movu [r2 + r3 * 2], xm1 - movu [r2 + r6], xm2 -%else - psubw m1, m6 ; m1 = word: row 6 - psubw m2, m6 ; m2 = word: row 7 - movu [r2 + r3 * 2], m1 - movu [r2 + r6], m2 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_16x8 pp - FILTER_VER_CHROMA_AVX2_16x8 ps - -%macro FILTER_VER_CHROMA_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - mova m8, [r5] - mova m9, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - lea r5, [r3 * 3] - - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 + mov r5d, 2 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 @@ -9408,8 +7519,8 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 pmaddubsw m2, m8 vextracti128 xm0, m0, 1 - lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 + lea r7, [r0 + r1 * 4] + vinserti128 m0, m0, [r7], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 @@ -9434,8 +7545,8 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 movu [r2 + r3], m6 %endif - movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 + movu xm4, [r7 + r1 * 2] + vinserti128 m4, m4, [r7 + r1], 1 vextracti128 
xm1, m4, 1 vinserti128 m0, m0, xm1, 0 @@ -9457,18 +7568,18 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm5 + movu [r2 + r6], xm5 %else psubw m2, m7 ; m2 = word: row 2 psubw m5, m7 ; m5 = word: row 3 movu [r2 + r3 * 2], m2 - movu [r2 + r5], m5 + movu [r2 + r6], m5 %endif - lea r2, [r2 + r3 * 4] + lea r8, [r2 + r3 * 4] - movu xm1, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m1, m1, [r0], 1 + movu xm1, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m1, m1, [r7], 1 vinserti128 m4, m4, xm1, 1 punpcklbw m2, m4, m1 @@ -9488,17 +7599,17 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 packuswb m6, m0 vpermq m6, m6, 11011000b vextracti128 xm0, m6, 1 - movu [r2], xm6 - movu [r2 + r3], xm0 + movu [r8], xm6 + movu [r8 + r3], xm0 %else psubw m6, m7 ; m6 = word: row 4 psubw m0, m7 ; m0 = word: row 5 - movu [r2], m6 - movu [r2 + r3], m0 + movu [r8], m6 + movu [r8 + r3], m0 %endif - movu xm6, [r0 + r1 * 2] - vinserti128 m6, m6, [r0 + r1], 1 + movu xm6, [r7 + r1 * 2] + vinserti128 m6, m6, [r7 + r1], 1 vextracti128 xm0, m6, 1 vinserti128 m1, m1, xm0, 0 @@ -9519,19 +7630,19 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 packuswb m3, m2 vpermq m3, m3, 11011000b vextracti128 xm2, m3, 1 - movu [r2 + r3 * 2], xm3 - movu [r2 + r5], xm2 + movu [r8 + r3 * 2], xm3 + movu [r8 + r6], xm2 %else psubw m3, m7 ; m3 = word: row 6 psubw m2, m7 ; m2 = word: row 7 - movu [r2 + r3 * 2], m3 - movu [r2 + r5], m2 + movu [r8 + r3 * 2], m3 + movu [r8 + r6], m2 %endif - lea r2, [r2 + r3 * 4] + lea r8, [r8 + r3 * 4] - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 vinserti128 m6, m6, xm3, 1 punpcklbw m2, m6, m3 @@ -9551,17 +7662,17 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 packuswb m5, m0 vpermq m5, m5, 11011000b vextracti128 xm0, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm0 + movu [r8], xm5 + movu [r8 + 
r3], xm0 %else psubw m5, m7 ; m5 = word: row 8 psubw m0, m7 ; m0 = word: row 9 - movu [r2], m5 - movu [r2 + r3], m0 + movu [r8], m5 + movu [r8 + r3], m0 %endif - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 @@ -9571,8 +7682,10 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m1, m6, m9 paddw m2, m1 + pmaddubsw m6, m8 pmaddubsw m1, m0, m9 paddw m4, m1 + pmaddubsw m0, m8 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 10 @@ -9580,278 +7693,49 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r5], xm2 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 %else psubw m4, m7 ; m4 = word: row 10 psubw m2, m7 ; m2 = word: row 11 - movu [r2 + r3 * 2], m4 - movu [r2 + r5], m2 -%endif - RET + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 %endif -%endmacro - - FILTER_VER_CHROMA_AVX2_16x12 pp - FILTER_VER_CHROMA_AVX2_16x12 ps + lea r8, [r8 + r3 * 4] -%macro FILTER_VER_CHROMA_AVX2_16xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 - mov r4d, r4m - shl r4d, 6 + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 + vinserti128 m5, m5, xm3, 1 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m0, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m6, m1 + pmaddubsw m2, m8 - lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,pp - mova m7, [pw_512] + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm6 %else - add r3d, r3d - mova m7, [pw_2000] -%endif - lea r6, 
[r3 * 3] - mov r7d, %2 / 16 -.loopH: - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, [r5] - pmaddubsw m3, m2, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m2, [r5] - - vextracti128 xm0, m0, 1 - lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 - - punpcklbw m5, m1, m0 - punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, [r5] - pmaddubsw m3, m5, [r5 + mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 -%else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 -%endif - - movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 - - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m1, m0, [r5 + mmsize] - paddw m5, m1 - pmaddubsw m0, [r5] - pmaddubsw m1, m6, [r5 + mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 - vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm5 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m5 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m1, m1, [r0], 1 - vinserti128 m4, m4, xm1, 1 - - punpcklbw m2, m4, m1 - punpckhbw m5, m4, m1 - vperm2i128 m3, m2, m5, 0x20 - vperm2i128 m2, m2, m5, 0x31 - pmaddubsw m5, m3, [r5 + mmsize] - paddw m6, m5 - pmaddubsw m3, [r5] - pmaddubsw m5, m2, [r5 + mmsize] - 
paddw m0, m5 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 4 - pmulhrsw m0, m7 ; m0 = word: row 5 - packuswb m6, m0 - vpermq m6, m6, 11011000b - vextracti128 xm0, m6, 1 - movu [r2], xm6 - movu [r2 + r3], xm0 -%else - psubw m6, m7 ; m6 = word: row 4 - psubw m0, m7 ; m0 = word: row 5 - movu [r2], m6 - movu [r2 + r3], m0 -%endif - - movu xm6, [r0 + r1 * 2] - vinserti128 m6, m6, [r0 + r1], 1 - vextracti128 xm0, m6, 1 - vinserti128 m1, m1, xm0, 0 - - punpcklbw m4, m1, m6 - punpckhbw m5, m1, m6 - vperm2i128 m0, m4, m5, 0x20 - vperm2i128 m5, m4, m5, 0x31 - pmaddubsw m4, m0, [r5 + mmsize] - paddw m2, m4 - pmaddubsw m0, [r5] - pmaddubsw m4, m5, [r5 + mmsize] - paddw m3, m4 - pmaddubsw m5, [r5] - -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 6 - pmulhrsw m2, m7 ; m2 = word: row 7 - packuswb m3, m2 - vpermq m3, m3, 11011000b - vextracti128 xm2, m3, 1 - movu [r2 + r3 * 2], xm3 - movu [r2 + r6], xm2 -%else - psubw m3, m7 ; m3 = word: row 6 - psubw m2, m7 ; m2 = word: row 7 - movu [r2 + r3 * 2], m3 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 - vinserti128 m6, m6, xm3, 1 - - punpcklbw m2, m6, m3 - punpckhbw m1, m6, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m5, m1 - pmaddubsw m4, [r5] - pmaddubsw m1, m2, [r5 + mmsize] - paddw m0, m1 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 8 - pmulhrsw m0, m7 ; m0 = word: row 9 - packuswb m5, m0 - vpermq m5, m5, 11011000b - vextracti128 xm0, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm0 -%else - psubw m5, m7 ; m5 = word: row 8 - psubw m0, m7 ; m0 = word: row 9 - movu [r2], m5 - movu [r2 + r3], m0 -%endif - - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m1, 
m6, [r5 + mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - pmaddubsw m1, m0, [r5 + mmsize] - paddw m4, m1 - pmaddubsw m0, [r5] - -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 10 - pmulhrsw m2, m7 ; m2 = word: row 11 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm2 -%else - psubw m4, m7 ; m4 = word: row 10 - psubw m2, m7 ; m2 = word: row 11 - movu [r2 + r3 * 2], m4 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 - vinserti128 m5, m5, xm3, 1 - - punpcklbw m2, m5, m3 - punpckhbw m1, m5, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m0, m1 - pmaddubsw m4, [r5] - pmaddubsw m1, m2, [r5 + mmsize] - paddw m6, m1 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 12 - pmulhrsw m6, m7 ; m6 = word: row 13 - packuswb m0, m6 - vpermq m0, m0, 11011000b - vextracti128 xm6, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm6 -%else - psubw m0, m7 ; m0 = word: row 12 - psubw m6, m7 ; m6 = word: row 13 - movu [r2], m0 - movu [r2 + r3], m6 + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r8], m0 + movu [r8 + r3], m6 %endif - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 @@ -9859,9 +7743,9 @@ cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m6, [r5 + mmsize] + pmaddubsw m6, m9 paddw m2, m6 - pmaddubsw m0, [r5 + mmsize] + pmaddubsw m0, m9 paddw m4, m0 %ifidn %1,pp @@ -9870,30 +7754,184 @@ cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm2 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 + add r2, 16 %else psubw m4, m7 ; m4 = 
word: row 14 psubw m2, m7 ; m2 = word: row 15 - movu [r2 + r3 * 2], m4 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - dec r7d - jnz .loopH - RET + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 + add r2, 32 %endif -%endmacro - - FILTER_VER_CHROMA_AVX2_16xN pp, 32 - FILTER_VER_CHROMA_AVX2_16xN ps, 32 - FILTER_VER_CHROMA_AVX2_16xN pp, 64 - FILTER_VER_CHROMA_AVX2_16xN ps, 64 - -%macro FILTER_VER_CHROMA_AVX2_16x24 1 + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, m8 + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, m9 + paddw m5, m0 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, m9 + paddw m2, m0 + pmaddubsw m1, m8 + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, m9 + paddw m1, m3 + pmaddubsw m4, m8 + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, m9 + paddw m4, m3 + pmaddubsw m0, m8 + +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 
1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 +%endif + lea r8, [r8 + r3 * 4] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm5, [r7] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, m9 + paddw m0, m3 + pmaddubsw m6, m8 + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r7 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r7, [r7 + r1 * 4] + movq xm1, [r7] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, m9 + paddw m5, m3 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 18 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m9 + paddw m2, m3 +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm5 + movq [r8 + r3], xm2 + movhps [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm2 + lea r2, [r8 + r3 * 4 - 16] +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 
xm3, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m5, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm5 + movu [r8 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r8 + r3 * 2], xm2 + movu [r8 + r6], xm3 + lea r2, [r8 + r3 * 4 - 32] +%endif + lea r0, [r7 - 16] + dec r5d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_24x32 pp + FILTER_VER_CHROMA_AVX2_24x32 ps + +%macro FILTER_VER_CHROMA_AVX2_24x64 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 +cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 mov r4d, r4m shl r4d, 6 @@ -9904,367 +7942,138 @@ cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - mova m12, [r5] - mova m13, [r5 + mmsize] + mova m10, [r5] + mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp - mova m14, [pw_512] + mova m12, [pw_512] %else add r3d, r3d - vbroadcasti128 m14, [pw_2000] + vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, m12 - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, m12 - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m13 - paddw m0, m4 - pmaddubsw m2, m12 + mov r6d, 16 +.loopH: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw 
m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + 16], xm2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], xm2 +%endif lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m13 - paddw m1, m5 - pmaddubsw m3, m12 - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, m13 - paddw m2, m6 - pmaddubsw m4, m12 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, m13 - paddw m3, m7 - pmaddubsw m5, m12 - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, m13 + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 paddw m4, m8 - pmaddubsw m6, m12 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, m13 paddw m5, m9 - pmaddubsw m7, m12 - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, m13 - paddw m6, m10 - pmaddubsw m8, m12 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, m13 - paddw m7, m11 - pmaddubsw m9, m12 - %ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; 
m7 = word: row 7 - packuswb m0, m1 - packuswb m2, m3 + pmulhrsw m4, m12 + pmulhrsw m5, m12 packuswb m4, m5 - packuswb m6, m7 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - vpermq m6, m6, q3120 - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r5], xm7 + movu [r2 + r3], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + r3 + 16], xm4 %else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2], m0 + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r5], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 - movu [r2 + r3 * 2], m6 - movu [r2 + r5], m7 + movu [r2 + r3 + mmsize], xm4 %endif - lea r2, [r2 + r3 * 4] - - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm6, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm6, 1 - pmaddubsw m6, m10, m13 - paddw m8, m6 - pmaddubsw m10, m12 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 12 - punpckhbw xm7, xm11, xm6 - punpcklbw xm11, xm6 - vinserti128 m11, m11, xm7, 1 - pmaddubsw m7, m11, m13 - paddw m9, m7 - pmaddubsw m11, m12 - - movu xm7, [r0 + r1] ; m7 = row 13 - punpckhbw xm0, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm0, 1 - pmaddubsw m0, m6, m13 - paddw m10, m0 - pmaddubsw m6, m12 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm7, xm0 - punpcklbw xm7, xm0 - vinserti128 m7, m7, xm1, 1 - pmaddubsw m1, m7, m13 - paddw m11, m1 - pmaddubsw m7, m12 - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw 
xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, m13 - paddw m6, m2 - pmaddubsw m0, m12 - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, m13 - paddw m7, m3 - pmaddubsw m1, m12 - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m13 - paddw m0, m4 - pmaddubsw m2, m12 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m13 - paddw m1, m5 - pmaddubsw m3, m12 + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 %ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m6, m14 ; m6 = word: row 12 - pmulhrsw m7, m14 ; m7 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 + pmulhrsw m6, m12 + pmulhrsw m7, m12 packuswb m6, m7 - packuswb m0, m1 - vpermq m8, m8, q3120 - vpermq m10, m10, q3120 - vpermq m6, m6, q3120 - vpermq m0, m0, q3120 - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm7, m6, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r5], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm6 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm0 - movu [r2 + r5], xm1 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 %else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m6, m14 ; m6 = word: row 12 - psubw m7, m14 ; m7 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw 
m1, m14 ; m1 = word: row 15 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r5], m11 - lea r2, [r2 + r3 * 4] - movu [r2], m6 - movu [r2 + r3], m7 + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 - movu [r2 + r5], m1 + movu [r2 + r3 * 2 + mmsize], xm6 %endif - lea r2, [r2 + r3 * 4] - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, m13 + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 paddw m2, m6 - pmaddubsw m4, m12 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, m13 paddw m3, m7 - pmaddubsw m5, m12 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm0, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm0, 1 - pmaddubsw m0, m6, m13 - paddw m4, m0 - pmaddubsw m6, m12 - movu xm0, [r0 + r1 * 2] ; m0 = row 22 - punpckhbw xm1, xm7, xm0 - punpcklbw xm7, xm0 - vinserti128 m7, m7, xm1, 1 - pmaddubsw m1, m7, m13 - paddw m5, m1 - pmaddubsw m7, m12 - movu xm1, [r0 + r4] ; m1 = row 23 - punpckhbw xm8, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm8, 1 - pmaddubsw m8, m0, m13 - paddw m6, m8 - pmaddubsw m0, m12 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 24 - punpckhbw xm9, xm1, xm8 - punpcklbw xm1, xm8 - vinserti128 m1, m1, xm9, 1 - pmaddubsw m9, m1, m13 - paddw m7, m9 - pmaddubsw m1, m12 - movu xm9, [r0 + r1] ; m9 = row 25 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m8, m13 - paddw m0, m8 - movu xm10, [r0 + r1 * 2] ; m10 = row 26 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m9, m13 - paddw m1, m9 - %ifidn %1,pp - pmulhrsw m2, m14 ; m2 = word: row 16 - pmulhrsw m3, m14 ; m3 = word: row 17 - pmulhrsw m4, m14 ; m4 = word: row 
18 - pmulhrsw m5, m14 ; m5 = word: row 19 - pmulhrsw m6, m14 ; m6 = word: row 20 - pmulhrsw m7, m14 ; m7 = word: row 21 - pmulhrsw m0, m14 ; m0 = word: row 22 - pmulhrsw m1, m14 ; m1 = word: row 23 + pmulhrsw m2, m12 + pmulhrsw m3, m12 packuswb m2, m3 - packuswb m4, m5 - packuswb m6, m7 - packuswb m0, m1 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - vpermq m6, m6, q3120 - vpermq m0, m0, q3120 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm2 - movu [r2 + r3], xm3 - movu [r2 + r3 * 2], xm4 - movu [r2 + r5], xm5 - lea r2, [r2 + r3 * 4] - movu [r2], xm6 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm0 - movu [r2 + r5], xm1 + movu [r2 + r5], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r5 + 16], xm2 %else - psubw m2, m14 ; m2 = word: row 16 - psubw m3, m14 ; m3 = word: row 17 - psubw m4, m14 ; m4 = word: row 18 - psubw m5, m14 ; m5 = word: row 19 - psubw m6, m14 ; m6 = word: row 20 - psubw m7, m14 ; m7 = word: row 21 - psubw m0, m14 ; m0 = word: row 22 - psubw m1, m14 ; m1 = word: row 23 - movu [r2], m2 - movu [r2 + r3], m3 - movu [r2 + r3 * 2], m4 - movu [r2 + r5], m5 - lea r2, [r2 + r3 * 4] - movu [r2], m6 - movu [r2 + r3], m7 - movu [r2 + r3 * 2], m0 - movu [r2 + r5], m1 + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], xm2 %endif + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopH RET %endif %endmacro - FILTER_VER_CHROMA_AVX2_16x24 pp - FILTER_VER_CHROMA_AVX2_16x24 ps + FILTER_VER_CHROMA_AVX2_24x64 pp + FILTER_VER_CHROMA_AVX2_24x64 ps -%macro FILTER_VER_CHROMA_AVX2_24x32 1 +%macro FILTER_VER_CHROMA_AVX2_16x4 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 +cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 mov r4d, r4m shl r4d, 6 @@ -10275,19 +8084,15 @@ cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - mova m8, [r5] - mova m9, [r5 + mmsize] lea 
r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d - vbroadcasti128 m7, [pw_2000] + mova m7, [pw_2000] %endif - lea r6, [r3 * 3] - mov r5d, 2 -.loopH: + movu xm0, [r0] vinserti128 m0, m0, [r0 + r1 * 2], 1 movu xm1, [r0 + r1] @@ -10297,40 +8102,41 @@ cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 punpckhbw m3, m0, m1 vperm2i128 m4, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, m8 - pmaddubsw m3, m2, m9 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] paddw m4, m3 - pmaddubsw m2, m8 + pmaddubsw m2, [r5] vextracti128 xm0, m0, 1 - lea r7, [r0 + r1 * 4] - vinserti128 m0, m0, [r7], 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 vperm2i128 m6, m5, m3, 0x20 vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, m8 - pmaddubsw m3, m5, m9 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] paddw m6, m3 - pmaddubsw m5, m8 + pmaddubsw m5, [r5] %ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 packuswb m4, m6 vpermq m4, m4, 11011000b vextracti128 xm6, m4, 1 movu [r2], xm4 movu [r2 + r3], xm6 %else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 movu [r2], m4 movu [r2 + r3], m6 %endif + lea r2, [r2 + r3 * 2] - movu xm4, [r7 + r1 * 2] - vinserti128 m4, m4, [r7 + r1], 1 + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 vextracti128 xm1, m4, 1 vinserti128 m0, m0, xm1, 0 @@ -10338,390 +8144,580 @@ cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 punpckhbw m1, m0, m4 vperm2i128 m0, m6, m1, 0x20 vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m1, m0, m9 - paddw m5, m1 - pmaddubsw m0, m8 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m6, m8 + pmaddubsw m0, [r5 + mmsize] + paddw m5, m0 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 %ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: 
row 3 + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 packuswb m2, m5 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm5 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m5 -%endif - lea r8, [r2 + r3 * 4] - - movu xm1, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m1, m1, [r7], 1 - vinserti128 m4, m4, xm1, 1 - - punpcklbw m2, m4, m1 - punpckhbw m5, m4, m1 - vperm2i128 m3, m2, m5, 0x20 - vperm2i128 m2, m2, m5, 0x31 - pmaddubsw m5, m3, m9 - paddw m6, m5 - pmaddubsw m3, m8 - pmaddubsw m5, m2, m9 - paddw m0, m5 - pmaddubsw m2, m8 - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 4 - pmulhrsw m0, m7 ; m0 = word: row 5 - packuswb m6, m0 - vpermq m6, m6, 11011000b - vextracti128 xm0, m6, 1 - movu [r8], xm6 - movu [r8 + r3], xm0 + movu [r2], xm2 + movu [r2 + r3], xm5 %else - psubw m6, m7 ; m6 = word: row 4 - psubw m0, m7 ; m0 = word: row 5 - movu [r8], m6 - movu [r8 + r3], m0 + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2], m2 + movu [r2 + r3], m5 %endif + RET +%endmacro - movu xm6, [r7 + r1 * 2] - vinserti128 m6, m6, [r7 + r1], 1 - vextracti128 xm0, m6, 1 - vinserti128 m1, m1, xm0, 0 + FILTER_VER_CHROMA_AVX2_16x4 pp + FILTER_VER_CHROMA_AVX2_16x4 ps - punpcklbw m4, m1, m6 - punpckhbw m5, m1, m6 - vperm2i128 m0, m4, m5, 0x20 - vperm2i128 m5, m4, m5, 0x31 - pmaddubsw m4, m0, m9 - paddw m2, m4 - pmaddubsw m0, m8 - pmaddubsw m4, m5, m9 - paddw m3, m4 - pmaddubsw m5, m8 +%macro FILTER_VER_CHROMA_AVX2_12xN 2 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 6 - pmulhrsw m2, m7 ; m2 = word: row 7 - packuswb m3, m2 - vpermq m3, m3, 11011000b - vextracti128 xm2, m3, 1 - movu [r8 + r3 * 2], xm3 - movu [r8 + r6], xm2 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else - psubw m3, m7 ; m3 = word: row 6 - psubw m2, 
m7 ; m2 = word: row 7 - movu [r8 + r3 * 2], m3 - movu [r8 + r6], m2 + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - lea r8, [r8 + r3 * 4] - - movu xm3, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m3, m3, [r7], 1 - vinserti128 m6, m6, xm3, 1 - - punpcklbw m2, m6, m3 - punpckhbw m1, m6, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, m9 - paddw m5, m1 - pmaddubsw m4, m8 - pmaddubsw m1, m2, m9 - paddw m0, m1 - pmaddubsw m2, m8 + lea r4, [r1 * 3] + sub r0, r1 %ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 8 - pmulhrsw m0, m7 ; m0 = word: row 9 - packuswb m5, m0 - vpermq m5, m5, 11011000b - vextracti128 xm0, m5, 1 - movu [r8], xm5 - movu [r8 + r3], xm0 + mova m7, [pw_512] %else - psubw m5, m7 ; m5 = word: row 8 - psubw m0, m7 ; m0 = word: row 9 - movu [r8], m5 - movu [r8 + r3], m0 + add r3d, r3d + vbroadcasti128 m7, [pw_2000] %endif - - movu xm5, [r7 + r1 * 2] - vinserti128 m5, m5, [r7 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m6, m8 - pmaddubsw m1, m0, m9 - paddw m4, m1 - pmaddubsw m0, m8 - + lea r6, [r3 * 3] +%rep %2 / 16 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] %ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 10 - 
pmulhrsw m2, m7 ; m2 = word: row 11 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm2 + pmulhrsw m0, m7 ; m0 = word: row 0 + pmulhrsw m1, m7 ; m1 = word: row 1 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm1, 2 %else - psubw m4, m7 ; m4 = word: row 10 - psubw m2, m7 ; m2 = word: row 11 - movu [r8 + r3 * 2], m4 - movu [r8 + r6], m2 + psubw m0, m7 ; m0 = word: row 0 + psubw m1, m7 ; m1 = word: row 1 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 %endif - lea r8, [r8 + r3 * 4] - - movu xm3, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m3, m3, [r7], 1 - vinserti128 m5, m5, xm3, 1 - - punpcklbw m2, m5, m3 - punpckhbw m1, m5, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, m9 - paddw m0, m1 - pmaddubsw m4, m8 - pmaddubsw m1, m2, m9 - paddw m6, m1 - pmaddubsw m2, m8 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm0, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m0, m5, [r5 + 1 * mmsize] + paddw m3, m0 + pmaddubsw m5, [r5] %ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 12 - pmulhrsw m6, m7 ; m6 = word: row 13 - packuswb m0, m6 - vpermq m0, m0, 11011000b - vextracti128 xm6, m0, 1 - movu [r8], xm0 - movu [r8 + r3], xm6 + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m3, m7 ; m3 = word: row 3 + packuswb m2, m3 + vextracti128 xm3, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm3 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm3, 2 %else - psubw m0, m7 ; m0 = word: row 12 - psubw m6, m7 ; m6 = word: row 13 - movu [r8], m0 - movu [r8 + r3], m6 + psubw m2, m7 ; m2 = word: 
row 2 + psubw m3, m7 ; m3 = word: row 3 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 %endif + lea r2, [r2 + r3 * 4] - movu xm5, [r7 + r1 * 2] - vinserti128 m5, m5, [r7 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m6, m9 - paddw m2, m6 - pmaddubsw m0, m9 - paddw m4, m0 - + movu xm0, [r0 + r4] ; m0 = row 7 + punpckhbw xm3, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm1, xm0, xm3 + punpcklbw xm0, xm3 + vinserti128 m0, m0, xm1, 1 + pmaddubsw m1, m0, [r5 + 1 * mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] %ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 14 - pmulhrsw m2, m7 ; m2 = word: row 15 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm2 - add r2, 16 + pmulhrsw m4, m7 ; m4 = word: row 4 + pmulhrsw m5, m7 ; m5 = word: row 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [r2], xm4 + movd [r2 + 8], xm5 + movhps [r2 + r3], xm4 + pextrd [r2 + r3 + 8], xm5, 2 %else - psubw m4, m7 ; m4 = word: row 14 - psubw m2, m7 ; m2 = word: row 15 - movu [r8 + r3 * 2], m4 - movu [r8 + r6], m2 - add r2, 32 + psubw m4, m7 ; m4 = word: row 4 + psubw m5, m7 ; m5 = word: row 5 + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 %endif - add r0, 16 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 + + movu xm1, [r0 + r1] ; m1 = row 9 + punpckhbw xm2, xm3, xm1 + punpcklbw xm3, xm1 + vinserti128 m3, m3, xm2, 1 + pmaddubsw m2, m3, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m3, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 10 
+ punpckhbw xm4, xm1, xm2 punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, m8 - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r7, [r0 + r1 * 4] - movq xm1, [r7] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, m9 - paddw m5, m0 - pmaddubsw m2, m8 - movq xm3, [r7 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, m9 - paddw m2, m0 - pmaddubsw m1, m8 - movq xm3, [r7 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r7, [r7 + r1 * 4] - movq xm0, [r7] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, m9 - paddw m1, m3 - pmaddubsw m4, m8 - movq xm3, [r7 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r7 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, m9 - paddw m4, m3 - pmaddubsw m0, m8 + vinserti128 m1, m1, xm4, 1 + pmaddubsw m4, m1, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m1, [r5] %ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r8, [r2 + r3 * 4] - movq [r8], xm1 - movq [r8 + r3], xm4 - movhps [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm4 -%else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 + pmulhrsw m6, m7 ; m6 = word: row 6 + pmulhrsw m0, m7 ; m0 = word: row 7 + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movq [r2 + r3 * 2], xm6 + movd [r2 + r3 
* 2 + 8], xm0 + movhps [r2 + r6], xm6 + pextrd [r2 + r6 + 8], xm0, 2 +%else + psubw m6, m7 ; m6 = word: row 6 + psubw m0, m7 ; m0 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r6 + 16], xm0 +%endif + lea r2, [r2 + r3 * 4] + + movu xm4, [r0 + r4] ; m4 = row 11 + punpckhbw xm6, xm2, xm4 + punpcklbw xm2, xm4 + vinserti128 m2, m2, xm6, 1 + pmaddubsw m6, m2, [r5 + 1 * mmsize] + paddw m3, m6 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm0, xm4, xm6 + punpcklbw xm4, xm6 + vinserti128 m4, m4, xm0, 1 + pmaddubsw m0, m4, [r5 + 1 * mmsize] + paddw m1, m0 + pmaddubsw m4, [r5] +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 8 + pmulhrsw m1, m7 ; m1 = word: row 9 + packuswb m3, m1 + vextracti128 xm1, m3, 1 + movq [r2], xm3 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm3 + pextrd [r2 + r3 + 8], xm1, 2 +%else + psubw m3, m7 ; m3 = word: row 8 + psubw m1, m7 ; m1 = word: row 9 + movu [r2], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + 16], xm3 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 +%endif + + movu xm0, [r0 + r1] ; m0 = row 13 + punpckhbw xm1, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm1, 1 + pmaddubsw m1, m6, [r5 + 1 * mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + movu xm1, [r0 + r1 * 2] ; m1 = row 14 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + 1 * mmsize] + paddw m4, m5 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 10 + pmulhrsw m4, m7 ; m4 = word: row 11 + packuswb m2, m4 + vextracti128 xm4, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm4 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm4, 2 +%else + psubw m2, m7 ; m2 = word: row 10 + psubw m4, m7 ; m4 = word: row 11 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m1, 1 - lea r8, [r2 + r3 * 4] - movu [r8], xm1 - movu [r8 
+ r3], xm3 - vextracti128 xm3, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm3 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + r6 + 16], xm4 %endif - lea r8, [r8 + r3 * 4] + lea r2, [r2 + r3 * 4] - movq xm3, [r7 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r7, [r7 + r1 * 4] - movq xm5, [r7] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, m9 - paddw m0, m3 - pmaddubsw m6, m8 - movq xm3, [r7 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r7 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, xm2 + movu xm5, [r0 + r4] ; m5 = row 15 + punpckhbw xm2, xm1, xm5 + punpcklbw xm1, xm5 + vinserti128 m1, m1, xm2, 1 + pmaddubsw m2, m1, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m1, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm5, xm2 + punpcklbw xm5, xm2 vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, m9 - paddw m6, m3 - pmaddubsw m5, m8 - movq xm3, [r7 + r4] ; m3 = row 15 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m5, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 - lea r7, [r7 + r1 * 4] - movq xm1, [r7] ; m1 = row 16 - punpcklbw xm3, xm1 - vinserti128 m2, m2, xm3, 1 - pmaddubsw m3, m2, m9 - paddw m5, m3 - pmaddubsw m2, m8 - movq xm3, [r7 + r1] ; m3 = row 17 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 18 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m1, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm2, xm3, xm4 punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, m9 - paddw m2, m3 + vinserti128 m3, m3, xm2, 1 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m5, m3 + %ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - pmulhrsw m5, m7 ; m5 = word: row 12, row 13 - pmulhrsw m2, m7 ; m2 = word: row 14, row 15 - packuswb m0, m6 - packuswb m5, m2 - vextracti128 xm6, m0, 1 - 
vextracti128 xm2, m5, 1 - movq [r8], xm0 - movq [r8 + r3], xm6 - movhps [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm6 - lea r8, [r8 + r3 * 4] - movq [r8], xm5 - movq [r8 + r3], xm2 - movhps [r8 + r3 * 2], xm5 - movhps [r8 + r6], xm2 - lea r2, [r8 + r3 * 4 - 16] + pmulhrsw m6, m7 ; m6 = word: row 12 + pmulhrsw m0, m7 ; m0 = word: row 13 + pmulhrsw m1, m7 ; m1 = word: row 14 + pmulhrsw m5, m7 ; m5 = word: row 15 + packuswb m6, m0 + packuswb m1, m5 + vextracti128 xm0, m6, 1 + vextracti128 xm5, m1, 1 + movq [r2], xm6 + movd [r2 + 8], xm0 + movhps [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm1 + movd [r2 + r3 * 2 + 8], xm5 + movhps [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm5, 2 %else - psubw m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - psubw m5, m7 ; m5 = word: row 12, row 13 - psubw m2, m7 ; m2 = word: row 14, row 15 - vextracti128 xm3, m0, 1 - movu [r8], xm0 - movu [r8 + r3], xm3 - vextracti128 xm3, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm3 - vextracti128 xm3, m5, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm5 - movu [r8 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r8 + r3 * 2], xm2 - movu [r8 + r6], xm3 - lea r2, [r8 + r3 * 4 - 32] + psubw m6, m7 ; m6 = word: row 12 + psubw m0, m7 ; m0 = word: row 13 + psubw m1, m7 ; m1 = word: row 14 + psubw m5, m7 ; m5 = word: row 15 + movu [r2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + 16], xm6 + movu [r2 + r3], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 + 16], xm0 + movu [r2 + r3 * 2], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 * 2 + 16], xm1 + movu [r2 + r6], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r6 + 16], xm5 %endif - lea r0, [r7 - 16] - dec r5d - jnz .loopH + lea r2, [r2 + r3 * 4] +%endrep RET -%endif %endmacro - FILTER_VER_CHROMA_AVX2_24x32 pp - FILTER_VER_CHROMA_AVX2_24x32 ps + FILTER_VER_CHROMA_AVX2_12xN pp, 16 + FILTER_VER_CHROMA_AVX2_12xN ps, 16 + FILTER_VER_CHROMA_AVX2_12xN pp, 32 + FILTER_VER_CHROMA_AVX2_12xN ps, 32 -%macro 
FILTER_VER_CHROMA_AVX2_24x64 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 - mov r4d, r4m - shl r4d, 6 +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W24 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_512] + + pmulhrsw m4, m6 + pmulhrsw m2, m6 + + packuswb m4, m2 + + movu [r2], m4 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + pmulhrsw m4, m6 + pmulhrsw m3, m6 + + packuswb m4, m3 + + movu [r2 + r3], m4 + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + + punpcklbw m2, m3 + punpcklbw m4, m5 + + pmaddubsw m2, m1 + pmaddubsw m4, m0 + + paddw m2, m4 + + pmulhrsw m2, m6 + + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + movq m7, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m7 + + pmaddubsw m3, m1 + pmaddubsw m5, m0 + + paddw m3, m5 + + pmulhrsw m3, m6 + packuswb m2, m3 + + movh [r2 + 16], m2 + movhps [r2 + r3 + 16], m2 + + mov r0, r5 + 
lea r2, [r2 + 2 * r3] + + sub r4, 2 + jnz .loop + RET +%endmacro + + FILTER_V4_W24 24, 32 + + FILTER_V4_W24 24, 64 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mova m7, [pw_512] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2], m4 + + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2 + 16], m4 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4 + jnz .loop + RET +%endmacro + + FILTER_V4_W32 32, 8 + FILTER_V4_W32 32, 16 + FILTER_V4_W32 32, 24 + FILTER_V4_W32 32, 32 + + FILTER_V4_W32 32, 48 + FILTER_V4_W32 32, 64 + +%macro FILTER_VER_CHROMA_AVX2_32xN 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif @@ -10737,8 +8733,8 @@ cglobal 
interp_4tap_vert_%1_24x64, 4, 7, 13 vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] - mov r6d, 16 -.loopH: + mov r6d, %2 / 4 +.loopW: movu m0, [r0] ; m0 = row 0 movu m1, [r0 + r1] ; m1 = row 1 punpcklbw m2, m0, m1 @@ -10763,16 +8759,14 @@ cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 - movu [r2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + 16], xm2 + movu [r2], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2], m0 - movu [r2 + mmsize], xm2 + movu [r2 + mmsize], m2 %endif lea r0, [r0 + r1 * 4] movu m0, [r0] ; m0 = row 4 @@ -10788,16 +8782,14 @@ cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 - movu [r2 + r3], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + r3 + 16], xm4 + movu [r2 + r3], m4 %else psubw m4, m12 psubw m5, m12 vperm2i128 m1, m4, m5, 0x20 vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], xm4 + movu [r2 + r3 + mmsize], m4 %endif movu m1, [r0 + r1] ; m1 = row 5 @@ -10811,16 +8803,14 @@ cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 - movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r3 * 2], m6 %else psubw m6, m12 psubw m7, m12 vperm2i128 m0, m6, m7, 0x20 vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], xm6 + movu [r2 + r3 * 2 + mmsize], m6 %endif movu m0, [r0 + r1 * 2] ; m0 = row 6 @@ -10834,32 +8824,41 @@ cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 - movu [r2 + r5], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r5 + 16], xm2 + movu [r2 + r5], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], xm2 + movu [r2 + r5 + mmsize], m2 %endif lea r2, [r2 + r3 * 4] dec r6d - jnz .loopH + jnz .loopW RET %endif %endmacro - 
FILTER_VER_CHROMA_AVX2_24x64 pp - FILTER_VER_CHROMA_AVX2_24x64 ps - -%macro FILTER_VER_CHROMA_AVX2_16x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 + FILTER_VER_CHROMA_AVX2_32xN pp, 64 + FILTER_VER_CHROMA_AVX2_32xN pp, 48 + FILTER_VER_CHROMA_AVX2_32xN pp, 32 + FILTER_VER_CHROMA_AVX2_32xN pp, 24 + FILTER_VER_CHROMA_AVX2_32xN pp, 16 + FILTER_VER_CHROMA_AVX2_32xN pp, 8 + FILTER_VER_CHROMA_AVX2_32xN ps, 64 + FILTER_VER_CHROMA_AVX2_32xN ps, 48 + FILTER_VER_CHROMA_AVX2_32xN ps, 32 + FILTER_VER_CHROMA_AVX2_32xN ps, 24 + FILTER_VER_CHROMA_AVX2_32xN ps, 16 + FILTER_VER_CHROMA_AVX2_32xN ps, 8 + +%macro FILTER_VER_CHROMA_AVX2_48x64 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_48x64, 4, 8, 13 + mov r4d, r4m + shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] @@ -10868,94 +8867,200 @@ cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif + mova m10, [r5] + mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp - mova m7, [pw_512] + mova m12, [pw_512] %else add r3d, r3d - mova m7, [pw_2000] + vbroadcasti128 m12, [pw_2000] %endif - - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - + lea r5, [r3 * 3] + lea r7, [r1 * 4] + mov r6d, 16 +.loopH: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 
0x31 - pmaddubsw m4, [r5] - pmaddubsw m3, m2, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m2, [r5] - - vextracti128 xm0, m0, 1 + movu [r2], m0 + movu [r2 + mmsize], m2 +%endif lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 - - punpcklbw m5, m1, m0 + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, [r5] - pmaddubsw m3, m5, [r5 + mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 %ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 %else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 %endif - lea r2, [r2 + r3 * 2] - movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 +%endif - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m0, [r5 + mmsize] - paddw m5, m0 - pmaddubsw m6, [r5 + mmsize] + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 paddw m2, m6 + 
paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 + add r2, 32 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 + add r2, 64 +%endif + sub r0, r7 + movu xm0, [r0 + 32] ; m0 = row 0 + movu xm1, [r0 + r1 + 32] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m10 + movu xm2, [r0 + r1 * 2 + 32] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m10 + movu xm3, [r0 + r4 + 32] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m11 + paddw m0, m4 + pmaddubsw m2, m10 + lea r0, [r0 + r1 * 4] + movu xm4, [r0 + 32] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m11 + paddw m1, m5 + pmaddubsw m3, m10 + movu xm5, [r0 + r1 + 32] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m4, m11 + paddw m2, m4 + movu xm6, [r0 + r1 * 2 + 32] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m5, m11 + paddw m3, m5 %ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 - movu [r2], xm2 - movu [r2 + r3], xm5 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm3 + lea r2, [r2 + r3 * 4 - 32] %else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2], m2 - movu [r2 + r3], m5 + psubw m0, m12 ; m0 = 
word: row 0 + psubw m1, m12 ; m1 = word: row 1 + psubw m2, m12 ; m2 = word: row 2 + psubw m3, m12 ; m3 = word: row 3 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4 - 64] %endif + dec r6d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_AVX2_16x4 pp - FILTER_VER_CHROMA_AVX2_16x4 ps + FILTER_VER_CHROMA_AVX2_48x64 pp + FILTER_VER_CHROMA_AVX2_48x64 ps -%macro FILTER_VER_CHROMA_AVX2_12xN 2 +%macro FILTER_VER_CHROMA_AVX2_64xN 2 +%if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 +cglobal interp_4tap_vert_%1_64x%2, 4, 8, 13 mov r4d, r4m shl r4d, 6 @@ -10966,317 +9071,152 @@ cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 lea r5, [tab_ChromaCoeffVer_32 + r4] %endif + mova m10, [r5] + mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp - mova m7, [pw_512] + mova m12, [pw_512] %else add r3d, r3d - vbroadcasti128 m7, [pw_2000] + vbroadcasti128 m12, [pw_2000] %endif - lea r6, [r3 * 3] -%rep %2 / 16 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 0 - pmulhrsw m1, m7 ; m1 = word: row 1 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [r2], xm0 - movd [r2 + 8], xm1 - movhps [r2 + r3], xm0 - pextrd [r2 + r3 + 8], xm1, 2 -%else - psubw m0, m7 ; m0 = word: row 0 - psubw m1, m7 ; m1 = word: row 1 - movu 
[r2], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + 16], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 -%endif - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm0, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm0, 1 - pmaddubsw m0, m5, [r5 + 1 * mmsize] - paddw m3, m0 - pmaddubsw m5, [r5] + lea r5, [r3 * 3] + lea r7, [r1 * 4] + mov r6d, %2 / 4 +.loopH: +%assign x 0 +%rep 2 + movu m0, [r0 + x] ; m0 = row 0 + movu m1, [r0 + r1 + x] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2 + x] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4 + x] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 %ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m3, m7 ; m3 = word: row 3 + pmulhrsw m2, m12 + pmulhrsw m3, m12 packuswb m2, m3 - vextracti128 xm3, m2, 1 - movq [r2 + r3 * 2], xm2 - movd [r2 + r3 * 2 + 8], xm3 - movhps [r2 + r6], xm2 - pextrd [r2 + r6 + 8], xm3, 2 + movu [r2], m2 %else - psubw m2, m7 ; m2 = word: row 2 - psubw m3, m7 ; m3 = word: row 3 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + r6 + 16], xm3 + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], m2 %endif - lea r2, [r2 + r3 * 4] - - movu xm0, [r0 + r4] ; m0 = row 7 - punpckhbw xm3, xm6, xm0 - punpcklbw xm6, xm0 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] - movu 
xm3, [r0] ; m3 = row 8 - punpckhbw xm1, xm0, xm3 - punpcklbw xm0, xm3 - vinserti128 m0, m0, xm1, 1 - pmaddubsw m1, m0, [r5 + 1 * mmsize] - paddw m5, m1 - pmaddubsw m0, [r5] + movu m0, [r0 + x] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 %ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 4 - pmulhrsw m5, m7 ; m5 = word: row 5 + pmulhrsw m4, m12 + pmulhrsw m5, m12 packuswb m4, m5 - vextracti128 xm5, m4, 1 - movq [r2], xm4 - movd [r2 + 8], xm5 - movhps [r2 + r3], xm4 - pextrd [r2 + r3 + 8], xm5, 2 + movu [r2 + r3], m4 %else - psubw m4, m7 ; m4 = word: row 4 - psubw m5, m7 ; m5 = word: row 5 - movu [r2], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + 16], xm4 - movu [r2 + r3], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r3 + 16], xm5 + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 %endif - movu xm1, [r0 + r1] ; m1 = row 9 - punpckhbw xm2, xm3, xm1 - punpcklbw xm3, xm1 - vinserti128 m3, m3, xm2, 1 - pmaddubsw m2, m3, [r5 + 1 * mmsize] - paddw m6, m2 - pmaddubsw m3, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 10 - punpckhbw xm4, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm4, 1 - pmaddubsw m4, m1, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m1, [r5] - + movu m1, [r0 + r1 + x] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 %ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 6 - pmulhrsw m0, m7 ; m0 = word: row 7 - packuswb m6, m0 - vextracti128 xm0, m6, 1 - movq [r2 + r3 * 2], xm6 - movd [r2 + r3 * 2 + 8], xm0 - movhps [r2 + r6], xm6 - pextrd [r2 + r6 + 8], xm0, 2 + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 %else - psubw m6, m7 ; m6 = word: row 6 - psubw m0, m7 ; m0 = word: row 7 - movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - 
movq [r2 + r3 * 2 + 16], xm6 - movu [r2 + r6], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r6 + 16], xm0 + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 %endif - lea r2, [r2 + r3 * 4] - movu xm4, [r0 + r4] ; m4 = row 11 - punpckhbw xm6, xm2, xm4 - punpcklbw xm2, xm4 - vinserti128 m2, m2, xm6, 1 - pmaddubsw m6, m2, [r5 + 1 * mmsize] - paddw m3, m6 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 12 - punpckhbw xm0, xm4, xm6 - punpcklbw xm4, xm6 - vinserti128 m4, m4, xm0, 1 - pmaddubsw m0, m4, [r5 + 1 * mmsize] - paddw m1, m0 - pmaddubsw m4, [r5] + movu m0, [r0 + r1 * 2 + x] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 %ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 8 - pmulhrsw m1, m7 ; m1 = word: row 9 - packuswb m3, m1 - vextracti128 xm1, m3, 1 - movq [r2], xm3 - movd [r2 + 8], xm1 - movhps [r2 + r3], xm3 - pextrd [r2 + r3 + 8], xm1, 2 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 + add r2, 32 %else - psubw m3, m7 ; m3 = word: row 8 - psubw m1, m7 ; m1 = word: row 9 - movu [r2], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + 16], xm3 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 -%endif - - movu xm0, [r0 + r1] ; m0 = row 13 - punpckhbw xm1, xm6, xm0 - punpcklbw xm6, xm0 - vinserti128 m6, m6, xm1, 1 - pmaddubsw m1, m6, [r5 + 1 * mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - movu xm1, [r0 + r1 * 2] ; m1 = row 14 - punpckhbw xm5, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm5, 1 - pmaddubsw m5, m0, [r5 + 1 * mmsize] - paddw m4, m5 - pmaddubsw m0, [r5] -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 10 - pmulhrsw m4, m7 ; m4 = word: row 11 - packuswb m2, m4 - vextracti128 xm4, m2, 1 - movq [r2 + r3 * 2], xm2 - movd [r2 + r3 * 2 + 8], xm4 - movhps [r2 + r6], xm2 - pextrd [r2 + r6 + 8], xm4, 2 
-%else - psubw m2, m7 ; m2 = word: row 10 - psubw m4, m7 ; m4 = word: row 11 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 - movu [r2 + r6], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + r6 + 16], xm4 + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 + add r2, 64 %endif - lea r2, [r2 + r3 * 4] - - movu xm5, [r0 + r4] ; m5 = row 15 - punpckhbw xm2, xm1, xm5 - punpcklbw xm1, xm5 - vinserti128 m1, m1, xm2, 1 - pmaddubsw m2, m1, [r5 + 1 * mmsize] - paddw m6, m2 - pmaddubsw m1, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm5, xm2 - punpcklbw xm5, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m5, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m1, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm2, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m5, m3 - + sub r0, r7 +%assign x x+32 +%endrep %ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 12 - pmulhrsw m0, m7 ; m0 = word: row 13 - pmulhrsw m1, m7 ; m1 = word: row 14 - pmulhrsw m5, m7 ; m5 = word: row 15 - packuswb m6, m0 - packuswb m1, m5 - vextracti128 xm0, m6, 1 - vextracti128 xm5, m1, 1 - movq [r2], xm6 - movd [r2 + 8], xm0 - movhps [r2 + r3], xm6 - pextrd [r2 + r3 + 8], xm0, 2 - movq [r2 + r3 * 2], xm1 - movd [r2 + r3 * 2 + 8], xm5 - movhps [r2 + r6], xm1 - pextrd [r2 + r6 + 8], xm5, 2 + lea r2, [r2 + r3 * 4 - 64] %else - psubw m6, m7 ; m6 = word: row 12 - psubw m0, m7 ; m0 = word: row 13 - psubw m1, m7 ; m1 = word: row 14 - psubw m5, m7 ; m5 = word: row 15 - movu [r2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + 16], xm6 - movu [r2 + r3], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r3 + 16], xm0 - movu [r2 + r3 * 2], xm1 - vextracti128 xm1, 
m1, 1 - movq [r2 + r3 * 2 + 16], xm1 - movu [r2 + r6], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r6 + 16], xm5 + lea r2, [r2 + r3 * 4 - 128] %endif - lea r2, [r2 + r3 * 4] -%endrep + add r0, r7 + dec r6d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_AVX2_12xN pp, 16 - FILTER_VER_CHROMA_AVX2_12xN ps, 16 - FILTER_VER_CHROMA_AVX2_12xN pp, 32 - FILTER_VER_CHROMA_AVX2_12xN ps, 32 + FILTER_VER_CHROMA_AVX2_64xN pp, 64 + FILTER_VER_CHROMA_AVX2_64xN pp, 48 + FILTER_VER_CHROMA_AVX2_64xN pp, 32 + FILTER_VER_CHROMA_AVX2_64xN pp, 16 + FILTER_VER_CHROMA_AVX2_64xN ps, 64 + FILTER_VER_CHROMA_AVX2_64xN ps, 48 + FILTER_VER_CHROMA_AVX2_64xN ps, 32 + FILTER_VER_CHROMA_AVX2_64xN ps, 16 ;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W24 2 +%macro FILTER_V4_W16n_H2 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 mov r4d, r4m sub r0, r1 @@ -11291,9 +9231,14 @@ cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] - mov r4d, %2 + mov r4d, %2/2 .loop: + + mov r6d, %1/16 + +.loopW: + movu m2, [r0] movu m3, [r0 + r1] @@ -11305,20 +9250,20 @@ cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 lea r5, [r0 + 2 * r1] movu m5, [r5] - movu m7, [r5 + r1] + movu m6, [r5 + r1] - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 + punpckhbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m2, m7 - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 + punpcklbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m4, m7 - mova m6, [pw_512] + mova m7, [pw_512] - pmulhrsw m4, m6 - pmulhrsw m2, m6 + pmulhrsw m4, m7 + pmulhrsw m2, m7 packuswb m4, m2 @@ -11330,857 +9275,817 @@ cglobal 
interp_4tap_vert_pp_24x%2, 4, 6, 8 pmaddubsw m4, m1 pmaddubsw m3, m1 - movu m2, [r5 + 2 * r1] + movu m5, [r5 + 2 * r1] - punpcklbw m5, m7, m2 - punpckhbw m7, m2 + punpcklbw m2, m6, m5 + punpckhbw m6, m5 - pmaddubsw m5, m0 - pmaddubsw m7, m0 + pmaddubsw m2, m0 + pmaddubsw m6, m0 - paddw m4, m5 - paddw m3, m7 + paddw m4, m2 + paddw m3, m6 - pmulhrsw m4, m6 - pmulhrsw m3, m6 + pmulhrsw m4, m7 + pmulhrsw m3, m7 packuswb m4, m3 movu [r2 + r3], m4 - movq m2, [r0 + 16] - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - - punpcklbw m2, m3 - punpcklbw m4, m5 - - pmaddubsw m2, m1 - pmaddubsw m4, m0 - - paddw m2, m4 - - pmulhrsw m2, m6 + add r0, 16 + add r2, 16 + dec r6d + jnz .loopW - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - movq m7, [r5 + 2 * r1 + 16] + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1] - punpcklbw m3, m4 - punpcklbw m5, m7 + dec r4d + jnz .loop + RET +%endmacro - pmaddubsw m3, m1 - pmaddubsw m5, m0 + FILTER_V4_W16n_H2 64, 64 + FILTER_V4_W16n_H2 64, 32 + FILTER_V4_W16n_H2 64, 48 + FILTER_V4_W16n_H2 48, 64 + FILTER_V4_W16n_H2 64, 16 - paddw m3, m5 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_2xN 1 +INIT_XMM sse4 +cglobal filterPixelToShort_2x%1, 3, 4, 3 + mov r3d, r3m + add r3d, r3d - pmulhrsw m3, m6 - packuswb m2, m3 + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] - movh [r2 + 16], m2 - movhps [r2 + r3 + 16], m2 +%rep %1/2 + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m1 + pmaddubsw m0, m2 - mov r0, r5 - lea r2, [r2 + 2 * r3] + movd [r2 + r3 * 0], m0 + pextrd [r2 + r3 * 1], m0, 2 - sub r4, 2 - jnz .loop + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] +%endrep RET %endmacro - - FILTER_V4_W24 24, 32 - - FILTER_V4_W24 24, 64 + P2S_H_2xN 4 + P2S_H_2xN 8 + 
P2S_H_2xN 16 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W32 2 +%macro P2S_H_4xN 1 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 +cglobal filterPixelToShort_4x%1, 3, 6, 4 + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] - mov r4d, r4m - sub r0, r1 + ; load constant + mova m2, [pb_128] + mova m3, [tab_c_64_n64] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] +%assign x 0 +%rep %1/4 + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m2 + pmaddubsw m0, m3 + + movd m1, [r0 + r1 * 2] + pinsrd m1, [r0 + r5], 1 + punpcklbw m1, m2 + pmaddubsw m1, m3 + + movq [r2 + r3 * 0], m0 + movq [r2 + r3 * 2], m1 + movhps [r2 + r3 * 1], m0 + movhps [r2 + r4], m1 +%assign x x+1 +%if (x != %1/4) + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] %endif +%endrep + RET +%endmacro + P2S_H_4xN 4 + P2S_H_4xN 8 + P2S_H_4xN 16 + P2S_H_4xN 32 - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_6xN 1 +INIT_XMM sse4 +cglobal filterPixelToShort_6x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] - mova m7, [pw_512] + ; load height + mov r6d, %1/4 - mov r4d, %2 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] .loop: - movu m2, [r0] - movu m3, [r0 + r1] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - punpcklbw m4, m2, m3 - punpckhbw m2, m3 + movh m1, [r0 + r1] + 
punpcklbw m1, m4 + pmaddubsw m1, m5 - pmaddubsw m4, m1 - pmaddubsw m2, m1 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - lea r5, [r0 + 2 * r1] - movu m3, [r5] - movu m5, [r5 + r1] + movh m3, [r0 + r4] + punpcklbw m3, m4 + pmaddubsw m3, m5 - punpcklbw m6, m3, m5 - punpckhbw m3, m5 + movh [r2 + r3 * 0], m0 + pextrd [r2 + r3 * 0 + 8], m0, 2 + movh [r2 + r3 * 1], m1 + pextrd [r2 + r3 * 1 + 8], m1, 2 + movh [r2 + r3 * 2], m2 + pextrd [r2 + r3 * 2 + 8], m2, 2 + movh [r2 + r5], m3 + pextrd [r2 + r5 + 8], m3, 2 - pmaddubsw m6, m0 - pmaddubsw m3, m0 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - paddw m4, m6 - paddw m2, m3 + dec r6d + jnz .loop + RET +%endmacro + P2S_H_6xN 8 + P2S_H_6xN 16 - pmulhrsw m4, m7 - pmulhrsw m2, m7 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_8xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_8x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] - packuswb m4, m2 + ; load height + mov r4d, %1/4 - movu [r2], m4 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - movu m2, [r0 + 16] - movu m3, [r0 + r1 + 16] +.loop: + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - punpcklbw m4, m2, m3 - punpckhbw m2, m3 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmaddubsw m4, m1 - pmaddubsw m2, m1 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movu m3, [r5 + 16] - movu m5, [r5 + r1 + 16] + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - punpcklbw m6, m3, m5 - punpckhbw m3, m5 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6 ], m3 - pmaddubsw m6, m0 - pmaddubsw m3, m0 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - paddw m4, m6 - paddw m2, m3 + dec r4d + jnz .loop + RET +%endmacro + P2S_H_8xN 8 
+ P2S_H_8xN 4 + P2S_H_8xN 16 + P2S_H_8xN 32 + P2S_H_8xN 12 + P2S_H_8xN 64 - pmulhrsw m4, m7 - pmulhrsw m2, m7 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal filterPixelToShort_8x6, 3, 7, 5 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r1 * 5] + lea r6, [r3 * 3] - packuswb m4, m2 + ; load constant + mova m3, [pb_128] + mova m4, [tab_c_64_n64] - movu [r2 + 16], m4 + movh m0, [r0] + punpcklbw m0, m3 + pmaddubsw m0, m4 - lea r0, [r0 + r1] - lea r2, [r2 + r3] + movh m1, [r0 + r1] + punpcklbw m1, m3 + pmaddubsw m1, m4 + + movh m2, [r0 + r1 * 2] + punpcklbw m2, m3 + pmaddubsw m2, m4 + + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + + movh m0, [r0 + r4] + punpcklbw m0, m3 + pmaddubsw m0, m4 + + movh m1, [r0 + r1 * 4] + punpcklbw m1, m3 + pmaddubsw m1, m4 + + movh m2, [r0 + r5] + punpcklbw m2, m3 + pmaddubsw m2, m4 + + movu [r2 + r6 ], m0 + movu [r2 + r3 * 4], m1 + lea r2, [r2 + r3 * 4] + movu [r2 + r3], m2 - dec r4 - jnz .loop RET -%endmacro - FILTER_V4_W32 32, 8 - FILTER_V4_W32 32, 16 - FILTER_V4_W32 32, 24 - FILTER_V4_W32 32, 32 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_16xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_16x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] - FILTER_V4_W32 32, 48 - FILTER_V4_W32 32, 64 + ; load height + mov r6d, %1/4 -%macro FILTER_VER_CHROMA_AVX2_32xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 - mov r4d, r4m - shl r4d, 6 + ; load constant + mova m4, [pb_128] + mova m5, 
[tab_c_64_n64] -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif +.loop: + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 + + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 + + lea r0, [r0 + 8] + + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 + + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 + + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 1 + 16], m1 + movu [r2 + r3 * 2 + 16], m2 + movu [r2 + r4 + 16], m3 + + lea r0, [r0 + r1 * 4 - 8] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop + RET +%endmacro + P2S_H_16xN 16 + P2S_H_16xN 4 + P2S_H_16xN 8 + P2S_H_16xN 12 + P2S_H_16xN 32 + P2S_H_16xN 64 + P2S_H_16xN 24 + +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x4, 3, 4, 2 + mov r3d, r3m add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif + + ; load constant + vbroadcasti128 m1, [pw_2000] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 + + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 + + lea r1, [r1 * 3] + lea r3, [r3 * 3] + + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + RET + 
+;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x8, 3, 6, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] lea r5, [r3 * 3] - mov r6d, %2 / 4 -.loopW: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 + + ; load constant + vbroadcasti128 m1, [pw_2000] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 movu [r2], m0 - movu [r2 + mmsize], m2 -%endif - lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 
m6, m6, m7, 0x31 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 -%endif - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], m2 -%endif + + lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] - dec r6d - jnz .loopW - RET -%endif -%endmacro - FILTER_VER_CHROMA_AVX2_32xN pp, 64 - FILTER_VER_CHROMA_AVX2_32xN pp, 48 - FILTER_VER_CHROMA_AVX2_32xN pp, 32 - FILTER_VER_CHROMA_AVX2_32xN pp, 24 - FILTER_VER_CHROMA_AVX2_32xN pp, 16 - FILTER_VER_CHROMA_AVX2_32xN pp, 8 - FILTER_VER_CHROMA_AVX2_32xN ps, 64 - FILTER_VER_CHROMA_AVX2_32xN ps, 48 - FILTER_VER_CHROMA_AVX2_32xN ps, 32 - FILTER_VER_CHROMA_AVX2_32xN ps, 24 - FILTER_VER_CHROMA_AVX2_32xN ps, 16 - FILTER_VER_CHROMA_AVX2_32xN ps, 8 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%macro FILTER_VER_CHROMA_AVX2_48x64 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_48x64, 4, 8, 13 - mov r4d, r4m - shl r4d, 6 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + RET + +;----------------------------------------------------------------------------- +; void 
filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x12, 3, 6, 2 + mov r3d, r3m add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif + lea r4, [r1 * 3] lea r5, [r3 * 3] - lea r7, [r1 * 4] - mov r6d, 16 -.loopH: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 + + ; load constant + vbroadcasti128 m1, [pw_2000] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 movu [r2], m0 - movu [r2 + mmsize], m2 -%endif - lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 m6, m6, m7, 0x31 + pmovzxbw m0, 
[r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 -%endif - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 - add r2, 32 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], m2 - add r2, 64 -%endif - sub r0, r7 - movu xm0, [r0 + 32] ; m0 = row 0 - movu xm1, [r0 + r1 + 32] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, m10 - movu xm2, [r0 + r1 * 2 + 32] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, m10 - movu xm3, [r0 + r4 + 32] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m11 - paddw m0, m4 - pmaddubsw m2, m10 lea r0, [r0 + r1 * 4] - movu xm4, [r0 + 32] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m11 - paddw m1, m5 - pmaddubsw m3, m10 - movu xm5, [r0 + r1 + 32] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m4, m11 - paddw m2, m4 - movu xm6, [r0 + r1 * 2 + 32] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m5, m11 - paddw m3, m5 -%ifidn %1,pp - pmulhrsw m0, m12 ; m0 = word: row 0 - pmulhrsw m1, m12 ; m1 = word: row 1 - pmulhrsw m2, m12 ; m2 = word: row 2 - pmulhrsw m3, m12 ; m3 = word: row 3 - packuswb m0, m1 - packuswb m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 
+ r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm3 - lea r2, [r2 + r3 * 4 - 32] -%else - psubw m0, m12 ; m0 = word: row 0 - psubw m1, m12 ; m1 = word: row 1 - psubw m2, m12 ; m2 = word: row 2 - psubw m3, m12 ; m3 = word: row 3 + lea r2, [r2 + r3 * 4] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r5], m3 - lea r2, [r2 + r3 * 4 - 64] -%endif - dec r6d - jnz .loopH - RET -%endif -%endmacro - FILTER_VER_CHROMA_AVX2_48x64 pp - FILTER_VER_CHROMA_AVX2_48x64 ps + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%macro FILTER_VER_CHROMA_AVX2_64xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_64x%2, 4, 8, 13 - mov r4d, r4m - shl r4d, 6 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - lea r5, [r3 * 3] - lea r7, [r1 * 4] - mov r6d, %2 / 4 -.loopH: -%assign x 0 -%rep 2 - movu m0, [r0 + x] ; m0 = row 0 - movu m1, [r0 + r1 + x] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2 + x] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4 + x] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2], m0 - movu [r2 + 
mmsize], m2 -%endif lea r0, [r0 + r1 * 4] - movu m0, [r0 + x] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif + lea r2, [r2 + r3 * 4] - movu m1, [r0 + r1 + x] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 m6, m6, m7, 0x31 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 + + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 -%endif - movu m0, [r0 + r1 * 2 + x] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 - add r2, 32 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], m2 - add r2, 64 -%endif - sub r0, r7 -%assign x x+32 -%endrep -%ifidn %1,pp - lea r2, [r2 + r3 * 4 - 64] -%else - lea r2, [r2 + r3 * 4 - 128] -%endif - add r0, r7 - dec r6d - jnz .loopH RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_64xN pp, 64 - FILTER_VER_CHROMA_AVX2_64xN pp, 48 - FILTER_VER_CHROMA_AVX2_64xN pp, 32 - FILTER_VER_CHROMA_AVX2_64xN pp, 16 - FILTER_VER_CHROMA_AVX2_64xN ps, 64 - 
FILTER_VER_CHROMA_AVX2_64xN ps, 48 - FILTER_VER_CHROMA_AVX2_64xN ps, 32 - FILTER_VER_CHROMA_AVX2_64xN ps, 16 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W16n_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 +INIT_YMM avx2 +cglobal filterPixelToShort_16x16, 3, 6, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] - mov r4d, r4m - sub r0, r1 + ; load constant + vbroadcasti128 m1, [pw_2000] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - mov r4d, %2/2 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -.loop: + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - mov r6d, %1/16 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] -.loopW: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - movu m2, [r0] - movu m3, [r0 + r1] + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - punpcklbw m4, m2, m3 - punpckhbw m2, m3 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - pmaddubsw m4, m1 - pmaddubsw m2, m1 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m6, [r5 + r1] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - punpckhbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m2, m7 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - punpcklbw m7, m5, m6 - 
pmaddubsw m7, m0 - paddw m4, m7 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - mova m7, [pw_512] + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - pmulhrsw m4, m7 - pmulhrsw m2, m7 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - packuswb m4, m2 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - movu [r2], m4 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - punpcklbw m4, m3, m5 - punpckhbw m3, m5 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - pmaddubsw m4, m1 - pmaddubsw m3, m1 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - movu m5, [r5 + 2 * r1] + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + RET - punpcklbw m2, m6, m5 - punpckhbw m6, m5 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x24, 3, 7, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, 3 - pmaddubsw m2, m0 - pmaddubsw m6, m0 + ; load constant + vbroadcasti128 m1, [pw_2000] +.loop: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - paddw m4, m2 - paddw m3, m6 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - pmulhrsw m4, m7 - pmulhrsw m3, m7 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - packuswb m4, m3 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - movu [r2 + r3], m4 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - add r0, 16 - add r2, 16 - dec r6d - jnz .loopW + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - lea r0, [r0 + r1 * 2 - %1] - lea r2, [r2 + r3 * 2 - %1] + pmovzxbw m0, [r0 + r1] + psllw m0, 
6 + psubw m0, m1 + movu [r2 + r3], m0 - dec r4d - jnz .loop - RET -%endmacro + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - FILTER_V4_W16n_H2 64, 64 - FILTER_V4_W16n_H2 64, 32 - FILTER_V4_W16n_H2 64, 48 - FILTER_V4_W16n_H2 48, 64 - FILTER_V4_W16n_H2 64, 16 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop + RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro P2S_H_2xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_2x%1, 3, 4, 3 - mov r3d, r3m - add r3d, r3d +%macro P2S_H_16xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_16x%1, 3, 7, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, %1/16 ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] + vbroadcasti128 m1, [pw_2000] +.loop: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%rep %1/2 - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - punpcklbw m0, m1 - pmaddubsw m0, m2 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - movd [r2 + r3 * 0], m0 - pextrd [r2 + r3 * 1], m0, 2 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] -%endrep - RET -%endmacro - P2S_H_2xN 4 - P2S_H_2xN 8 - P2S_H_2xN 16 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_4xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_4x%1, 3, 6, 4 - mov r3d, r3m - add r3d, r3d - 
lea r4, [r3 * 3] - lea r5, [r1 * 3] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - ; load constant - mova m2, [pb_128] - mova m3, [tab_c_64_n64] + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%assign x 0 -%rep %1/4 - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - punpcklbw m0, m2 - pmaddubsw m0, m3 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - movd m1, [r0 + r1 * 2] - pinsrd m1, [r0 + r5], 1 - punpcklbw m1, m2 - pmaddubsw m1, m3 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - movq [r2 + r3 * 0], m0 - movq [r2 + r3 * 2], m1 - movhps [r2 + r3 * 1], m0 - movhps [r2 + r4], m1 -%assign x x+1 -%if (x != %1/4) - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] -%endif -%endrep + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 + + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 + + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 + + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 + + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 + + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop RET %endmacro - P2S_H_4xN 4 - P2S_H_4xN 8 - P2S_H_4xN 16 - P2S_H_4xN 32 +P2S_H_16xN_avx2 32 +P2S_H_16xN_avx2 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro 
P2S_H_6xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_6x%1, 3, 7, 6 +%macro P2S_H_32xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_32x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] + lea r4, [r3 * 3] + lea r5, [r1 * 3] ; load height mov r6d, %1/4 @@ -12202,48 +10107,40 @@ cglobal filterPixelToShort_6x%1, 3, 7, 6 punpcklbw m2, m4 pmaddubsw m2, m5 - movh m3, [r0 + r4] + movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 - movh [r2 + r3 * 0], m0 - pextrd [r2 + r3 * 0 + 8], m0, 2 - movh [r2 + r3 * 1], m1 - pextrd [r2 + r3 * 1 + 8], m1, 2 - movh [r2 + r3 * 2], m2 - pextrd [r2 + r3 * 2 + 8], m2, 2 - movh [r2 + r5], m3 - pextrd [r2 + r5 + 8], m3, 2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 - dec r6d - jnz .loop - RET -%endmacro - P2S_H_6xN 8 - P2S_H_6xN 16 + lea r0, [r0 + 8] -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_8xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_8x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - ; load height - mov r4d, %1/4 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 1 + 16], m1 + movu [r2 + r3 * 2 + 16], m2 + movu [r2 + r4 + 16], m3 + + lea r0, [r0 + 8] -.loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 @@ -12260,87 +10157,128 @@ cglobal filterPixelToShort_8x%1, 3, 7, 6 punpcklbw m3, m4 pmaddubsw m3, m5 - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - 
movu [r2 + r3 * 2], m2 - movu [r2 + r6 ], m3 + movu [r2 + r3 * 0 + 32], m0 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 2 + 32], m2 + movu [r2 + r4 + 32], m3 - lea r0, [r0 + r1 * 4] + lea r0, [r0 + 8] + + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 + + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 + + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + movu [r2 + r3 * 0 + 48], m0 + movu [r2 + r3 * 1 + 48], m1 + movu [r2 + r3 * 2 + 48], m2 + movu [r2 + r4 + 48], m3 + + lea r0, [r0 + r1 * 4 - 24] lea r2, [r2 + r3 * 4] - dec r4d + dec r6d jnz .loop RET %endmacro - P2S_H_8xN 8 - P2S_H_8xN 4 - P2S_H_8xN 16 - P2S_H_8xN 32 - P2S_H_8xN 12 - P2S_H_8xN 64 + P2S_H_32xN 32 + P2S_H_32xN 8 + P2S_H_32xN 16 + P2S_H_32xN 24 + P2S_H_32xN 64 + P2S_H_32xN 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_8x6, 3, 7, 5 +%macro P2S_H_32xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_32x%1, 3, 7, 3 mov r3d, r3m add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r1 * 5] + lea r5, [r1 * 3] lea r6, [r3 * 3] - ; load constant - mova m3, [pb_128] - mova m4, [tab_c_64_n64] - - movh m0, [r0] - punpcklbw m0, m3 - pmaddubsw m0, m4 - - movh m1, [r0 + r1] - punpcklbw m1, m3 - pmaddubsw m1, m4 + ; load height + mov r4d, %1/4 - movh m2, [r0 + r1 * 2] - punpcklbw m2, m3 - pmaddubsw m2, m4 + ; load constant + vpbroadcastd m2, [pw_2000] - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 +.loop: + pmovzxbw m0, [r0 + 0 * mmsize/2] + pmovzxbw m1, [r0 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + 0 * mmsize], m0 + movu [r2 + 1 * mmsize], m1 - movh m0, [r0 + r4] - punpcklbw m0, m3 - pmaddubsw m0, m4 + pmovzxbw 
m0, [r0 + r1 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r3 + 0 * mmsize], m0 + movu [r2 + r3 + 1 * mmsize], m1 - movh m1, [r0 + r1 * 4] - punpcklbw m1, m3 - pmaddubsw m1, m4 + pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r3 * 2 + 0 * mmsize], m0 + movu [r2 + r3 * 2 + 1 * mmsize], m1 - movh m2, [r0 + r5] - punpcklbw m2, m3 - pmaddubsw m2, m4 + pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r6 + 0 * mmsize], m0 + movu [r2 + r6 + 1 * mmsize], m1 - movu [r2 + r6 ], m0 - movu [r2 + r3 * 4], m1 + lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] - movu [r2 + r3], m2 + dec r4d + jnz .loop RET +%endmacro + P2S_H_32xN_avx2 32 + P2S_H_32xN_avx2 8 + P2S_H_32xN_avx2 16 + P2S_H_32xN_avx2 24 + P2S_H_32xN_avx2 64 + P2S_H_32xN_avx2 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro P2S_H_16xN 1 +%macro P2S_H_64xN 1 INIT_XMM ssse3 -cglobal filterPixelToShort_16x%1, 3, 7, 6 +cglobal filterPixelToShort_64x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r4, [r3 * 3] lea r5, [r1 * 3] - ; load height + ; load height mov r6d, %1/4 ; load constant @@ -12392,650 +10330,671 @@ cglobal filterPixelToShort_16x%1, 3, 7, 6 movu [r2 + r3 * 2 + 16], m2 movu [r2 + r4 + 16], m3 - lea r0, [r0 + r1 * 4 - 8] - lea r2, [r2 + r3 * 4] + lea r0, [r0 + 8] - dec r6d - jnz .loop - RET -%endmacro - P2S_H_16xN 16 - P2S_H_16xN 4 - P2S_H_16xN 8 - P2S_H_16xN 12 - P2S_H_16xN 32 - P2S_H_16xN 64 - P2S_H_16xN 24 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 
-;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x4, 3, 4, 2 - mov r3d, r3m - add r3d, r3d + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - ; load constant - vbroadcasti128 m1, [pw_2000] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu [r2 + r3 * 0 + 32], m0 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 2 + 32], m2 + movu [r2 + r4 + 32], m3 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + lea r0, [r0 + 8] - lea r1, [r1 * 3] - lea r3, [r3 * 3] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - RET + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x8, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - ; load constant - vbroadcasti128 m1, [pw_2000] + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movu [r2 + r3 * 0 + 48], m0 + movu [r2 + r3 * 1 + 48], m1 + movu [r2 + r3 * 2 + 48], m2 + movu [r2 + r4 + 48], m3 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + lea r0, [r0 + 8] - pmovzxbw m0, 
[r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu [r2 + r3 * 0 + 64], m0 + movu [r2 + r3 * 1 + 64], m1 + movu [r2 + r3 * 2 + 64], m2 + movu [r2 + r4 + 64], m3 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + lea r0, [r0 + 8] - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - RET + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x12, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - ; load constant - vbroadcasti128 m1, [pw_2000] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu [r2 + r3 * 0 + 80], m0 + movu [r2 + r3 * 1 + 80], m1 + movu [r2 + r3 * 2 + 80], m2 + movu [r2 + r4 + 80], m3 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + lea r0, [r0 + 8] - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movh m0, [r0] + punpcklbw m0, m4 + 
pmaddubsw m0, m5 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 0 + 96], m0 + movu [r2 + r3 * 1 + 96], m1 + movu [r2 + r3 * 2 + 96], m2 + movu [r2 + r4 + 96], m3 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + lea r0, [r0 + 8] - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movu [r2 + r3 * 0 + 112], m0 + movu [r2 + r3 * 1 + 112], m1 + movu [r2 + r3 * 2 + 112], m2 + movu [r2 + r4 + 112], m3 + + lea r0, [r0 + r1 * 4 - 56] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop RET +%endmacro + P2S_H_64xN 64 + P2S_H_64xN 16 + P2S_H_64xN 32 + P2S_H_64xN 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- +%macro P2S_H_64xN_avx2 1 INIT_YMM avx2 -cglobal filterPixelToShort_16x16, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] +cglobal filterPixelToShort_64x%1, 3, 7, 5 + mov r3d, r3m + 
add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] - ; load constant - vbroadcasti128 m1, [pw_2000] + ; load height + mov r4d, %1/4 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + ; load constant + vpbroadcastd m4, [pw_2000] - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 +.loop: + pmovzxbw m0, [r0 + 0 * mmsize/2] + pmovzxbw m1, [r0 + 1 * mmsize/2] + pmovzxbw m2, [r0 + 2 * mmsize/2] + pmovzxbw m3, [r0 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu [r2 + 0 * mmsize], m0 + movu [r2 + 1 * mmsize], m1 + movu [r2 + 2 * mmsize], m2 + movu [r2 + 3 * mmsize], m3 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r1 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu [r2 + r3 + 0 * mmsize], m0 + movu [r2 + r3 + 1 * mmsize], m1 + movu [r2 + r3 + 2 * mmsize], m2 + movu [r2 + r3 + 3 * mmsize], m3 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + pmovzxbw m0, [r0 + 
r1 * 2 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r1 * 2 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movu [r2 + r3 * 2 + 0 * mmsize], m0 + movu [r2 + r3 * 2 + 1 * mmsize], m1 + movu [r2 + r3 * 2 + 2 * mmsize], m2 + movu [r2 + r3 * 2 + 3 * mmsize], m3 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r5 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu [r2 + r6 + 0 * mmsize], m0 + movu [r2 + r6 + 1 * mmsize], m1 + movu [r2 + r6 + 2 * mmsize], m2 + movu [r2 + r6 + 3 * mmsize], m3 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + dec r4d + jnz .loop RET +%endmacro + P2S_H_64xN_avx2 64 + P2S_H_64xN_avx2 16 + P2S_H_64xN_avx2 32 + P2S_H_64xN_avx2 48 ;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride) ;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x24, 3, 7, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, 3 +%macro P2S_H_12xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_12x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r6, [r3 * 3] + mov r5d, %1/4 ; load 
constant - vbroadcasti128 m1, [pw_2000] -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 +.loop: + movu m0, [r0] + punpcklbw m1, m0, m4 + punpckhbw m0, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu m2, [r0 + r1] + punpcklbw m3, m2, m4 + punpckhbw m2, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movu [r2 + r3 * 0], m1 + movu [r2 + r3 * 1], m3 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movh [r2 + r3 * 0 + 16], m0 + movh [r2 + r3 * 1 + 16], m2 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m4 + punpckhbw m0, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu m2, [r0 + r4] + punpcklbw m3, m2, m4 + punpckhbw m2, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2], m1 + movu [r2 + r6], m3 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movh [r2 + r3 * 2 + 16], m0 + movh [r2 + r6 + 16], m2 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - dec r6d - jnz .loop + dec r5d + jnz .loop RET +%endmacro + P2S_H_12xN 16 + P2S_H_12xN 32 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro P2S_H_16xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_16x%1, 3, 7, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/16 +%macro P2S_H_24xN 1 +INIT_XMM 
ssse3 +cglobal filterPixelToShort_24x%1, 3, 7, 5 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, %1/4 ; load constant - vbroadcasti128 m1, [pw_2000] -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + mova m3, [pb_128] + mova m4, [tab_c_64_n64] - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 +.loop: + movu m0, [r0] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu m2, [r0 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movu [r2 + r3 * 0], m1 + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 0 + 32], m2 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movu m0, [r0 + r1] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu m2, [r0 + r1 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 1 + 16], m0 + movu [r2 + r3 * 1 + 32], m2 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 + movu m2, [r0 + r1 * 2 + 16] + punpcklbw m2, 
m3 + pmaddubsw m2, m4 - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + movu [r2 + r3 * 2], m1 + movu [r2 + r3 * 2 + 16], m0 + movu [r2 + r3 * 2 + 32], m2 - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 + movu m0, [r0 + r4] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 + movu m2, [r0 + r4 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 + movu [r2 + r5], m1 + movu [r2 + r5 + 16], m0 + movu [r2 + r5 + 32], m2 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - dec r6d - jnz .loop + dec r6d + jnz .loop RET %endmacro -P2S_H_16xN_avx2 32 -P2S_H_16xN_avx2 64 + P2S_H_24xN 32 + P2S_H_24xN 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro P2S_H_32xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_32x%1, 3, 7, 6 +%macro P2S_H_24xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_24x%1, 3, 7, 4 mov r3d, r3m add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load height + lea r4, [r1 * 3] + lea r5, [r3 * 3] mov r6d, %1/4 ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] + vpbroadcastd m1, [pw_2000] + vpbroadcastd m2, [pb_128] + vpbroadcastd m3, [tab_c_64_n64] .loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh 
m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 1 + 16], m1 - movu [r2 + r3 * 2 + 16], m2 - movu [r2 + r4 + 16], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - movu [r2 + r3 * 0 + 32], m0 - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 2 + 32], m2 - movu [r2 + r4 + 32], m3 + movu m0, [r0 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 0 + mmsize], xm0 - lea r0, [r0 + 8] + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + movu m0, [r0 + r1 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 1 + mmsize], xm0 - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + movu m0, [r0 + r1 * 2 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 2 + mmsize], xm0 - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - movu [r2 + r3 * 0 + 48], m0 - movu [r2 + r3 * 1 + 48], m1 - movu [r2 + r3 * 2 + 48], m2 - movu [r2 + r4 + 48], m3 + movu m0, [r0 + r4 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r5 + mmsize], xm0 - lea r0, [r0 + r1 * 4 - 24] + lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro - P2S_H_32xN 32 - P2S_H_32xN 8 - P2S_H_32xN 16 - P2S_H_32xN 24 - P2S_H_32xN 64 - P2S_H_32xN 48 + P2S_H_24xN_avx2 32 + P2S_H_24xN_avx2 64 
;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro P2S_H_32xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_32x%1, 3, 7, 3 +INIT_XMM ssse3 +cglobal filterPixelToShort_48x64, 3, 7, 4 mov r3d, r3m add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, %1/4 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, 16 ; load constant - vpbroadcastd m2, [pw_2000] + mova m2, [pb_128] + mova m3, [tab_c_64_n64] +.loop: + movu m0, [r0] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 0], m1 + movu [r2 + r3 * 0 + 16], m0 + + movu m0, [r0 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 0 + 32], m1 + movu [r2 + r3 * 0 + 48], m0 + + movu m0, [r0 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 0 + 64], m1 + movu [r2 + r3 * 0 + 80], m0 + + movu m0, [r0 + r1] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 1 + 16], m0 + + movu m0, [r0 + r1 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 1 + 48], m0 + + movu m0, [r0 + r1 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 1 + 64], m1 + movu [r2 + r3 * 1 + 80], m0 + + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 2], m1 + movu [r2 + r3 * 2 + 16], m0 + + movu m0, [r0 + r1 * 2 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 2 + 32], m1 + movu [r2 + r3 * 2 + 48], m0 + + movu m0, [r0 + r1 * 2 + 32] + 
punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r3 * 2 + 64], m1 + movu [r2 + r3 * 2 + 80], m0 + + movu m0, [r0 + r4] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r5], m1 + movu [r2 + r5 + 16], m0 + + movu m0, [r0 + r4 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r5 + 32], m1 + movu [r2 + r5 + 48], m0 + + movu m0, [r0 + r4 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + + movu [r2 + r5 + 64], m1 + movu [r2 + r5 + 80], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_48x64, 3,7,4 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] + + ; load height + mov r4d, 64/4 + + ; load constant + vpbroadcastd m3, [pw_2000] + + ; just unroll(1) because it is best choice for 48x64 .loop: pmovzxbw m0, [r0 + 0 * mmsize/2] pmovzxbw m1, [r0 + 1 * mmsize/2] + pmovzxbw m2, [r0 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 - psubw m0, m2 - psubw m1, m2 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 movu [r2 + 0 * mmsize], m0 movu [r2 + 1 * mmsize], m1 + movu [r2 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 - psubw m0, m2 - psubw m1, m2 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 movu [r2 + r3 + 0 * mmsize], m0 movu [r2 + r3 + 1 * mmsize], m1 + movu [r2 + r3 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 - 
psubw m0, m2 - psubw m1, m2 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 movu [r2 + r3 * 2 + 0 * mmsize], m0 movu [r2 + r3 * 2 + 1 * mmsize], m1 + movu [r2 + r3 * 2 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 - psubw m0, m2 - psubw m1, m2 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 movu [r2 + r6 + 0 * mmsize], m0 movu [r2 + r6 + 1 * mmsize], m1 + movu [r2 + r6 + 2 * mmsize], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] @@ -13043,1014 +11002,1403 @@ cglobal filterPixelToShort_32x%1, 3, 7, 3 dec r4d jnz .loop RET -%endmacro - P2S_H_32xN_avx2 32 - P2S_H_32xN_avx2 8 - P2S_H_32xN_avx2 16 - P2S_H_32xN_avx2 24 - P2S_H_32xN_avx2 64 - P2S_H_32xN_avx2 48 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_64xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_64x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - ; load height - mov r6d, %1/4 +%macro PROCESS_LUMA_W4_4R 0 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[0 1] - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[1 2] + punpcklqdq m2, m1 ; m2=[0 1 1 2] + pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + movd m1, [r0 + r1] + punpcklbw m5, m0, m1 ; m2=[2 3] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[3 4] + punpcklqdq m5, m1 ; m5=[2 3 3 4] + pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] + paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 + pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + movd m1, [r0 + r1] + 
punpcklbw m2, m0, m1 ; m2=[4 5] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[5 6] + punpcklqdq m2, m1 ; m2=[4 5 5 6] + pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] + paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 + pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] + paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[6 7] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[7 8] + punpcklqdq m2, m1 ; m2=[6 7 7 8] + pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] + paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end + pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] + paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[8 9] + movd m0, [r0 + 2 * r1] + punpcklbw m1, m0 ; m1=[9 10] + punpcklqdq m2, m1 ; m2=[8 9 9 10] + pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] + paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end +%endmacro - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 +%macro PROCESS_LUMA_W8_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 + pmaddubsw m0, [r6 + 1 * 16] + paddw m7, m0 ;m7=[0+1+2+3] Row1 - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 1 + 16], m1 - movu [r2 + r3 * 2 + 16], m2 - movu [r2 + r4 + 16], m3 + lea r0, [r0 + 2 * r1] + movq m0, [r0] + 
punpcklbw m1, m0 + pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 + pmaddubsw m1, [r6 + 1 * 16] + paddw m6, m1 ;m6 = [1+2+3+4] Row2 - lea r0, [r0 + 8] + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 1 * 16] + pmaddubsw m0, [r6 + 2 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 + paddw m5, m2 ;m5=[2+3+4+5] Row3 - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 1 * 16] + pmaddubsw m1, [r6 + 2 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 + paddw m4, m2 ;m4=[3+4+5+6] Row4 - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 2 * 16] + pmaddubsw m0, [r6 + 3 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end + paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 2 * 16] + pmaddubsw m1, [r6 + 3 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end + paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m0, [r6 + 3 * 16] + paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end - movu [r2 + r3 * 0 + 32], m0 - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 2 + 32], m2 - movu [r2 + r4 + 32], m3 + movq m0, [r0 + 2 * r1] + punpcklbw m1, m0 + pmaddubsw m1, [r6 + 3 * 16] + paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end +%endmacro - lea r0, [r0 + 8] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_4xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 + lea r5, [3 * r1] + sub r0, r5 + shl 
r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 +%ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + mov r4d, %2/4 + lea r5, [4 * r1] - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 +.loopH: + PROCESS_LUMA_W4_4R - movu [r2 + r3 * 0 + 48], m0 - movu [r2 + r3 * 1 + 48], m1 - movu [r2 + r3 * 2 + 48], m2 - movu [r2 + r4 + 48], m3 +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 - lea r0, [r0 + 8] + packuswb m4, m5 - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m4, 2 + pextrd [r2 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%endif - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + sub r0, r5 + lea r2, [r2 + 2 * r3] - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + dec r4d + jnz .loopH - movu [r2 + r3 * 0 + 64], m0 - movu [r2 + r3 * 1 + 64], m1 - movu [r2 + r3 * 2 + 64], m2 - movu [r2 + r4 + 64], m3 + RET +%endmacro - lea r0, [r0 + 8] - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_4x4, 4,6,8 + mov r4d, r4m + lea r5, [r1 * 3] + sub r0, r5 - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + ; TODO: VPGATHERDD + movd xm1, [r0] ; m1 = row0 + movd xm2, [r0 + r1] ; m2 = row1 + punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + movd xm3, [r0 + r1 * 2] ; m3 = row2 + punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] + movd xm4, [r0 + r5] + punpcklbw xm3, xm4 ; m3 = [33 
23 32 22 31 21 30 20] + punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 + lea r0, [r0 + r1 * 4] + movd xm5, [r0] ; m5 = row4 + punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] + punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] + movd xm2, [r0 + r1] ; m2 = row5 + punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] + punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm6, [r0 + r1 * 2] ; m6 = row6 + punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] + punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] + vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm4, [r0 + r5] ; m4 = row7 + punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] + punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - movu [r2 + r3 * 0 + 80], m0 - movu [r2 + r3 * 1 + 80], m1 - movu [r2 + r3 * 2 + 80], m2 - movu [r2 + r4 + 80], m3 + lea r0, [r0 + r1 * 4] + movd xm7, [r0] ; m7 = row8 + punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] + punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] + vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + movd xm2, [r0 + r1] ; m2 = row9 + punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] + punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movd xm7, [r0 + r1 * 2] ; m7 = rowA + punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] + punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] + vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 
92 82 72 62 91 81 71 61 90 80 70 60] - lea r0, [r0 + 8] + ; load filter coeff +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8 + 0] + vpbroadcastd m2, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] + vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] +%endif - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddubsw m5, m2 + pmaddubsw m6, m2 + vbroadcasti128 m0, [pw_1] + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m5, m0 + pmaddwd m6, m0 + paddd m1, m5 ; m1 = DQWORD ROW[1 0] + paddd m3, m6 ; m3 = DQWORD ROW[3 2] + packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + ; TODO: does it overflow? + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 2 + pextrd [r2 + r3 * 2], xm1, 1 + lea r4, [r3 * 3] + pextrd [r2 + r4], xm1, 3 + RET - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 +INIT_YMM avx2 +cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 7 - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif - movu [r2 + r3 * 0 + 96], m0 - movu [r2 + r3 * 1 + 96], m1 - movu [r2 + r3 * 2 + 96], m2 - movu [r2 + r4 + 96], m3 + lea r4, [r1 * 3] + sub r0, r4 - lea r0, [r0 + 8] + add r3d, r3d - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 
9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m4, m4, m3 + pshufb m2, m2, m3 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m4, [r5 + 2 * mmsize] + pmaddubsw m2, [r5 + 3 * mmsize] + paddw m0, m1 + paddw m0, m4 + paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 + psubw m0, [pw_2000] + vextracti128 xm2, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + RET - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 +%macro FILTER_VER_LUMA_AVX2_4xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 + mov r4d, r4m + shl r4d, 7 - movu [r2 + r3 * 0 + 112], m0 - movu [r2 + r3 * 1 + 112], m1 - movu [r2 + r3 * 2 + 112], m2 - movu [r2 + r4 + 112], m3 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif - lea r0, [r0 + r1 * 4 - 56] - lea r2, [r2 + r3 * 4] + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m6, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m6, [pw_2000] +%endif + lea r8, [r3 * 3] + mova m5, [interp4_vpp_shuf] + mova m0, [interp4_vpp_shuf1] + mova m7, [interp4_vpp_shuf1 + mmsize] + mov r7d, %2 / 8 +.loop: + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd 
xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] + vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] + vpermd m4, m0, m2 ; m4 = row[8 7 7 6 6 5 5 4] + vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] + vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] - dec r6d - jnz .loop + pshufb m8, m8, m5 + pshufb m1, m1, m5 + pshufb m4, m4, m5 + pshufb m9, m9, m5 + pshufb m2, m2, m5 + pshufb m3, m3, m5 + pmaddubsw m8, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m9, [r5 + 2 * mmsize] + pmaddubsw m3, [r5 + 3 * mmsize] + paddw m8, m1 + paddw m9, m3 + pmaddubsw m1, m4, [r5 + 2 * mmsize] + pmaddubsw m3, m2, [r5 + 3 * mmsize] + pmaddubsw m4, [r5] + pmaddubsw m2, [r5 + mmsize] + paddw m3, m1 + paddw m2, m4 + paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] + paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] + +%ifidn %3,pp + pmulhrsw m8, m6 + pmulhrsw m9, m6 + packuswb m8, m9 + vextracti128 xm1, m8, 1 + movd [r2], xm8 + pextrd [r2 + r3], xm8, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r8], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm8, 2 + pextrd [r2 + r3], xm8, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r8], xm1, 3 +%else + psubw m8, m6 + psubw m9, m6 + vextracti128 xm1, m8, 1 + vextracti128 xm2, m9, 1 + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r8], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm9 + movhps [r2 + r3], xm9 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r8], xm2 +%endif + lea r2, [r2 + r3 * 4] + sub r0, r6 + dec r7d + jnz .loop RET +%endif %endmacro - P2S_H_64xN 64 - P2S_H_64xN 16 - P2S_H_64xN 32 - P2S_H_64xN 48 
-;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_64xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_64x%1, 3, 7, 5 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 4, pp - ; load height - mov r4d, %1/4 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 8, pp + FILTER_VER_LUMA_AVX2_4xN 4, 8, pp - ; load constant - vpbroadcastd m4, [pw_2000] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 16, pp + FILTER_VER_LUMA_AVX2_4xN 4, 16, pp -.loop: - pmovzxbw m0, [r0 + 0 * mmsize/2] - pmovzxbw m1, [r0 + 1 * mmsize/2] - pmovzxbw m2, [r0 + 2 * mmsize/2] - pmovzxbw m3, [r0 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 
+;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 4, ps - movu [r2 + 0 * mmsize], m0 - movu [r2 + 1 * mmsize], m1 - movu [r2 + 2 * mmsize], m2 - movu [r2 + 3 * mmsize], m3 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 8, ps + FILTER_VER_LUMA_AVX2_4xN 4, 8, ps - pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] - pmovzxbw m3, [r0 + r1 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 16, ps + FILTER_VER_LUMA_AVX2_4xN 4, 16, ps - movu [r2 + r3 + 0 * mmsize], m0 - movu [r2 + r3 + 1 * mmsize], m1 - movu [r2 + r3 + 2 * mmsize], m2 - movu [r2 + r3 + 3 * mmsize], m3 +%macro PROCESS_LUMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + 
vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 
96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 - pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] - pmovzxbw m3, [r0 + r1 * 2 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 12 + punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] + vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m6, [r5 + 2 * mmsize] + paddw m4, m6 + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + movq xm6, [r0 + r1 * 2] ; m6 = row 14 + punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] + vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + pmaddubsw m0, [r5 + 3 * mmsize] + paddw m4, m0 +%endmacro - movu [r2 + r3 * 2 + 0 * mmsize], m0 - movu [r2 + r3 * 2 + 1 * mmsize], m1 - movu [r2 + r3 * 2 + 2 * mmsize], m2 - movu [r2 + r3 * 2 + 3 * mmsize], m3 +%macro PROCESS_LUMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + 
pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 +%endmacro - pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] - 
pmovzxbw m3, [r0 + r5 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_8xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 - movu [r2 + r6 + 0 * mmsize], m0 - movu [r2 + r6 + 1 * mmsize], m1 - movu [r2 + r6 + 2 * mmsize], m2 - movu [r2 + r6 + 3 * mmsize], m3 +%ifidn %3,ps + add r3d, r3d +%endif - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif - dec r4d - jnz .loop - RET -%endmacro - P2S_H_64xN_avx2 64 - P2S_H_64xN_avx2 16 - P2S_H_64xN_avx2 32 - P2S_H_64xN_avx2 48 + %ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_12xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_12x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r6, [r3 * 3] - mov r5d, %1/4 + mov r4d, %2/4 + lea r5, [4 * r1] - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] +.loopH: + PROCESS_LUMA_W8_4R -.loop: - movu m0, [r0] - punpcklbw m1, m0, m4 - punpckhbw m0, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 - movu m2, [r0 + r1] - punpcklbw m3, m2, m4 - punpckhbw m2, m4 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + packuswb m7, m6 
+ packuswb m5, m4 - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 1], m3 + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 - movh [r2 + r3 * 0 + 16], m0 - movh [r2 + r3 * 1 + 16], m2 - - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m4 - punpckhbw m0, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - - movu m2, [r0 + r4] - punpcklbw m3, m2, m4 - punpckhbw m2, m4 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - - movu [r2 + r3 * 2], m1 - movu [r2 + r6], m3 + movu [r2], m7 + movu [r2 + r3], m6 + lea r2, [r2 + 2 * r3] + movu [r2], m5 + movu [r2 + r3], m4 +%endif - movh [r2 + r3 * 2 + 16], m0 - movh [r2 + r6 + 16], m2 + sub r0, r5 + lea r2, [r2 + 2 * r3] - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + dec r4d + jnz .loopH - dec r5d - jnz .loop RET %endmacro - P2S_H_12xN 16 - P2S_H_12xN 32 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_24xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_24x%1, 3, 7, 5 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/4 +%macro FILTER_VER_LUMA_AVX2_8xN 3 +INIT_YMM avx2 +cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize + mov r4d, r4m + shl r4d, 7 - ; load constant - mova m3, [pb_128] - mova m4, [tab_c_64_n64] +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + mov word [rsp], %2 / 8 .loop: - movu m0, [r0] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 
0 + 16], m0 - movu [r2 + r3 * 0 + 32], m2 - - movu m0, [r0 + r1] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + r1 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 1 + 16], m0 - movu [r2 + r3 * 1 + 32], m2 - - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + r1 * 2 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 2], m1 - movu [r2 + r3 * 2 + 16], m0 - movu [r2 + r3 * 2 + 32], m2 - - movu m0, [r0 + r4] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 + PROCESS_LUMA_AVX2_W8_8R +%ifidn %3,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movhps [r2], xm5 + movhps [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movq [r2], xm1 + movq [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm1 + movhps [r2 + r3], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + lea r2, [r2 + r3 * 2] + movu [r2], xm2 + movu [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movu [r2], xm1 + movu [r2 + r3], xm0 + lea r2, [r2 + r3 * 2] + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r3], xm4 +%endif + lea r2, [r2 + r3 * 2] + sub r0, r6 + dec word [rsp] + jnz .loop + RET +%endmacro - movu m2, [r0 + r4 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - movu [r2 + r5], m1 - movu [r2 + r5 + 16], m0 - movu [r2 + r5 + 32], m2 +%macro FILTER_VER_LUMA_AVX2_8x8 1 +INIT_YMM avx2 +cglobal 
interp_8tap_vert_%1_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif - dec r6d - jnz .loop + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_8R +%ifidn %1,pp + mova m3, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] +%endif + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 +%else + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif RET %endmacro - P2S_H_24xN 32 - P2S_H_24xN 64 -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_24xN_avx2 1 +%macro FILTER_VER_LUMA_AVX2_8x4 1 INIT_YMM avx2 -cglobal filterPixelToShort_24x%1, 3, 7, 4 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/4 - - ; load constant - vpbroadcastd m1, [pw_2000] - vpbroadcastd m2, [pb_128] - vpbroadcastd m3, [tab_c_64_n64] +cglobal 
interp_8tap_vert_%1_8x4, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - movu m0, [r0 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 0 + mmsize], xm0 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_4R +%ifidn %1,pp + mova m3, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] +%endif + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m5, m2 + vextracti128 xm2, m5, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 +%else + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + movu [r2], xm5 + vextracti128 xm5, m5, 1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movu [r2 + r4], xm2 +%endif + RET +%endmacro - movu m0, [r0 + r1 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 1 + mmsize], xm0 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 4, pp + FILTER_VER_LUMA_AVX2_8x4 pp - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- 
+ FILTER_VER_LUMA_8xN 8, 8, pp + FILTER_VER_LUMA_AVX2_8x8 pp - movu m0, [r0 + r1 * 2 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 2 + mmsize], xm0 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 16, pp + FILTER_VER_LUMA_AVX2_8xN 8, 16, pp - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 32, pp + FILTER_VER_LUMA_AVX2_8xN 8, 32, pp - movu m0, [r0 + r4 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r5 + mmsize], xm0 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 4, ps + FILTER_VER_LUMA_AVX2_8x4 ps - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 8, ps + FILTER_VER_LUMA_AVX2_8x8 ps - dec 
r6d - jnz .loop - RET -%endmacro - P2S_H_24xN_avx2 32 - P2S_H_24xN_avx2 64 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 16, ps + FILTER_VER_LUMA_AVX2_8xN 8, 16, ps -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_48x64, 3, 7, 4 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, 16 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 32, ps + FILTER_VER_LUMA_AVX2_8xN 8, 32, ps - ; load constant - mova m2, [pb_128] - mova m3, [tab_c_64_n64] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_12xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif -.loop: - movu m0, [r0] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea 
r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 0 + 16], m0 + %ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif - movu m0, [r0 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 + mov r4d, %2/4 - movu [r2 + r3 * 0 + 32], m1 - movu [r2 + r3 * 0 + 48], m0 +.loopH: + PROCESS_LUMA_W8_4R - movu m0, [r0 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 - movu [r2 + r3 * 0 + 64], m1 - movu [r2 + r3 * 0 + 80], m0 + packuswb m7, m6 + packuswb m5, m4 - movu m0, [r0 + r1] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 1 + 16], m0 + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif - movu m0, [r0 + r1 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 +%else + add r2, 16 +%endif - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 1 + 48], m0 + PROCESS_LUMA_W4_4R - movu m0, [r0 + r1 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 - movu [r2 + r3 * 1 + 64], m1 - movu [r2 + r3 * 1 + 80], m0 + packuswb m4, m5 - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m4, 2 + pextrd [r5 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 - movu [r2 + r3 * 2], m1 - movu [r2 + r3 * 2 + 16], m0 + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r5, [r2 + 2 * r3] 
+ movlps [r5], m5 + movhps [r5 + r3], m5 +%endif - movu m0, [r0 + r1 * 2 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 2 + 32], m1 - movu [r2 + r3 * 2 + 48], m0 - - movu m0, [r0 + r1 * 2 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 2 + 64], m1 - movu [r2 + r3 * 2 + 80], m0 - - movu m0, [r0 + r4] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5], m1 - movu [r2 + r5 + 16], m0 - - movu m0, [r0 + r4 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5 + 32], m1 - movu [r2 + r5 + 48], m0 - - movu m0, [r0 + r4 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5 + 64], m1 - movu [r2 + r5 + 80], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_48x64, 3,7,4 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, 64/4 - - ; load constant - vpbroadcastd m3, [pw_2000] - - ; just unroll(1) because it is best choice for 48x64 -.loop: - pmovzxbw m0, [r0 + 0 * mmsize/2] - pmovzxbw m1, [r0 + 1 * mmsize/2] - pmovzxbw m2, [r0 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + 0 * mmsize], m0 - movu [r2 + 1 * mmsize], m1 - movu [r2 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 + 0 * mmsize], m0 
- movu [r2 + r3 + 1 * mmsize], m1 - movu [r2 + r3 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 * 2 + 0 * mmsize], m0 - movu [r2 + r3 * 2 + 1 * mmsize], m1 - movu [r2 + r3 * 2 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r6 + 0 * mmsize], m0 - movu [r2 + r6 + 1 * mmsize], m1 - movu [r2 + r6 + 2 * mmsize], m2 + lea r5, [4 * r1 + 8] + sub r0, r5 +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - 8] +%else + lea r2, [r2 + 4 * r3 - 16] +%endif - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] + dec r4d + jnz .loopH - dec r4d - jnz .loop RET - - -%macro PROCESS_LUMA_W4_4R 0 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[0 1] - - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[1 2] - punpcklqdq m2, m1 ; m2=[0 1 1 2] - pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] - - movd m1, [r0 + r1] - punpcklbw m5, m0, m1 ; m2=[2 3] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[3 4] - punpcklqdq m5, m1 ; m5=[2 3 3 4] - pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] - paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 - pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 - - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[4 5] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[5 6] - punpcklqdq m2, m1 ; m2=[4 5 5 6] - pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] - paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 - pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] - paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 - - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[6 7] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[7 8] - punpcklqdq m2, 
m1 ; m2=[6 7 7 8] - pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] - paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end - pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] - paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 - - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[8 9] - movd m0, [r0 + 2 * r1] - punpcklbw m1, m0 ; m1=[9 10] - punpcklqdq m2, m1 ; m2=[8 9 9 10] - pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] - paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end %endmacro -%macro PROCESS_LUMA_W8_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 - pmaddubsw m0, [r6 + 1 * 16] - paddw m7, m0 ;m7=[0+1+2+3] Row1 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_12xN 12, 16, pp - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 - pmaddubsw m1, [r6 + 1 * 16] - paddw m6, m1 ;m6 = [1+2+3+4] Row2 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_12xN 12, 16, ps - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 1 * 16] - pmaddubsw m0, [r6 + 2 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 - paddw m5, m2 ;m5=[2+3+4+5] Row3 +%macro 
FILTER_VER_LUMA_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 1 * 16] - pmaddubsw m1, [r6 + 2 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 - paddw m4, m2 ;m4=[3+4+5+6] Row4 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 2 * 16] - pmaddubsw m0, [r6 + 3 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end - paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r6, [r3 * 3] - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 2 * 16] - pmaddubsw m1, [r6 + 3 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end - paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m0, [r6 + 3 * 16] - paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end - - movq m0, [r0 + 2 * r1] - punpcklbw m1, m0 - pmaddubsw m1, [r6 + 3 * 16] - paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_4xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + 
vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + 
pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movq [r2], xm0 + pextrd [r2 + 8], xm0, 2 + movq [r2 + r3], xm1 + pextrd [r2 + r3 + 8], xm1, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r6], xm3 + pextrd [r2 + r6 + 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2 + r3 + 8], xm5, 2 %else - lea r6, [tab_LumaCoeffVer + r4] + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 
16], xm1 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 %endif -%ifidn %3,pp - mova m3, [pw_512] + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movq [r2 + r3 * 2], xm6 + pextrd [r2 + r3 * 2 + 8], xm6, 2 + movq [r2 + r6], xm7 + pextrd [r2 + r6 + 8], xm7, 2 %else - mova m3, [pw_2000] + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm7 + vextracti128 xm7, m7, 1 + movq [r2 + r6 + 16], xm7 %endif + lea r2, [r2 + r3 * 4] - mov r4d, %2/4 - lea r5, [4 * r1] - -.loopH: - PROCESS_LUMA_W4_4R - -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 - - packuswb m4, m5 - - movd [r2], m4 - pextrd [r2 + r3], m4, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m4, 2 - pextrd [r2 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 - - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 -%endif - - sub r0, r5 - lea r2, [r2 + 2 * r3] - - dec 
r4d - jnz .loopH - - RET -%endmacro - - -INIT_YMM avx2 -cglobal interp_8tap_vert_pp_4x4, 4,6,8 - mov r4d, r4m - lea r5, [r1 * 3] - sub r0, r5 - - ; TODO: VPGATHERDD - movd xm1, [r0] ; m1 = row0 - movd xm2, [r0 + r1] ; m2 = row1 - punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] - - movd xm3, [r0 + r1 * 2] ; m3 = row2 - punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] - movd xm4, [r0 + r5] - punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] - punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] - movd xm5, [r0] ; m5 = row4 - punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] - punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - movd xm2, [r0 + r1] ; m2 = row5 - punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] - punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm6, [r0 + r1 * 2] ; m6 = row6 - punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] - punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm4, [r0 + r5] ; m4 = row7 - punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] - punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, 
m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 lea r0, [r0 + r1 * 4] - movd xm7, [r0] ; m7 = row8 - punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] - punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - movd xm2, [r0 + r1] ; m2 = row9 - punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] - punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] - movd xm7, [r0 + r1 * 2] ; m7 = rowA - punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] - punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, 
m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 - ; load filter coeff -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8 + 0] - vpbroadcastd m2, [r5 + r4 * 8 + 4] +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movq [r2], xm8 + pextrd [r2 + 8], xm8, 2 + movq [r2 + r3], xm9 + pextrd [r2 + r3 + 8], xm9, 2 + movq [r2 + r3 * 2], xm10 + pextrd [r2 + r3 * 2 + 8], xm10, 2 + movq [r2 + r6], xm11 + pextrd [r2 + r6 + 8], xm11, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm12 + pextrd [r2 + 8], xm12, 2 + movq [r2 + r3], xm13 + pextrd [r2 + r3 + 8], xm13, 2 + movq [r2 + r3 * 2], xm0 + pextrd [r2 + r3 * 2 + 8], xm0, 2 + movq [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm1, 2 %else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] - vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], xm8 + vextracti128 xm8, m8, 1 + movq [r2 + 16], xm8 + movu [r2 + r3], xm9 + vextracti128 xm9, m9, 1 + movq [r2 + r3 + 16], xm9 + movu [r2 + r3 * 2], xm10 + vextracti128 xm10, m10, 1 + movq [r2 + r3 * 2 + 16], 
xm10 + movu [r2 + r6], xm11 + vextracti128 xm11, m11, 1 + movq [r2 + r6 + 16], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + vextracti128 xm12, m12, 1 + movq [r2 + 16], xm12 + movu [r2 + r3], xm13 + vextracti128 xm13, m13, 1 + movq [r2 + r3 + 16], xm13 + movu [r2 + r3 * 2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 * 2 + 16], xm0 + movu [r2 + r6], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r6 + 16], xm1 %endif - - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddubsw m5, m2 - pmaddubsw m6, m2 - vbroadcasti128 m0, [pw_1] - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m5, m0 - pmaddwd m6, m0 - paddd m1, m5 ; m1 = DQWORD ROW[1 0] - paddd m3, m6 ; m3 = DQWORD ROW[3 2] - packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] - - ; TODO: does it overflow? - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 2 - pextrd [r2 + r3 * 2], xm1, 1 - lea r4, [r3 * 3] - pextrd [r2 + r4], xm1, 3 RET +%endif +%endmacro + + FILTER_VER_LUMA_AVX2_12x16 pp + FILTER_VER_LUMA_AVX2_12x16 ps +%macro FILTER_VER_LUMA_AVX2_16x16 1 INIT_YMM avx2 -cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 mov r4d, r4m shl r4d, 7 @@ -14063,754 +12411,629 @@ cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 lea r4, [r1 * 3] sub r0, r4 - +%ifidn %1,pp + mova m14, [pw_512] +%else add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r6, [r3 * 3] - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] - mova m3, [interp4_vpp_shuf1] - vpermd m0, m3, 
m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] - mova m3, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - - mova m3, [interp4_vpp_shuf] - pshufb m0, m0, m3 - pshufb m1, m1, m3 - pshufb m4, m4, m3 - pshufb m2, m2, m3 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m4, [r5 + 2 * mmsize] - pmaddubsw m2, [r5 + 3 * mmsize] - paddw m0, m1 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 - paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] - - psubw m0, [pw_2000] - vextracti128 xm2, m0, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 - RET - -%macro FILTER_VER_LUMA_AVX2_4xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 - mov r4d, r4m - shl r4d, 7 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + 
punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 
+ pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 %endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r1 * 4] -%ifidn %3,pp - mova m6, [pw_512] + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 %else - add r3d, r3d - vbroadcasti128 m6, [pw_2000] + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 %endif - lea r8, [r3 * 3] - mova m5, 
[interp4_vpp_shuf] - mova m0, [interp4_vpp_shuf1] - mova m7, [interp4_vpp_shuf1 + mmsize] - mov r7d, %2 / 8 -.loop: - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 - pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] - lea r0, [r0 + r1 * 4] - movd xm4, [r0] - pinsrd xm4, [r0 + r1], 1 - pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] - vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] - vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] - vpermd m4, m0, m2 ; m4 = row[8 7 7 6 6 5 5 4] - vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] - vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] - vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] + lea r2, [r2 + r3 * 4] - pshufb m8, m8, m5 - pshufb m1, m1, m5 - pshufb m4, m4, m5 - pshufb m9, m9, m5 - pshufb m2, m2, m5 - pshufb m3, m3, m5 - pmaddubsw m8, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m9, [r5 + 2 * mmsize] - pmaddubsw m3, [r5 + 3 * mmsize] - paddw m8, m1 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 - pmaddubsw m1, m4, [r5 + 2 * mmsize] - pmaddubsw m3, m2, [r5 + 3 * mmsize] - pmaddubsw m4, [r5] - pmaddubsw 
m2, [r5 + mmsize] - paddw m3, m1 - paddw m2, m4 - paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] - paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 -%ifidn %3,pp - pmulhrsw m8, m6 - pmulhrsw m9, m6 +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 - vextracti128 xm1, m8, 1 - 
movd [r2], xm8 - pextrd [r2 + r3], xm8, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r8], xm1, 1 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 lea r2, [r2 + r3 * 4] - pextrd [r2], xm8, 2 - pextrd [r2 + r3], xm8, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r8], xm1, 3 + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 %else - psubw m8, m6 - psubw m9, m6 - vextracti128 xm1, m8, 1 - vextracti128 xm2, m9, 1 - movq [r2], xm8 - movhps [r2 + r3], xm8 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r8], xm1 + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 lea r2, [r2 + r3 * 4] - movq [r2], xm9 - movhps [r2 + r3], xm9 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r8], xm2 + movu [r2], m12 + movu [r2 + r3], m13 + movu [r2 + r3 * 2], m0 + movu [r2 + r6], m1 %endif - lea r2, [r2 + r3 * 4] - sub r0, r6 - dec r7d - jnz .loop RET %endif %endmacro -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 4, pp - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 8, pp - FILTER_VER_LUMA_AVX2_4xN 4, 8, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 16, pp - FILTER_VER_LUMA_AVX2_4xN 4, 16, pp + FILTER_VER_LUMA_AVX2_16x16 pp + FILTER_VER_LUMA_AVX2_16x16 ps -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 4, ps +%macro FILTER_VER_LUMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 8, ps - FILTER_VER_LUMA_AVX2_4xN 4, 8, ps +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 16, ps - FILTER_VER_LUMA_AVX2_4xN 4, 16, ps + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r6, [r3 * 3] -%macro PROCESS_LUMA_AVX2_W8_8R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 
1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m0, [r5 + 1 * mmsize] - paddw m4, m0 - - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 
A2 B1 A1 B0 A0] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 12 - punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m6, [r5 + 2 * mmsize] - paddw m4, m6 - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - movq xm6, [r0 + r1 * 2] ; m6 = row 14 - punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - pmaddubsw m0, [r5 + 3 * mmsize] - paddw m4, m0 -%endmacro - -%macro PROCESS_LUMA_AVX2_W8_4R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 
34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + 
paddw m8, m12 + pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_8xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 - -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] - %ifidn %3,pp - mova m3, [pw_512] +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = 
word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 %else - mova m3, [pw_2000] + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 %endif - mov r4d, %2/4 - lea r5, [4 * r1] - -.loopH: - PROCESS_LUMA_W8_4R - -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 %else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r2, [r2 + 2 * 
r3] - movu [r2], m5 - movu [r2 + r3], m4 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 %endif + lea r2, [r2 + r3 * 4] - sub r0, r5 - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - -%macro FILTER_VER_LUMA_AVX2_8xN 3 -INIT_YMM avx2 -cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r1 * 4] -%ifidn %3,pp - mova m7, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - mov word [rsp], %2 / 8 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 -.loop: - PROCESS_LUMA_AVX2_W8_8R -%ifidn %3,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movhps [r2], xm5 - movhps [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movq [r2], xm1 - movq [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], 
xm1 - movhps [r2 + r3], xm4 +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + packuswb m8, m9 + packuswb m10, m11 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 %else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 - lea r2, [r2 + r3 * 2] - movu [r2], xm2 - movu [r2 + r3], xm3 - lea r2, [r2 + r3 * 2] - movu [r2], xm1 - movu [r2 + r3], xm0 - lea r2, [r2 + r3 * 2] - movu [r2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r3], xm4 + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 %endif - lea r2, [r2 + r3 * 2] - sub r0, r6 - dec word [rsp] - jnz .loop RET +%endif %endmacro -%macro FILTER_VER_LUMA_AVX2_8x8 1 + FILTER_VER_LUMA_AVX2_16x12 pp + FILTER_VER_LUMA_AVX2_16x12 ps + +%macro FILTER_VER_LUMA_AVX2_16x8 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x8, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 mov r4d, r4m shl r4d, 7 - %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r4, [r1 * 3] sub r0, r4 - PROCESS_LUMA_AVX2_W8_8R %ifidn %1,pp - mova m3, [pw_512] + mova m14, [pw_512] %else add r3d, r3d - vbroadcasti128 m3, [pw_2000] -%endif - lea r4, [r3 * 3] -%ifidn %1,pp - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, 
m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r4], xm4 -%else - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r4], xm4 -%endif - RET -%endmacro - -%macro FILTER_VER_LUMA_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x4, 4, 6, 7 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] + vbroadcasti128 m14, [pw_2000] %endif - - lea r4, [r1 * 3] - sub r0, r4 - PROCESS_LUMA_AVX2_W8_4R -%ifidn %1,pp - mova m3, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] -%endif - lea r4, [r3 * 3] -%ifidn %1,pp - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - packuswb m5, m2 - vextracti128 xm2, m5, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 -%else - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - movu [r2], xm5 - vextracti128 xm5, m5, 1 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movu [r2 + r4], xm2 -%endif - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 4, pp - FILTER_VER_LUMA_AVX2_8x4 pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 8, pp - FILTER_VER_LUMA_AVX2_8x8 pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 16, pp - FILTER_VER_LUMA_AVX2_8xN 8, 16, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 32, pp - FILTER_VER_LUMA_AVX2_8xN 8, 32, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 4, ps - FILTER_VER_LUMA_AVX2_8x4 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int 
coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 8, ps - FILTER_VER_LUMA_AVX2_8x8 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 16, ps - FILTER_VER_LUMA_AVX2_8xN 8, 16, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 32, ps - FILTER_VER_LUMA_AVX2_8xN 8, 32, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_12xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - - %ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - - mov r4d, %2/4 - -.loopH: - PROCESS_LUMA_W8_4R - -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 - - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%else - psubw m7, m3 - 
psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 -%endif - - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 -%else - add r2, 16 -%endif - - PROCESS_LUMA_W4_4R - -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 - - packuswb m4, m5 - - movd [r2], m4 - pextrd [r2 + r3], m4, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m4, 2 - pextrd [r5 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 - - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%endif - - lea r5, [4 * r1 + 8] - sub r0, r5 -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - 8] -%else - lea r2, [r2 + 4 * r3 - 16] -%endif - - dec r4d - jnz .loopH - - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_12xN 12, 16, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_12xN 12, 16, ps - -%macro FILTER_VER_LUMA_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 
punpckhbw xm2, xm0, xm1 @@ -14888,7 +13111,6 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 - pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 @@ -14899,7 +13121,6 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 - pmaddubsw m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 @@ -14908,9 +13129,6 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 @@ -14920,10 +13138,7 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - + lea r4, [r3 * 3] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 @@ -14940,19 +13155,13 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 - movq [r2], xm0 - pextrd [r2 + 8], xm0, 2 - movq [r2 + r3], xm1 - pextrd [r2 + r3 + 8], xm1, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 - movq [r2 + r6], xm3 - pextrd [r2 + r6 + 8], xm3, 2 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] - movq [r2], xm4 - pextrd [r2 + 8], xm4, 2 - movq [r2 + r3], xm5 - pextrd [r2 + r3 + 8], xm5, 2 + movu [r2], xm4 + movu [r2 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 @@ -14960,249 +13169,67 @@ cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 - movu [r2], xm0 - vextracti128 xm0, m0, 1 - 
movq [r2 + 16], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + r6 + 16], xm3 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 lea r2, [r2 + r3 * 4] - movu [r2], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + 16], xm4 - movu [r2 + r3], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r3 + 16], xm5 + movu [r2], m4 + movu [r2 + r3], m5 %endif - movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 - movq [r2 + r3 * 2], xm6 - pextrd [r2 + r3 * 2 + 8], xm6, 2 - movq [r2 + r6], xm7 - pextrd [r2 + r6 + 8], xm7, 2 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + r3 * 2 + 16], xm6 - movu [r2 + r6], xm7 - vextracti128 xm7, m7, 1 - movq [r2 + r6 + 16], xm7 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, 
[r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb 
m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movq [r2], xm8 - pextrd [r2 + 8], xm8, 2 - movq [r2 + r3], xm9 - pextrd [r2 + r3 + 8], xm9, 2 - movq [r2 + r3 * 2], xm10 - pextrd [r2 + r3 * 2 + 8], xm10, 2 - movq [r2 + r6], xm11 - pextrd [r2 + r6 + 8], xm11, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm12 - pextrd [r2 + 8], xm12, 2 - movq [r2 + r3], xm13 - pextrd [r2 + r3 + 8], xm13, 2 - movq [r2 + r3 * 2], xm0 - pextrd [r2 + r3 * 2 + 8], xm0, 2 - movq [r2 + r6], xm1 - pextrd [r2 + r6 + 8], xm1, 2 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r2], xm8 - vextracti128 xm8, m8, 1 - movq [r2 + 16], xm8 - movu [r2 + r3], xm9 - vextracti128 xm9, m9, 1 - movq [r2 + r3 + 16], xm9 - movu [r2 + r3 * 2], xm10 - vextracti128 xm10, m10, 1 - movq [r2 + r3 * 2 + 16], xm10 - movu [r2 + r6], xm11 - vextracti128 xm11, m11, 1 - movq [r2 + r6 + 16], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - vextracti128 xm12, m12, 1 - movq [r2 + 16], xm12 - movu [r2 + r3], xm13 - vextracti128 xm13, m13, 1 - movq [r2 + r3 + 16], xm13 - movu [r2 + r3 * 2], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r3 * 2 + 16], xm0 - movu [r2 + r6], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r6 + 16], xm1 + movu [r2 + r3 * 2], m6 + movu [r2 + r4], m7 %endif RET %endif %endmacro - FILTER_VER_LUMA_AVX2_12x16 pp - FILTER_VER_LUMA_AVX2_12x16 ps + FILTER_VER_LUMA_AVX2_16x8 pp + FILTER_VER_LUMA_AVX2_16x8 ps -%macro FILTER_VER_LUMA_AVX2_16x16 1 +%macro FILTER_VER_LUMA_AVX2_16x4 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x16, 
4, 7, 15 +cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 mov r4d, r4m shl r4d, 7 - %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp - mova m14, [pw_512] + mova m12, [pw_512] %else add r3d, r3d - vbroadcasti128 m14, [pw_2000] + vbroadcasti128 m12, [pw_2000] %endif - lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 @@ -15237,7 +13264,6 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 - pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 @@ -15246,7 +13272,6 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 - pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 @@ -15255,9 +13280,6 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 @@ -15267,65 +13289,201 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, 
[r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - %ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 packuswb m0, m1 packuswb m2, m3 - packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 +%else + psubw m0, m12 ; m0 = word: row 0 + psubw m1, m12 ; m1 = word: row 1 + psubw m2, m12 ; m2 = word: row 2 + psubw m3, m12 ; m3 = word: row 3 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + lea r4, [r3 * 3] + movu [r2 + r4], m3 +%endif + RET +%endif +%endmacro + + FILTER_VER_LUMA_AVX2_16x4 pp + FILTER_VER_LUMA_AVX2_16x4 ps +%macro FILTER_VER_LUMA_AVX2_16xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %3,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova 
m14, [pw_512] +%endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 + +.loop: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 
* mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %3,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b @@ -15378,7 +13536,7 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 paddw m11, m1 pmaddubsw m13, [r5] -%ifidn %1,pp +%ifidn %3,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 @@ -15392,6 +13550,7 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 movu [r2 + r3 * 2], m6 movu [r2 + r6], m7 %endif + lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 15 @@ -15467,7 +13626,7 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 pmaddubsw m7, [r5 + 3 * mmsize] paddw m1, m7 -%ifidn 
%1,pp +%ifidn %3,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 @@ -15516,37 +13675,21 @@ cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 movu [r2 + r3 * 2], m0 movu [r2 + r6], m1 %endif + + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loop RET %endif %endmacro - FILTER_VER_LUMA_AVX2_16x16 pp - FILTER_VER_LUMA_AVX2_16x16 ps - -%macro FILTER_VER_LUMA_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r6, [r3 * 3] + FILTER_VER_LUMA_AVX2_16xN 16, 32, pp + FILTER_VER_LUMA_AVX2_16xN 16, 64, pp + FILTER_VER_LUMA_AVX2_16xN 16, 32, ps + FILTER_VER_LUMA_AVX2_16xN 16, 64, ps +%macro PROCESS_LUMA_AVX2_W16_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 @@ -15565,15 +13708,15 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 + movu xm5, [r7 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 @@ -15582,7 +13725,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 + movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 @@ -15591,7 +13734,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m7, m5, [r5 + 1 * mmsize] 
paddw m3, m7 pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 + movu xm7, [r7 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 @@ -15602,8 +13745,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 @@ -15614,7 +13757,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 + movu xm9, [r7 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 @@ -15625,7 +13768,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 @@ -15636,7 +13779,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 + movu xm11, [r7 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 @@ -15647,8 +13790,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 @@ -15680,9 +13823,9 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 
= word: row 1 @@ -15694,12 +13837,12 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 %endif - movu xm13, [r0 + r1] ; m13 = row 13 + movu xm13, [r7 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 @@ -15709,7 +13852,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 + pmaddubsw m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 @@ -15719,6 +13863,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 + pmaddubsw m13, [r5] %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 @@ -15726,17 +13871,18 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r6], m7 + movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 %endif - lea r2, [r2 + r3 * 4] - movu xm1, [r0 + r4] ; m1 = row 15 + lea r8, [r8 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 @@ -15744,8 +13890,11 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 @@ -15753,71 +13902,111 @@ cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 paddw m9, 
m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 - movu xm3, [r0 + r1] ; m3 = row 17 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r7 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 
11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r6], m11 -%endif - RET + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r8], m8 + movu [r8 + r3], m9 + movu [r8 + r3 * 2], m10 + movu [r8 + r6], m11 + lea r8, [r8 + r3 * 4] + movu [r8], m12 + movu [r8 + r3], m13 + movu [r8 + r3 * 2], m0 + movu [r8 + r6], m1 %endif %endmacro - FILTER_VER_LUMA_AVX2_16x12 pp - FILTER_VER_LUMA_AVX2_16x12 ps - -%macro FILTER_VER_LUMA_AVX2_16x8 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif +%macro PROCESS_LUMA_AVX2_W16_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 @@ -15836,15 +14025,15 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 + 
movu xm5, [r7 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 @@ -15853,7 +14042,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 + movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 @@ -15862,7 +14051,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 + movu xm7, [r7 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 @@ -15873,8 +14062,8 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 @@ -15885,7 +14074,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 + movu xm9, [r7 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 @@ -15895,7 +14084,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 @@ -15905,7 +14094,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 - movu xm11, [r0 + r4] ; m11 = row 11 + movu xm11, [r7 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 @@ -15913,8 +14102,8 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 - lea r0, [r0 + r1 * 4] - movu xm12, 
[r0] ; m12 = row 12 + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 @@ -15922,7 +14111,7 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 - lea r4, [r3 * 3] + %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 @@ -15942,10 +14131,10 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 @@ -15956,48 +14145,45 @@ cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 + movu [r2 + r6], m3 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 %endif - movu xm13, [r0 + r1] ; m13 = row 13 + + movu xm13, [r7 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 + %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r4], m7 -%endif - RET + movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 %endif %endmacro - FILTER_VER_LUMA_AVX2_16x8 pp - FILTER_VER_LUMA_AVX2_16x8 ps - -%macro 
FILTER_VER_LUMA_AVX2_16x4 1 +%macro FILTER_VER_LUMA_AVX2_24x32 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 +cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC @@ -16008,4684 +14194,1710 @@ cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 %endif lea r4, [r1 * 3] sub r0, r4 +%ifidn %1,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] +%endif + lea r6, [r3 * 3] + lea r10, [r1 * 4] + mov r9d, 2 +.loopH: + PROCESS_LUMA_AVX2_W16_16R %1 %ifidn %1,pp - mova m12, [pw_512] + add r2, 16 %else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] + add r2, 32 %endif - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - 
paddw m2, m6 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 -%ifidn %1,pp - pmulhrsw m0, m12 ; m0 = word: row 0 - pmulhrsw m1, m12 ; m1 = word: row 1 - pmulhrsw m2, m12 ; m2 = word: row 2 - pmulhrsw m3, m12 ; m3 = word: row 3 - packuswb m0, m1 - packuswb m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - lea r4, [r3 * 3] - movu [r2 + r4], xm3 -%else - psubw m0, m12 ; m0 = word: row 0 - psubw m1, m12 ; m1 = word: row 1 - psubw m2, m12 ; m2 = word: row 2 - psubw m3, m12 ; m3 = word: row 3 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - lea r4, [r3 * 3] - movu [r2 + r4], m3 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16x4 pp - FILTER_VER_LUMA_AVX2_16x4 ps -%macro FILTER_VER_LUMA_AVX2_16xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 
-%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %3,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r7, [r1 * 4] - mov r8d, %2 / 16 - -.loop: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 
- pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm7, [r7] ; m7 = row 12 + punpcklbw xm3, xm7 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m3, m6, [r5 + 2 * mmsize] + paddw m4, m3 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm7, xm3 + movq xm8, [r7 + r1 * 2] ; m8 = row 14 + punpcklbw xm3, xm8 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m3, m7, [r5 + 3 * mmsize] + paddw m4, m3 + pmaddubsw m3, m7, [r5 + 2 * mmsize] + paddw m0, m3 + pmaddubsw m3, m7, [r5 + 1 * mmsize] + paddw m6, m3 pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm8, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 16 + punpcklbw xm3, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddubsw m3, m8, [r5 + 3 * mmsize] + paddw m0, m3 + pmaddubsw m3, m8, [r5 + 2 * mmsize] + paddw m6, m3 + pmaddubsw m3, m8, [r5 + 1 * mmsize] + paddw m7, m3 pmaddubsw 
m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %3,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 18 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m6, m3 + pmaddubsw m3, m9, [r5 + 2 * mmsize] + paddw m7, m3 + pmaddubsw m3, m9, [r5 + 1 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r4] ; m3 = row 19 + punpcklbw xm10, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 20 + punpcklbw xm3, xm9 + vinserti128 m10, m10, xm3, 1 + pmaddubsw m3, m10, [r5 + 3 * mmsize] + paddw m7, m3 + pmaddubsw m3, m10, [r5 + 2 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r1] ; m3 = row 21 + punpcklbw xm9, 
xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 22 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m8, m3 +%ifidn %1,pp + pmulhrsw m5, m14 ; m5 = word: row 0, row 1 + pmulhrsw m2, m14 ; m2 = word: row 2, row 3 + pmulhrsw m1, m14 ; m1 = word: row 4, row 5 + pmulhrsw m4, m14 ; m4 = word: row 6, row 7 + pmulhrsw m0, m14 ; m0 = word: row 8, row 9 + pmulhrsw m6, m14 ; m6 = word: row 10, row 11 + pmulhrsw m7, m14 ; m7 = word: row 12, row 13 + pmulhrsw m8, m14 ; m8 = word: row 14, row 15 + packuswb m5, m2 + packuswb m1, m4 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m0, 1 + vextracti128 xm8, m7, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 + lea r8, [r8 + r3 * 4] + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm7 + movq [r8 + r3], xm8 + movhps [r8 + r3 * 2], xm7 + movhps [r8 + r6], xm8 +%else + psubw m5, m14 ; m5 = word: row 0, row 1 + psubw m2, m14 ; m2 = word: row 2, row 3 + psubw m1, m14 ; m1 = word: row 4, row 5 + psubw m4, m14 ; m4 = word: row 6, row 7 + psubw m0, m14 ; m0 = word: row 8, row 9 + psubw m6, m14 ; m6 = word: row 10, row 11 + psubw m7, m14 ; m7 = word: row 12, row 13 + psubw m8, m14 ; m8 = word: row 14, row 15 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + 
r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 -%endif - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %3,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r6], m7 -%endif - - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + 
r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %3,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 
= word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r6], m11 - lea r2, [r2 + r3 * 4] - movu [r2], m12 - movu [r2 + r3], m13 - movu [r2 + r3 * 2], m0 - movu [r2 + r6], m1 -%endif - - lea r2, [r2 + r3 * 4] - sub r0, r7 - dec r8d - jnz .loop - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16xN 16, 32, pp - FILTER_VER_LUMA_AVX2_16xN 16, 64, pp - FILTER_VER_LUMA_AVX2_16xN 16, 32, ps - FILTER_VER_LUMA_AVX2_16xN 16, 64, ps - -%macro PROCESS_LUMA_AVX2_W16_16R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, 
[r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - 
vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r8, [r2 + r3 * 4] - movu [r8], m4 - movu [r8 + r3], m5 -%endif - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r8 + r3 * 2], m6 - movu [r8 + r6], m7 -%endif - - lea r8, [r8 + r3 * 4] - - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhbw xm3, 
xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r7 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r7 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r7, [r7 + r1 * 4] - movu xm6, [r7] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r7 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r7 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - 
vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - lea r8, [r8 + r3 * 4] - movu [r8], xm12 - movu [r8 + r3], xm13 - movu [r8 + r3 * 2], xm0 - movu [r8 + r6], xm1 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r8], m8 - movu [r8 + r3], m9 - movu [r8 + r3 * 2], m10 - movu [r8 + r6], m11 - lea r8, [r8 + r3 * 4] - movu [r8], m12 - movu [r8 + r3], m13 - movu [r8 + r3 * 2], m0 - movu [r8 + r6], m1 -%endif -%endmacro - -%macro PROCESS_LUMA_AVX2_W16_8R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, 
m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = 
word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r8, [r2 + r3 * 4] - movu [r8], m4 - movu [r8 + r3], m5 -%endif - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r8 + r3 * 2], m6 - movu [r8 + r6], m7 -%endif -%endmacro - -%macro FILTER_VER_LUMA_AVX2_24x32 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r10, [r1 * 4] - mov r9d, 2 -.loopH: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - 
movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r7, [r0 + r1 * 4] - movq xm1, [r7] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r7 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r7 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r7, [r7 + r1 * 4] - movq xm0, [r7] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r7 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r7 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m0, [r5] - - movq xm3, [r7 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r7, [r7 + r1 * 4] - movq xm7, [r7] ; m7 = row 12 - punpcklbw xm3, xm7 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m3, m6, [r5 + 2 * mmsize] - paddw m4, m3 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r7 + r1] ; m3 = row 13 - punpcklbw xm7, xm3 - movq xm8, [r7 + r1 * 2] ; m8 = row 14 - punpcklbw xm3, xm8 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m3, m7, [r5 + 3 * mmsize] - paddw m4, m3 - pmaddubsw m3, m7, [r5 + 2 * mmsize] - paddw m0, m3 - pmaddubsw m3, m7, [r5 + 1 * 
mmsize] - paddw m6, m3 - pmaddubsw m7, [r5] - movq xm3, [r7 + r4] ; m3 = row 15 - punpcklbw xm8, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 16 - punpcklbw xm3, xm9 - vinserti128 m8, m8, xm3, 1 - pmaddubsw m3, m8, [r5 + 3 * mmsize] - paddw m0, m3 - pmaddubsw m3, m8, [r5 + 2 * mmsize] - paddw m6, m3 - pmaddubsw m3, m8, [r5 + 1 * mmsize] - paddw m7, m3 - pmaddubsw m8, [r5] - movq xm3, [r7 + r1] ; m3 = row 17 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 18 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m6, m3 - pmaddubsw m3, m9, [r5 + 2 * mmsize] - paddw m7, m3 - pmaddubsw m3, m9, [r5 + 1 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r4] ; m3 = row 19 - punpcklbw xm10, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 20 - punpcklbw xm3, xm9 - vinserti128 m10, m10, xm3, 1 - pmaddubsw m3, m10, [r5 + 3 * mmsize] - paddw m7, m3 - pmaddubsw m3, m10, [r5 + 2 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r1] ; m3 = row 21 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 22 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m8, m3 -%ifidn %1,pp - pmulhrsw m5, m14 ; m5 = word: row 0, row 1 - pmulhrsw m2, m14 ; m2 = word: row 2, row 3 - pmulhrsw m1, m14 ; m1 = word: row 4, row 5 - pmulhrsw m4, m14 ; m4 = word: row 6, row 7 - pmulhrsw m0, m14 ; m0 = word: row 8, row 9 - pmulhrsw m6, m14 ; m6 = word: row 10, row 11 - pmulhrsw m7, m14 ; m7 = word: row 12, row 13 - pmulhrsw m8, m14 ; m8 = word: row 14, row 15 - packuswb m5, m2 - packuswb m1, m4 - packuswb m0, m6 - packuswb m7, m8 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - vextracti128 xm6, m0, 1 - vextracti128 xm8, m7, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r8, [r2 + r3 * 4] - movq [r8], xm1 - movq [r8 + r3], xm4 - movhps [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm4 - lea r8, [r8 + r3 * 4] - movq [r8], xm0 - movq [r8 + r3], xm6 - 
movhps [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm6 - lea r8, [r8 + r3 * 4] - movq [r8], xm7 - movq [r8 + r3], xm8 - movhps [r8 + r3 * 2], xm7 - movhps [r8 + r6], xm8 -%else - psubw m5, m14 ; m5 = word: row 0, row 1 - psubw m2, m14 ; m2 = word: row 2, row 3 - psubw m1, m14 ; m1 = word: row 4, row 5 - psubw m4, m14 ; m4 = word: row 6, row 7 - psubw m0, m14 ; m0 = word: row 8, row 9 - psubw m6, m14 ; m6 = word: row 10, row 11 - psubw m7, m14 ; m7 = word: row 12, row 13 - psubw m8, m14 ; m8 = word: row 14, row 15 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m1, 1 - lea r8, [r2 + r3 * 4] - movu [r8], xm1 - movu [r8 + r3], xm3 - vextracti128 xm3, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm3 - vextracti128 xm3, m0, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm0 - movu [r8 + r3], xm3 - vextracti128 xm3, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm3 - vextracti128 xm3, m7, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm7 - movu [r8 + r3], xm3 - vextracti128 xm3, m8, 1 - movu [r8 + r3 * 2], xm8 - movu [r8 + r6], xm3 -%endif - sub r7, r10 - lea r0, [r7 - 16] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_24x32 pp - FILTER_VER_LUMA_AVX2_24x32 ps - -%macro FILTER_VER_LUMA_AVX2_32xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %3,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %3 -%ifidn %3,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub 
r7, r11 - lea r0, [r7 - 16] -%ifidn %3,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32xN 32, 32, pp - FILTER_VER_LUMA_AVX2_32xN 32, 64, pp - FILTER_VER_LUMA_AVX2_32xN 32, 32, ps - FILTER_VER_LUMA_AVX2_32xN 32, 64, ps - -%macro FILTER_VER_LUMA_AVX2_32x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x16 pp - FILTER_VER_LUMA_AVX2_32x16 ps - -%macro FILTER_VER_LUMA_AVX2_32x24 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d -%endif - lea r6, [r3 * 3] -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - lea r9, [r1 * 4] - sub r7, r9 - lea r0, [r7 - 16] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - mov r9d, 2 -.loop: - PROCESS_LUMA_AVX2_W16_8R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loop - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x24 pp - FILTER_VER_LUMA_AVX2_32x24 ps - -%macro FILTER_VER_LUMA_AVX2_32x8 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal 
interp_8tap_vert_%1_32x8, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d -%endif - lea r6, [r3 * 3] -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_8R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x8 pp - FILTER_VER_LUMA_AVX2_32x8 ps - -%macro FILTER_VER_LUMA_AVX2_48x64 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_48x64, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %1,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - lea r11, [r1 * 4] - -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, 4 -.loopH: - mov r10d, 3 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 32] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 32] -%else - lea r2, [r8 + r3 * 4 - 64] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_48x64 pp - FILTER_VER_LUMA_AVX2_48x64 ps - -%macro FILTER_VER_LUMA_AVX2_64xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %3,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - lea r11, [r1 * 4] - -%ifidn %3,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %3 -%ifidn 
%3,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 48] -%ifidn %3,pp - lea r2, [r8 + r3 * 4 - 48] -%else - lea r2, [r8 + r3 * 4 - 96] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_64xN 64, 32, pp - FILTER_VER_LUMA_AVX2_64xN 64, 48, pp - FILTER_VER_LUMA_AVX2_64xN 64, 64, pp - FILTER_VER_LUMA_AVX2_64xN 64, 32, ps - FILTER_VER_LUMA_AVX2_64xN 64, 48, ps - FILTER_VER_LUMA_AVX2_64xN 64, 64, ps - -%macro FILTER_VER_LUMA_AVX2_64x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_64x16, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %1,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, 4 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_64x16 pp - FILTER_VER_LUMA_AVX2_64x16 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - -%ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/8) -.loopW: - PROCESS_LUMA_W8_4R -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - 
pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 - - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 -%endif - - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 -%else - add r2, 16 -%endif - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - %1] -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - %1] -%else - lea r2, [r2 + 4 * r3 - 2 * %1] -%endif - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - - FILTER_VER_LUMA 16, 4, pp - FILTER_VER_LUMA 16, 8, pp - FILTER_VER_LUMA 16, 12, pp - FILTER_VER_LUMA 16, 16, pp - FILTER_VER_LUMA 16, 32, pp - FILTER_VER_LUMA 16, 64, pp - FILTER_VER_LUMA 24, 32, pp - FILTER_VER_LUMA 32, 8, pp - FILTER_VER_LUMA 32, 16, pp - FILTER_VER_LUMA 32, 24, pp - FILTER_VER_LUMA 32, 32, pp - FILTER_VER_LUMA 32, 64, pp - FILTER_VER_LUMA 48, 64, pp - FILTER_VER_LUMA 64, 16, pp - FILTER_VER_LUMA 64, 32, pp - FILTER_VER_LUMA 64, 48, pp - FILTER_VER_LUMA 64, 64, pp - - FILTER_VER_LUMA 16, 4, ps - FILTER_VER_LUMA 16, 8, ps - FILTER_VER_LUMA 16, 12, ps - FILTER_VER_LUMA 16, 16, ps - FILTER_VER_LUMA 16, 32, ps - FILTER_VER_LUMA 16, 64, ps - FILTER_VER_LUMA 24, 32, ps - FILTER_VER_LUMA 32, 8, ps - FILTER_VER_LUMA 32, 16, ps - FILTER_VER_LUMA 32, 24, ps - FILTER_VER_LUMA 32, 32, ps - FILTER_VER_LUMA 32, 64, ps - FILTER_VER_LUMA 48, 64, ps - FILTER_VER_LUMA 64, 16, ps - FILTER_VER_LUMA 64, 32, ps - FILTER_VER_LUMA 64, 48, ps - FILTER_VER_LUMA 64, 64, ps - -%macro PROCESS_LUMA_SP_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 
+ 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SP 2 -INIT_XMM sse4 -cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize - - add r1d, r1d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mova m7, [pd_526336] - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - 
PROCESS_LUMA_SP_W4_4R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_SP 4, 4 - FILTER_VER_LUMA_SP 8, 8 - FILTER_VER_LUMA_SP 8, 4 - FILTER_VER_LUMA_SP 4, 8 - FILTER_VER_LUMA_SP 16, 16 - FILTER_VER_LUMA_SP 16, 8 - FILTER_VER_LUMA_SP 8, 16 - FILTER_VER_LUMA_SP 16, 12 - FILTER_VER_LUMA_SP 12, 16 - FILTER_VER_LUMA_SP 16, 4 - FILTER_VER_LUMA_SP 4, 16 - FILTER_VER_LUMA_SP 32, 32 - FILTER_VER_LUMA_SP 32, 16 - FILTER_VER_LUMA_SP 16, 32 - FILTER_VER_LUMA_SP 32, 24 - FILTER_VER_LUMA_SP 24, 32 - FILTER_VER_LUMA_SP 32, 8 - FILTER_VER_LUMA_SP 8, 32 - FILTER_VER_LUMA_SP 64, 64 - FILTER_VER_LUMA_SP 64, 32 - FILTER_VER_LUMA_SP 32, 64 - FILTER_VER_LUMA_SP 64, 48 - FILTER_VER_LUMA_SP 48, 64 - FILTER_VER_LUMA_SP 64, 16 - FILTER_VER_LUMA_SP 16, 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal filterPixelToShort_4x2, 3, 4, 3 - mov r3d, r3m - add r3d, r3d - - ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] - - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - 
punpcklbw m0, m1 - pmaddubsw m0, m2 - - movq [r2 + r3 * 0], m0 - movhps [r2 + r3 * 1], m0 - - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_8x2, 3, 4, 3 - mov r3d, r3m - add r3d, r3d - - ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] - - movh m0, [r0] - punpcklbw m0, m1 - pmaddubsw m0, m2 - movu [r2 + r3 * 0], m0 - - movh m0, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m0, m2 - movu [r2 + r3 * 1], m0 - - RET - -%macro PROCESS_CHROMA_SP_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 done - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m4, [r6 + 1 * 16] - paddd m2, m4 ;m2=[2+3+4+5] Row3 - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m5, [r6 + 1 * 16] - paddd m3, m5 ;m3=[3+4+5+6] Row4 -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize - - add r1d, r1d - sub r0, r1 
- shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mova m6, [pd_526336] - - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP 4, 4 - FILTER_VER_CHROMA_SP 4, 8 - FILTER_VER_CHROMA_SP 16, 16 - FILTER_VER_CHROMA_SP 16, 8 - FILTER_VER_CHROMA_SP 16, 12 - FILTER_VER_CHROMA_SP 12, 16 - FILTER_VER_CHROMA_SP 16, 4 - FILTER_VER_CHROMA_SP 4, 16 - FILTER_VER_CHROMA_SP 32, 32 - FILTER_VER_CHROMA_SP 32, 16 - FILTER_VER_CHROMA_SP 16, 32 - FILTER_VER_CHROMA_SP 32, 24 - FILTER_VER_CHROMA_SP 24, 32 - FILTER_VER_CHROMA_SP 32, 8 - - FILTER_VER_CHROMA_SP 16, 24 - FILTER_VER_CHROMA_SP 16, 64 - FILTER_VER_CHROMA_SP 12, 32 - FILTER_VER_CHROMA_SP 4, 32 - FILTER_VER_CHROMA_SP 32, 64 - FILTER_VER_CHROMA_SP 32, 48 - FILTER_VER_CHROMA_SP 24, 64 - - FILTER_VER_CHROMA_SP 64, 64 - FILTER_VER_CHROMA_SP 64, 32 - FILTER_VER_CHROMA_SP 64, 48 - FILTER_VER_CHROMA_SP 48, 64 - FILTER_VER_CHROMA_SP 64, 16 - - -%macro PROCESS_CHROMA_SP_W2_4R 1 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - - lea r0, [r0 + 2 * r1] - movd m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - punpcklqdq m0, m1 ;m0=[0 1 1 2] - pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m2, m1 ;m2=[2 3] - - lea r0, [r0 + 2 * r1] - movd m3, [r0] - punpcklwd m1, m3 ;m2=[3 4] - punpcklqdq m2, m1 ;m2=[2 3 3 4] - - pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 - pmaddwd m2, [%1 + 
0 * 16] ;m2=[2+3 3+4] Row 3-4 - paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m3, m1 ;m3=[4 5] - - movd m4, [r0 + 2 * r1] - punpcklwd m1, m4 ;m1=[5 6] - punpcklqdq m3, m1 ;m2=[4 5 5 6] - pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 - paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 -%endmacro - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W2_4R 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m5, [pd_526336] - - mov r4d, (%2/4) - -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 - - paddd m0, m5 - paddd m2, m5 - - psrad m0, 12 - psrad m2, 12 - - packssdw m0, m2 - packuswb m0, m0 - - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP_W2_4R 2, 4 - FILTER_VER_CHROMA_SP_W2_4R 2, 8 - - FILTER_VER_CHROMA_SP_W2_4R 2, 16 - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m4, [pd_526336] - - movq 
m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 - - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - paddd m0, m4 - psrad m0, 12 - - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1 = [1+2+3+4] Row2 done - paddd m1, m4 - psrad m1, 12 - - packssdw m0, m1 - packuswb m0, m0 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - - RET - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mova m6, [pd_526336] - - mov r4d, %2/4 - -.loopH: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - PROCESS_CHROMA_SP_W2_4R r6 - - paddd m0, m6 - paddd m2, m6 - - psrad m0, 12 - psrad m2, 12 - - packssdw m0, m2 - packuswb m0, m0 - - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 - - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 4] - - dec r4d - jnz .loopH - - RET -%endmacro - - 
FILTER_VER_CHROMA_SP_W6_H4 6, 8 - - FILTER_VER_CHROMA_SP_W6_H4 6, 16 - -%macro PROCESS_CHROMA_SP_W8_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h - - lea r0, [r0 + 2 * r1] - movu m5, [r0 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l - paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h - paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum - - movu m4, [r0 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l - paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h - paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m7, [pd_526336] - - mov r4d, %2/2 -.loopH: - PROCESS_CHROMA_SP_W8_2R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movlps [r2], m0 - movhps [r2 + r3], m0 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP_W8_H2 8, 2 - 
FILTER_VER_CHROMA_SP_W8_H2 8, 4 - FILTER_VER_CHROMA_SP_W8_H2 8, 6 - FILTER_VER_CHROMA_SP_W8_H2 8, 8 - FILTER_VER_CHROMA_SP_W8_H2 8, 16 - FILTER_VER_CHROMA_SP_W8_H2 8, 32 - - FILTER_VER_CHROMA_SP_W8_H2 8, 12 - FILTER_VER_CHROMA_SP_W8_H2 8, 64 - - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_2xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride -%define coef2 m3 -%define Tm0 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufd coef2, coef2, 0 - mova t1, [pw_2000] - mova Tm0, [tab_Tm] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - movh t0, [srcq] - pshufb t0, t0, Tm0 - pmaddubsw t0, coef2 - phaddw t0, t0 - psubw t0, t1 - movd [dstq], t0 - - lea srcq, [srcq + srcstrideq] - lea dstq, [dstq + dststrideq] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_HORIZ_CHROMA_2xN 2, 4 - FILTER_HORIZ_CHROMA_2xN 2, 8 - - FILTER_HORIZ_CHROMA_2xN 2, 16 - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_4xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride 
-%define coef2 m3 -%define Tm0 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufd coef2, coef2, 0 - mova t1, [pw_2000] - mova Tm0, [tab_Tm] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - movh t0, [srcq] - pshufb t0, t0, Tm0 - pmaddubsw t0, coef2 - phaddw t0, t0 - psubw t0, t1 - movlps [dstq], t0 - - lea srcq, [srcq + srcstrideq] - lea dstq, [dstq + dststrideq] - - dec r4d - jnz .loopH - RET -%endmacro - - FILTER_HORIZ_CHROMA_4xN 4, 2 - FILTER_HORIZ_CHROMA_4xN 4, 4 - FILTER_HORIZ_CHROMA_4xN 4, 8 - FILTER_HORIZ_CHROMA_4xN 4, 16 - - FILTER_HORIZ_CHROMA_4xN 4, 32 - -%macro PROCESS_CHROMA_W6 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movh [dstq], %2 - pshufd %2, %2, 2 - movd [dstq + 8], %2 -%endmacro - -%macro PROCESS_CHROMA_W12 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq], %2 - movu %1, [srcq + 8] - pshufb %1, %1, Tm0 - pmaddubsw %1, coef2 - phaddw %1, %1 - psubw %1, %3 - movh [dstq + 16], %1 -%endmacro - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride -%define coef2 m5 -%define Tm0 m4 -%define Tm1 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided 
- -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - PROCESS_CHROMA_W%1 t0, t1, t2 - add srcq, srcstrideq - add dstq, dststrideq - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_HORIZ_CHROMA 6, 8 - FILTER_HORIZ_CHROMA 12, 16 - - FILTER_HORIZ_CHROMA 6, 16 - FILTER_HORIZ_CHROMA 12, 32 - -%macro PROCESS_CHROMA_W8 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq], %2 -%endmacro - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_8xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride -%define coef2 m5 -%define Tm0 m4 -%define Tm1 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - PROCESS_CHROMA_W8 t0, t1, t2 - add srcq, srcstrideq - add dstq, dststrideq - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_HORIZ_CHROMA_8xN 8, 2 - FILTER_HORIZ_CHROMA_8xN 8, 4 - FILTER_HORIZ_CHROMA_8xN 8, 6 - FILTER_HORIZ_CHROMA_8xN 8, 8 - FILTER_HORIZ_CHROMA_8xN 
8, 16 - FILTER_HORIZ_CHROMA_8xN 8, 32 - - FILTER_HORIZ_CHROMA_8xN 8, 12 - FILTER_HORIZ_CHROMA_8xN 8, 64 - -%macro PROCESS_CHROMA_W16 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 -%endmacro - -%macro PROCESS_CHROMA_W24 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 - movu %1, [srcq + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq + 32], %2 -%endmacro - -%macro PROCESS_CHROMA_W32 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 - movu %1, [srcq + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 24] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq + 32], %2 - movu [dstq + 48], %4 -%endmacro - -%macro PROCESS_CHROMA_W16o 5 - movu %1, [srcq + %5] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + %5 + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq + %5 * 
2], %2 - movu [dstq + %5 * 2 + 16], %4 -%endmacro - -%macro PROCESS_CHROMA_W48 4 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 -%endmacro - -%macro PROCESS_CHROMA_W64 4 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 -%endmacro - -;------------------------------------------------------------------------------------------------------------------------------ -; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;------------------------------------------------------------------------------------------------------------------------------ -%macro FILTER_HORIZ_CHROMA_WxN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride -%define coef2 m6 -%define Tm0 m5 -%define Tm1 m4 -%define t3 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] -%else - movd coef2, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - PROCESS_CHROMA_W%1 t0, t1, t2, t3 - add srcq, srcstrideq - add dstq, dststrideq - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_HORIZ_CHROMA_WxN 16, 4 - FILTER_HORIZ_CHROMA_WxN 16, 8 - FILTER_HORIZ_CHROMA_WxN 16, 12 - FILTER_HORIZ_CHROMA_WxN 16, 16 - FILTER_HORIZ_CHROMA_WxN 16, 32 - FILTER_HORIZ_CHROMA_WxN 24, 32 - FILTER_HORIZ_CHROMA_WxN 32, 8 - FILTER_HORIZ_CHROMA_WxN 32, 16 - FILTER_HORIZ_CHROMA_WxN 32, 24 - FILTER_HORIZ_CHROMA_WxN 32, 32 - - FILTER_HORIZ_CHROMA_WxN 16, 24 - FILTER_HORIZ_CHROMA_WxN 16, 64 - FILTER_HORIZ_CHROMA_WxN 24, 64 - FILTER_HORIZ_CHROMA_WxN 
32, 48 - FILTER_HORIZ_CHROMA_WxN 32, 64 - - FILTER_HORIZ_CHROMA_WxN 64, 64 - FILTER_HORIZ_CHROMA_WxN 64, 32 - FILTER_HORIZ_CHROMA_WxN 64, 48 - FILTER_HORIZ_CHROMA_WxN 48, 64 - FILTER_HORIZ_CHROMA_WxN 64, 16 - - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W16n 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 - -.loop: - - mov r6d, %1/16 - -.loopW: - - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m7, [r5 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r5 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - pmaddubsw m2, m0 - pmaddubsw m7, m0 - - paddw m4, m2 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 - - add r0, 16 - add r2, 32 - dec r6d - jnz .loopW - - lea r0, [r0 + r1 * 2 - %1] - lea r2, [r2 + r3 * 2 - %1 * 2] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V_PS_W16n 64, 64 - FILTER_V_PS_W16n 64, 32 - FILTER_V_PS_W16n 64, 48 - FILTER_V_PS_W16n 48, 64 - FILTER_V_PS_W16n 64, 16 - - 
-;------------------------------------------------------------------------------------------------------------ -;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m0, [tab_Cm] - - lea r5, [3 * r1] - - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m1, m5, m6 - punpcklbw m3, m1 - - pmaddubsw m3, m0 - phaddw m2, m3 - - mova m1, [pw_2000] - - psubw m2, m1 - - movd [r2], m2 - pextrd [r2 + r3], m2, 2 - - movd m2, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 - - pmaddubsw m4, m0 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 - - pmaddubsw m5, m0 - phaddw m4, m5 - psubw m4, m1 - - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pextrd [r2 + r3], m4, 2 - - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 
xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 + vextracti128 xm3, m0, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m7, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm7 + movu [r8 + r3], xm3 + vextracti128 xm3, m8, 1 + movu [r8 + r3 * 2], xm8 + movu [r8 + r6], xm3 %endif - - pshufb m0, [tab_Cm] - - mova m1, [pw_2000] - lea r5, [3 * r1] - mov r4d, %2/4 -.loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 - - pmaddubsw m3, m0 - - phaddw m2, m3 - psubw m2, m1 - - - movd [r2], m2 - pshufd m2, m2, 2 - movd [r2 + r3], m2 - - movd m2, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 - - pmaddubsw m4, m0 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 - - pmaddubsw m5, m0 - - phaddw m4, m5 - - psubw m4, m1 - - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pshufd m4 , m4 ,2 - movd [r2 + r3], m4 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - - RET -%endmacro - - FILTER_V_PS_W2 2, 8 - - FILTER_V_PS_W2 2, 16 - -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] + sub r7, r10 + lea r0, [r7 - 16] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 16] %else - lea r6, 
[tab_ChromaCoeffV + r4] + lea r2, [r8 + r3 * 4 - 32] %endif - - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 - - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] - - dec dword [rsp] - jnz .loopH - + dec r9d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_SS 4, 4 - FILTER_VER_CHROMA_SS 4, 8 - FILTER_VER_CHROMA_SS 16, 16 - FILTER_VER_CHROMA_SS 16, 8 - FILTER_VER_CHROMA_SS 16, 12 - FILTER_VER_CHROMA_SS 12, 16 - FILTER_VER_CHROMA_SS 16, 4 - FILTER_VER_CHROMA_SS 4, 16 - FILTER_VER_CHROMA_SS 32, 32 - FILTER_VER_CHROMA_SS 32, 16 - FILTER_VER_CHROMA_SS 16, 32 - FILTER_VER_CHROMA_SS 32, 24 - FILTER_VER_CHROMA_SS 24, 32 - FILTER_VER_CHROMA_SS 32, 8 - - FILTER_VER_CHROMA_SS 16, 24 - FILTER_VER_CHROMA_SS 12, 32 - FILTER_VER_CHROMA_SS 4, 32 - FILTER_VER_CHROMA_SS 32, 64 - FILTER_VER_CHROMA_SS 16, 64 - FILTER_VER_CHROMA_SS 32, 48 - FILTER_VER_CHROMA_SS 24, 64 - - FILTER_VER_CHROMA_SS 64, 64 - FILTER_VER_CHROMA_SS 64, 32 - FILTER_VER_CHROMA_SS 64, 48 - FILTER_VER_CHROMA_SS 48, 64 - FILTER_VER_CHROMA_SS 64, 16 + FILTER_VER_LUMA_AVX2_24x32 pp + FILTER_VER_LUMA_AVX2_24x32 ps -%macro FILTER_VER_CHROMA_S_AVX2_4x4 1 +%macro FILTER_VER_LUMA_AVX2_32xN 3 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m - add r1d, r1d - shl r4d, 6 - sub r0, r1 - + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r4, [r1 * 3] -%ifidn %1,sp - mova m6, [pd_526336] -%else + sub r0, r4 +%ifidn %3,ps add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else 
+ mova m14, [pw_512] %endif - - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - -%ifidn %1,sp - paddd m0, m6 - paddd m2, m6 - psrad m0, 12 - psrad m2, 12 + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp + add r2, 16 %else - psrad m0, 6 - psrad m2, 6 + add r2, 32 %endif - packssdw m0, m2 - vextracti128 xm2, m0, 1 - lea r4, [r3 * 3] - -%ifidn %1,sp - packuswb xm0, xm2 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r4], xm0, 3 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 16] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 16] %else - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 + lea r2, [r8 + r3 * 4 - 32] %endif + dec r9d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_4x4 sp - FILTER_VER_CHROMA_S_AVX2_4x4 ss + FILTER_VER_LUMA_AVX2_32xN 32, 32, pp + FILTER_VER_LUMA_AVX2_32xN 32, 64, pp + FILTER_VER_LUMA_AVX2_32xN 32, 32, ps + FILTER_VER_LUMA_AVX2_32xN 32, 64, ps -%macro FILTER_VER_CHROMA_S_AVX2_4x8 1 +%macro FILTER_VER_LUMA_AVX2_32x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + 
lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else + sub r0, r4 +%ifidn %1,ps add r3d, r3d -%endif - - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - lea r4, [r3 * 3] - -%ifidn %1,sp - paddd m0, m7 - paddd m2, m7 - paddd m4, m7 - paddd m1, m7 - psrad m0, 12 - psrad m2, 12 - psrad m4, 12 - psrad m1, 12 + vbroadcasti128 m14, [pw_2000] %else - psrad m0, 6 - psrad m2, 6 - psrad m4, 6 - psrad m1, 6 + mova m14, [pw_512] %endif - packssdw m0, m2 - packssdw m4, m1 -%ifidn %1,sp - packuswb m0, m4 - vextracti128 xm2, m0, 1 - movd [r2], xm0 - movd [r2 + r3], xm2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 3 - pextrd [r2 + r4], xm2, 3 + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - vextracti128 xm2, m0, 1 - vextracti128 xm1, m4, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], 
xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r4], xm1 + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_4x8 sp - FILTER_VER_CHROMA_S_AVX2_4x8 ss - -%macro PROCESS_CHROMA_AVX2_W4_16R 1 - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m1, m3 - pmaddwd m6, [r5] + FILTER_VER_LUMA_AVX2_32x16 pp + FILTER_VER_LUMA_AVX2_32x16 ps -%ifidn %1,sp - paddd m0, m7 - paddd m2, m7 - paddd m4, m7 - paddd m1, m7 - psrad m4, 12 - psrad m1, 12 - psrad m0, 12 - psrad m2, 12 +%macro FILTER_VER_LUMA_AVX2_32x24 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - psrad m0, 6 - psrad m2, 6 - psrad m4, 6 - psrad m1, 6 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - packssdw m0, m2 - packssdw m4, m1 -%ifidn %1,sp - packuswb m0, m4 - vextracti128 xm4, m0, 1 - movd [r2], xm0 - movd [r2 + r3], xm4 - pextrd [r2 + r3 * 2], 
xm0, 1 - pextrd [r2 + r6], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm4, 2 - pextrd [r2 + r3 * 2], xm0, 3 - pextrd [r2 + r6], xm4, 3 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d +%endif + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] %else - vextracti128 xm2, m0, 1 - vextracti128 xm1, m4, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 + vbroadcasti128 m14, [pw_2000] +%endif + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 16] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + mov r9d, 2 +.loop: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loop + RET %endif +%endmacro - movq xm2, [r0 + r4] - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] - pmaddwd m2, m5, [r5 + 1 * mmsize] - paddd m6, m2 - pmaddwd m5, [r5] - movq xm2, [r0 + r1] - punpcklwd xm0, xm2 - movq xm3, [r0 + 2 * r1] - punpcklwd xm2, xm3 - vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m0, [r5] - movq xm4, [r0 + r4] - punpcklwd xm3, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] - punpcklwd xm4, xm1 - vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] - pmaddwd m4, m3, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m3, [r5] - movq xm4, [r0 + r1] - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] - pmaddwd m1, [r5 + 1 * mmsize] - paddd m3, m1 + FILTER_VER_LUMA_AVX2_32x24 pp + FILTER_VER_LUMA_AVX2_32x24 ps -%ifidn %1,sp - paddd m6, m7 - paddd m5, m7 - 
paddd m0, m7 - paddd m3, m7 - psrad m6, 12 - psrad m5, 12 - psrad m0, 12 - psrad m3, 12 +%macro FILTER_VER_LUMA_AVX2_32x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x8, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - psrad m6, 6 - psrad m5, 6 - psrad m0, 6 - psrad m3, 6 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - packssdw m6, m5 - packssdw m0, m3 - lea r2, [r2 + r3 * 4] -%ifidn %1,sp - packuswb m6, m0 - vextracti128 xm0, m6, 1 - movd [r2], xm6 - movd [r2 + r3], xm0 - pextrd [r2 + r3 * 2], xm6, 1 - pextrd [r2 + r6], xm0, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm6, 2 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm6, 3 - pextrd [r2 + r6], xm0, 3 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d +%endif + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] %else - vextracti128 xm5, m6, 1 - vextracti128 xm3, m0, 1 - movq [r2], xm6 - movq [r2 + r3], xm5 - movhps [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm5 - lea r2, [r2 + r3 * 4] - movq [r2], xm0 - movq [r2 + r3], xm3 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm3 + vbroadcasti128 m14, [pw_2000] +%endif + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET %endif %endmacro -%macro FILTER_VER_CHROMA_S_AVX2_4x16 1 + FILTER_VER_LUMA_AVX2_32x8 pp + FILTER_VER_LUMA_AVX2_32x8 ps + +%macro FILTER_VER_LUMA_AVX2_48x64 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_48x64, 4, 12, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else + sub r0, r4 + +%ifidn %1,ps add r3d, r3d %endif + lea r6, [r3 * 3] - PROCESS_CHROMA_AVX2_W4_16R %1 + 
lea r11, [r1 * 4] + +%ifidn %1,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + + mov r9d, 4 +.loopH: + mov r10d, 3 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 32] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 32] +%else + lea r2, [r8 + r3 * 4 - 64] +%endif + dec r9d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_4x16 sp - FILTER_VER_CHROMA_S_AVX2_4x16 ss + FILTER_VER_LUMA_AVX2_48x64 pp + FILTER_VER_LUMA_AVX2_48x64 ps -%macro FILTER_VER_CHROMA_S_AVX2_4x32 1 +%macro FILTER_VER_LUMA_AVX2_64xN 3 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x32, 4, 7, 8 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else + sub r0, r4 + +%ifidn %3,ps add r3d, r3d %endif + lea r6, [r3 * 3] -%rep 2 - PROCESS_CHROMA_AVX2_W4_16R %1 - lea r2, [r2 + r3 * 4] -%endrep + lea r11, [r1 * 4] + +%ifidn %3,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 48] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 48] +%else + lea r2, [r8 + r3 * 4 - 96] +%endif + dec r9d + jnz .loopH RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_4x32 sp - FILTER_VER_CHROMA_S_AVX2_4x32 ss + FILTER_VER_LUMA_AVX2_64xN 64, 32, pp + FILTER_VER_LUMA_AVX2_64xN 64, 48, pp + FILTER_VER_LUMA_AVX2_64xN 64, 64, pp + FILTER_VER_LUMA_AVX2_64xN 64, 32, ps + FILTER_VER_LUMA_AVX2_64xN 64, 48, ps + FILTER_VER_LUMA_AVX2_64xN 64, 64, ps -%macro FILTER_VER_CHROMA_S_AVX2_4x2 
1 +%macro FILTER_VER_LUMA_AVX2_64x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_64x16, 4, 10, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m5, [pd_526336] -%else + sub r0, r4 + +%ifidn %1,ps add r3d, r3d %endif - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - movq xm4, [r0 + 4 * r1] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 -%ifidn %1,sp - paddd m0, m5 - psrad m0, 12 + + lea r6, [r3 * 3] + +%ifidn %1,pp + mova m14, [pw_512] %else - psrad m0, 6 + vbroadcasti128 m14, [pw_2000] %endif - vextracti128 xm1, m0, 1 - packssdw xm0, xm1 -%ifidn %1,sp - packuswb xm0, xm0 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 + + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - movq [r2], xm0 - movhps [r2 + r3], xm0 + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_4x2 sp - FILTER_VER_CHROMA_S_AVX2_4x2 ss + FILTER_VER_LUMA_AVX2_64x16 pp + FILTER_VER_LUMA_AVX2_64x16 ps -%macro FILTER_VER_CHROMA_S_AVX2_2x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA 3 
+INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif %ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] %else - lea r5, [pw_ChromaCoeffV + r4] + lea r6, [tab_LumaCoeffVer + r4] %endif - lea r4, [r1 * 3] -%ifidn %1,sp - mova m5, [pd_526336] +%ifidn %3,pp + mova m3, [pw_512] %else - add r3d, r3d + mova m3, [pw_2000] %endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 -%ifidn %1,sp - paddd m0, m5 - psrad m0, 12 + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/8) +.loopW: + PROCESS_LUMA_W8_4R +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 %else - psrad m0, 6 + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 %endif - vextracti128 xm1, m0, 1 - packssdw xm0, xm1 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb xm0, xm0 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm0, 2 - pextrw [r2 + r4], xm0, 3 + + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 %else - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + 2 * r3], xm0, 2 - pextrd [r2 + r4], xm0, 3 + add r2, 
16 +%endif + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - %1] +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - %1] +%else + lea r2, [r2 + 4 * r3 - 2 * %1] %endif + + dec dword [rsp] + jnz .loopH + RET %endmacro - FILTER_VER_CHROMA_S_AVX2_2x4 sp - FILTER_VER_CHROMA_S_AVX2_2x4 ss + FILTER_VER_LUMA 16, 4, pp + FILTER_VER_LUMA 16, 8, pp + FILTER_VER_LUMA 16, 12, pp + FILTER_VER_LUMA 16, 16, pp + FILTER_VER_LUMA 16, 32, pp + FILTER_VER_LUMA 16, 64, pp + FILTER_VER_LUMA 24, 32, pp + FILTER_VER_LUMA 32, 8, pp + FILTER_VER_LUMA 32, 16, pp + FILTER_VER_LUMA 32, 24, pp + FILTER_VER_LUMA 32, 32, pp + FILTER_VER_LUMA 32, 64, pp + FILTER_VER_LUMA 48, 64, pp + FILTER_VER_LUMA 64, 16, pp + FILTER_VER_LUMA 64, 32, pp + FILTER_VER_LUMA 64, 48, pp + FILTER_VER_LUMA 64, 64, pp + + FILTER_VER_LUMA 16, 4, ps + FILTER_VER_LUMA 16, 8, ps + FILTER_VER_LUMA 16, 12, ps + FILTER_VER_LUMA 16, 16, ps + FILTER_VER_LUMA 16, 32, ps + FILTER_VER_LUMA 16, 64, ps + FILTER_VER_LUMA 24, 32, ps + FILTER_VER_LUMA 32, 8, ps + FILTER_VER_LUMA 32, 16, ps + FILTER_VER_LUMA 32, 24, ps + FILTER_VER_LUMA 32, 32, ps + FILTER_VER_LUMA 32, 64, ps + FILTER_VER_LUMA 48, 64, ps + FILTER_VER_LUMA 64, 16, ps + FILTER_VER_LUMA 64, 32, ps + FILTER_VER_LUMA 64, 48, ps + FILTER_VER_LUMA 64, 64, ps + +%macro PROCESS_LUMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + 
paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize -%macro FILTER_VER_CHROMA_S_AVX2_8x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + add r1d, r1d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 %ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] %else - lea r5, [pw_ChromaCoeffV + r4] + lea r6, [tab_LumaCoeffV + r4] %endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif + mova m7, [pd_526336] - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, 
xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] - paddd m1, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_SP_W4_4R - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm3, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm3, 1 - pmaddwd m3, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m3 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 -%endif - lea r2, 
[r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 + packssdw m0, m1 + packssdw m2, m3 - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 
48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 + +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal filterPixelToShort_4x2, 3, 4, 3 + mov r3d, r3m + add r3d, r3d + + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] + + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m1 + pmaddubsw m0, m2 + + movq [r2 + r3 * 0], m0 + movhps [r2 + r3 * 1], m0 -%ifidn %1,sp - paddd m6, m7 - paddd m1, m7 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r4], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm1 -%endif RET -%endmacro - FILTER_VER_CHROMA_S_AVX2_8x8 sp - FILTER_VER_CHROMA_S_AVX2_8x8 ss +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal filterPixelToShort_8x2, 3, 4, 3 + mov r3d, r3m + add r3d, r3d -%macro PROCESS_CHROMA_S_AVX2_W8_16R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, 
m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] -%ifidn %1,sp - paddd m0, m9 - paddd m1, m9 - paddd m2, m9 - paddd m3, m9 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 -%else - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m0, m1 - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhwd xm0, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm0, 1 - pmaddwd m0, m7, [r5 + 1 * mmsize] - paddd m5, m0 - pmaddwd m7, [r5] - movu xm0, [r7 + r1] ; m0 = row 9 - punpckhwd xm1, xm8, xm0 - punpcklwd xm8, xm0 - vinserti128 m8, m8, xm1, 1 - pmaddwd m1, m8, [r5 + 1 * mmsize] - paddd m6, m1 - pmaddwd m8, [r5] - movu xm1, [r7 + r1 * 2] ; m1 = row 10 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 
m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m7, m2 - pmaddwd m0, [r5] -%ifidn %1,sp - paddd m4, m9 - paddd m5, m9 - psrad m4, 12 - psrad m5, 12 - paddd m6, m9 - paddd m7, m9 - psrad m6, 12 - psrad m7, 12 -%else - psrad m4, 6 - psrad m5, 6 - psrad m6, 6 - psrad m7, 6 -%endif - packssdw m4, m5 - packssdw m6, m7 - lea r8, [r2 + r3 * 4] -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm5 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%endif + movh m0, [r0] + punpcklbw m0, m1 + pmaddubsw m0, m2 + movu [r2 + r3 * 0], m0 - movu xm2, [r7 + r4] ; m2 = row 11 - punpckhwd xm4, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm4, 1 - pmaddwd m4, m1, [r5 + 1 * mmsize] - paddd m8, m4 - pmaddwd m1, [r5] - lea r7, [r7 + r1 * 4] - movu xm4, [r7] ; m4 = row 12 - punpckhwd xm5, xm2, xm4 - punpcklwd xm2, xm4 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m0, m5 - pmaddwd m2, [r5] - movu xm5, [r7 + r1] ; m5 = row 13 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m1, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 14 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m2, m7 - pmaddwd m5, [r5] -%ifidn %1,sp - paddd m8, m9 - paddd m0, m9 - paddd m1, m9 - paddd m2, m9 - psrad m8, 12 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 -%else - psrad m8, 6 - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 -%endif - packssdw m8, m0 - packssdw m1, m2 - lea r8, [r8 + r3 * 4] -%ifidn %1,sp - packuswb m8, m1 - vpermd m8, m3, m8 - vextracti128 xm1, m8, 1 - movq [r8], xm8 - movhps [r8 + r3], xm8 - movq [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm1 -%else - 
vpermq m8, m8, 11011000b - vpermq m1, m1, 11011000b - vextracti128 xm0, m8, 1 - vextracti128 xm2, m1, 1 - movu [r8], xm8 - movu [r8 + r3], xm0 - movu [r8 + r3 * 2], xm1 - movu [r8 + r6], xm2 -%endif - lea r8, [r8 + r3 * 4] + movh m0, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m0, m2 + movu [r2 + r3 * 1], m0 - movu xm7, [r7 + r4] ; m7 = row 15 - punpckhwd xm2, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddwd m2, m6, [r5 + 1 * mmsize] - paddd m4, m2 - pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhwd xm1, xm7, xm2 - punpcklwd xm7, xm2 - vinserti128 m7, m7, xm1, 1 - pmaddwd m1, m7, [r5 + 1 * mmsize] - paddd m5, m1 - pmaddwd m7, [r5] - movu xm1, [r7 + r1] ; m1 = row 17 - punpckhwd xm0, xm2, xm1 - punpcklwd xm2, xm1 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m6, m2 - movu xm0, [r7 + r1 * 2] ; m0 = row 18 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m7, m1 + RET -%ifidn %1,sp - paddd m4, m9 - paddd m5, m9 - paddd m6, m9 - paddd m7, m9 - psrad m4, 12 - psrad m5, 12 - psrad m6, 12 - psrad m7, 12 +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 16] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 16] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + 
+;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] %else - psrad m4, 6 - psrad m5, 6 - psrad m6, 6 - psrad m7, 6 + lea r6, [tab_ChromaCoeffV + r4] %endif - packssdw m4, m5 - packssdw m6, m7 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 + + mova m6, [pd_526336] + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP 4, 4 + FILTER_VER_CHROMA_SP 4, 8 + FILTER_VER_CHROMA_SP 16, 16 + FILTER_VER_CHROMA_SP 16, 8 + FILTER_VER_CHROMA_SP 16, 12 + FILTER_VER_CHROMA_SP 12, 16 + FILTER_VER_CHROMA_SP 16, 4 + FILTER_VER_CHROMA_SP 4, 16 + FILTER_VER_CHROMA_SP 32, 32 + FILTER_VER_CHROMA_SP 32, 16 + FILTER_VER_CHROMA_SP 16, 32 + FILTER_VER_CHROMA_SP 32, 24 + FILTER_VER_CHROMA_SP 24, 32 + FILTER_VER_CHROMA_SP 32, 8 + + FILTER_VER_CHROMA_SP 16, 24 + FILTER_VER_CHROMA_SP 16, 64 + FILTER_VER_CHROMA_SP 12, 32 + FILTER_VER_CHROMA_SP 4, 32 + 
FILTER_VER_CHROMA_SP 32, 64 + FILTER_VER_CHROMA_SP 32, 48 + FILTER_VER_CHROMA_SP 24, 64 + + FILTER_VER_CHROMA_SP 64, 64 + FILTER_VER_CHROMA_SP 64, 32 + FILTER_VER_CHROMA_SP 64, 48 + FILTER_VER_CHROMA_SP 48, 64 + FILTER_VER_CHROMA_SP 64, 16 + + +%macro PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm5 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 + lea r5, [tab_ChromaCoeffV + r4] %endif + + mova m5, [pd_526336] + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + paddd m0, m5 + paddd m2, m5 + + psrad m0, 12 + psrad m2, 
12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET %endmacro -%macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + FILTER_VER_CHROMA_SP_W2_4R 2, 4 + FILTER_VER_CHROMA_SP_W2_4R 2, 8 + + FILTER_VER_CHROMA_SP_W2_4R 2, 16 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 %ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_ChromaCoeffV + r4] %endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m9, [pd_526336] + + mova m4, [pd_526336] + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + paddd m0, m4 + psrad m0, 12 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1 = [1+2+3+4] Row2 done + paddd m1, m4 + psrad m1, 12 + + packssdw m0, m1 + packuswb m0, m0 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_6x%2(int16_t 
*src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] %else - add r3d, r3d + lea r6, [tab_ChromaCoeffV + r4] %endif - lea r6, [r3 * 3] - mov r9d, %2 / 8 -.loopW: - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 + + mova m6, [pd_526336] + + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + paddd m0, m6 + paddd m2, m6 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 4] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP_W6_H4 6, 8 + + FILTER_VER_CHROMA_SP_W6_H4 6, 16 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 16] ;m6 = 
[2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - add r2, 16 + lea r5, [tab_ChromaCoeffV + r4] %endif - add r0, 16 - dec r9d - jnz .loopW + + mova m7, [pd_526336] + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + RET -%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64 - FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 - FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 - FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64 + FILTER_VER_CHROMA_SP_W8_H2 8, 2 + FILTER_VER_CHROMA_SP_W8_H2 8, 4 + FILTER_VER_CHROMA_SP_W8_H2 8, 6 + FILTER_VER_CHROMA_SP_W8_H2 8, 8 + FILTER_VER_CHROMA_SP_W8_H2 8, 16 + FILTER_VER_CHROMA_SP_W8_H2 8, 32 + + FILTER_VER_CHROMA_SP_W8_H2 8, 12 + FILTER_VER_CHROMA_SP_W8_H2 8, 64 -%macro FILTER_VER_CHROMA_S_AVX2_NxN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 - mov r4d, r4m - shl 
r4d, 6 - add r1d, r1d + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16n 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %3,sp - mova m9, [pd_526336] + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - add r3d, r3d + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - lea r6, [r3 * 3] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 8 + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + + mov r6d, %1/16 + .loopW: - PROCESS_CHROMA_S_AVX2_W8_16R %3 -%ifidn %3,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - dec r10d - jnz .loopW - lea r0, [r7 - 2 * %1 + 16] -%ifidn %3,sp - lea r2, [r8 + r3 * 4 - %1 + 8] -%else - lea r2, [r8 + r3 * 4 - 2 * %1 + 16] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, ss - FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp - 
FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss - FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss + movu m2, [r0] + movu m3, [r0 + r1] -%macro PROCESS_CHROMA_S_AVX2_W8_4R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 -%else - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m0, m1 - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 -%endif -%endmacro + punpcklbw m4, m2, m3 + punpckhbw m2, m3 -%macro FILTER_VER_CHROMA_S_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + pmaddubsw m4, m1 + pmaddubsw m2, m1 -%ifdef PIC - lea r5, 
[pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 - PROCESS_CHROMA_S_AVX2_W8_4R %1 - lea r4, [r3 * 3] -%ifidn %1,sp - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 -%else - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 -%endif + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + add r0, 16 + add r2, 32 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1 * 2] + + dec r4d + jnz .loop RET %endmacro - FILTER_VER_CHROMA_S_AVX2_8x4 sp - FILTER_VER_CHROMA_S_AVX2_8x4 ss + FILTER_V_PS_W16n 64, 64 + FILTER_V_PS_W16n 64, 32 + FILTER_V_PS_W16n 64, 48 + FILTER_V_PS_W16n 48, 64 + FILTER_V_PS_W16n 64, 16 + + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 -%macro FILTER_VER_CHROMA_S_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + mov r4d, r4m + sub r0, r1 + add r3d, r3d %ifdef PIC - lea r5, 
[pw_ChromaCoeffV] - add r5, r4 + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - lea r5, [pw_ChromaCoeffV + r4] + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - mova m7, m9 - PROCESS_CHROMA_AVX2_W4_16R %1 - RET -%endif -%endmacro + pshufb m0, [tab_Cm] - FILTER_VER_CHROMA_S_AVX2_12x16 sp - FILTER_VER_CHROMA_S_AVX2_12x16 ss + lea r5, [3 * r1] -%macro FILTER_VER_CHROMA_S_AVX2_12x32 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_12x32, 4, 9, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1, sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep 2 - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1, sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - mova m7, m9 - PROCESS_CHROMA_AVX2_W4_16R %1 - sub r0, 16 -%ifidn %1, sp - lea r2, [r2 + r3 * 4 - 8] -%else - lea r2, [r2 + r3 * 4 - 16] -%endif -%endrep - RET -%endif -%endmacro + pmaddubsw m2, m0 - FILTER_VER_CHROMA_S_AVX2_12x32 sp - FILTER_VER_CHROMA_S_AVX2_12x32 ss + lea r0, [r0 + 4 * r1] + movd m6, [r0] -%macro FILTER_VER_CHROMA_S_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif + pmaddubsw m3, m0 + phaddw m2, m3 - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m8, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 
3] -%rep 2 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 + mova m1, [pw_2000] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, m8 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - lea r8, [r2 + r3 * 4] + psubw m2, m1 - movu xm1, [r7 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r7, [r7 + r1 * 4] - 
movu xm0, [r7] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 + movd [r2], m2 + pextrd [r2 + r3], m2, 2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + phaddw m4, m5 + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pextrd [r2 + r3], m4, 2 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 - movu xm2, [r7 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r7 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] + mov r4d, r4m + sub r0, r1 + add r3d, r3d -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r8], 
xm4 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm1 + movd m0, [tab_ChromaCoeff + r4 * 4] %endif - lea r8, [r8 + r3 * 4] - movu xm7, [r7 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r7, [r7 + r1 * 4] - movu xm1, [r7] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m2, m8 - psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 + pshufb m0, [tab_Cm] - movu xm4, [r7 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 - movu xm2, [r7 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m7, m4 -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 -%else - psrad m5, 6 - psrad m7, 6 -%endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 - movq [r8], xm0 - movhps [r8 + r3], xm0 - movq [r8 + r3 * 2], xm5 - movhps [r8 + r6], xm5 - add r2, 8 + mova m1, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + psubw m2, m1 + + + movd [r2], m2 + pshufd m2, m2, 2 + movd [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, 
m5 + + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pshufd m4 , m4 ,2 + movd [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + + RET +%endmacro + + FILTER_V_PS_W2 2, 8 + + FILTER_V_PS_W2 2, 16 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] %else - vpermq m0, m0, 11011000b - vpermq m5, m5, 11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r8], xm0 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm5 - movu [r8 + r6], xm6 - add r2, 16 + lea r6, [tab_ChromaCoeffV + r4] %endif - add r0, 16 -%endrep + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + RET -%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_16x12 sp - FILTER_VER_CHROMA_S_AVX2_16x12 ss + FILTER_VER_CHROMA_SS 4, 4 + FILTER_VER_CHROMA_SS 4, 8 + FILTER_VER_CHROMA_SS 16, 16 + FILTER_VER_CHROMA_SS 16, 8 + FILTER_VER_CHROMA_SS 16, 12 + FILTER_VER_CHROMA_SS 12, 16 + FILTER_VER_CHROMA_SS 16, 4 + FILTER_VER_CHROMA_SS 4, 16 + FILTER_VER_CHROMA_SS 32, 32 + FILTER_VER_CHROMA_SS 32, 16 + FILTER_VER_CHROMA_SS 16, 32 + FILTER_VER_CHROMA_SS 32, 24 + 
FILTER_VER_CHROMA_SS 24, 32 + FILTER_VER_CHROMA_SS 32, 8 + + FILTER_VER_CHROMA_SS 16, 24 + FILTER_VER_CHROMA_SS 12, 32 + FILTER_VER_CHROMA_SS 4, 32 + FILTER_VER_CHROMA_SS 32, 64 + FILTER_VER_CHROMA_SS 16, 64 + FILTER_VER_CHROMA_SS 32, 48 + FILTER_VER_CHROMA_SS 24, 64 -%macro FILTER_VER_CHROMA_S_AVX2_8x12 1 -%if ARCH_X86_64 == 1 + FILTER_VER_CHROMA_SS 64, 64 + FILTER_VER_CHROMA_SS 64, 32 + FILTER_VER_CHROMA_SS 64, 48 + FILTER_VER_CHROMA_SS 48, 64 + FILTER_VER_CHROMA_SS 64, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_4x4 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 mov r4d, r4m - shl r4d, 6 add r1d, r1d + shl r4d, 6 + sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -20695,184 +15907,39 @@ cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 %endif lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,sp - mova m8, [pd_526336] + mova m6, [pd_526336] %else add r3d, r3d %endif - lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 + + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - 
punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, m8 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] - -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb 
m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm1 -%endif - lea r2, [r2 + r3 * 4] - - movu xm7, [r0 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r0, [r0 + r1 * 4] - movu xm1, [r0] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m4, [r5 + 1 * mmsize] paddd m2, m4 - pmaddwd m7, [r5] + %ifidn %1,sp - paddd m0, m8 - paddd m2, m8 + paddd m0, m6 + paddd m2, m6 psrad m0, 12 psrad m2, 12 %else @@ -20880,60 +15947,34 @@ cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 psrad m2, 6 %endif packssdw m0, m2 + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] - movu xm4, [r0 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 - movu xm2, [r0 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m7, m4 %ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 %else - psrad m5, 6 - psrad m7, 6 -%endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 -%else - vpermq m0, m0, 11011000b - vpermq m5, m5, 
11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r2], xm0 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm5 - movu [r2 + r6], xm6 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 %endif RET -%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_8x12 sp - FILTER_VER_CHROMA_S_AVX2_8x12 ss + FILTER_VER_CHROMA_S_AVX2_4x4 sp + FILTER_VER_CHROMA_S_AVX2_4x4 ss -%macro FILTER_VER_CHROMA_S_AVX2_16x4 1 +%macro FILTER_VER_CHROMA_S_AVX2_4x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 +cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d + sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -20943,196 +15984,275 @@ cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 %endif lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif -%rep 2 - PROCESS_CHROMA_S_AVX2_W8_4R %1 - lea r6, [r3 * 3] -%ifidn %1,sp - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 - add r2, 8 -%else - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - add r2, 16 -%endif - lea r6, [4 * r1 - 16] - sub r0, r6 -%endrep - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_16x4 sp - FILTER_VER_CHROMA_S_AVX2_16x4 ss -%macro PROCESS_CHROMA_S_AVX2_W8_8R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 + lea r0, 
[r0 + 4 * r1] + movq xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + lea r4, [r3 * 3] + %ifidn %1,sp paddd m0, m7 + paddd m2, m7 + paddd m4, m7 paddd m1, m7 psrad m0, 12 + psrad m2, 12 + psrad m4, 12 psrad m1, 12 %else psrad m0, 6 + psrad m2, 6 + psrad m4, 6 psrad m1, 6 %endif - packssdw m0, m1 - - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 + packssdw m0, m2 + packssdw m4, m1 %ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 + packuswb m0, m4 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r4], xm2, 3 %else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 
vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r4], xm1 %endif - lea r8, [r2 + r3 * 4] + RET +%endmacro - movu xm1, [r7 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r7, [r7 + r1 * 4] - movu xm0, [r7] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] + FILTER_VER_CHROMA_S_AVX2_4x8 sp + FILTER_VER_CHROMA_S_AVX2_4x8 ss + +%macro PROCESS_CHROMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 pmaddwd m1, [r5] - paddd m5, m2 + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] 
+ pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] + %ifidn %1,sp + paddd m0, m7 + paddd m2, m7 paddd m4, m7 - paddd m5, m7 + paddd m1, m7 psrad m4, 12 - psrad m5, 12 + psrad m1, 12 + psrad m0, 12 + psrad m2, 12 %else + psrad m0, 6 + psrad m2, 6 psrad m4, 6 - psrad m5, 6 + psrad m1, 6 +%endif + packssdw m0, m2 + packssdw m4, m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm4, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm4 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm4, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r6], xm4, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 %endif - packssdw m4, m5 - movu xm2, [r7 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r7 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 
1 ; m1 = [18 17 17 16] + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 %ifidn %1,sp paddd m6, m7 - paddd m1, m7 + paddd m5, m7 + paddd m0, m7 + paddd m3, m7 psrad m6, 12 - psrad m1, 12 + psrad m5, 12 + psrad m0, 12 + psrad m3, 12 %else psrad m6, 6 - psrad m1, 6 + psrad m5, 6 + psrad m0, 6 + psrad m3, 6 %endif - packssdw m6, m1 + packssdw m6, m5 + packssdw m0, m3 + lea r2, [r2 + r3 * 4] + %ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movd [r2], xm6 + movd [r2 + r3], xm0 + pextrd [r2 + r3 * 2], xm6, 1 + pextrd [r2 + r6], xm0, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm6, 2 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm6, 3 + pextrd [r2 + r6], xm0, 3 %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm1 + vextracti128 xm5, m6, 1 + vextracti128 xm3, m0, 1 + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 %endif %endmacro -%macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 +%macro FILTER_VER_CHROMA_S_AVX2_4x16 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 +cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d + sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -21142,37 +16262,58 @@ cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 %endif lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] -%rep %2 / 8 - PROCESS_CHROMA_S_AVX2_W8_8R %1 + PROCESS_CHROMA_AVX2_W4_16R %1 + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x16 sp + FILTER_VER_CHROMA_S_AVX2_4x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x32 1 +INIT_YMM avx2 
+cglobal interp_4tap_vert_%1_4x32, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] %ifidn %1,sp - add r2, 8 + mova m7, [pd_526336] %else - add r2, 16 + add r3d, r3d %endif - add r0, 16 + lea r6, [r3 * 3] +%rep 2 + PROCESS_CHROMA_AVX2_W4_16R %1 + lea r2, [r2 + r3 * 4] %endrep RET -%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 - FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 + FILTER_VER_CHROMA_S_AVX2_4x32 sp + FILTER_VER_CHROMA_S_AVX2_4x32 ss -%macro FILTER_VER_CHROMA_S_AVX2_8x2 1 +%macro FILTER_VER_CHROMA_S_AVX2_4x2 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 mov r4d, r4m shl r4d, 6 add r1d, r1d + sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -21182,67 +16323,120 @@ cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 %endif lea r4, [r1 * 3] - sub r0, r1 %ifidn %1,sp mova m5, [pd_526336] %else add r3d, r3d %endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 + movq xm4, [r0 + 4 * r1] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 - movu xm4, [r0 + r1 * 4] ; m4 = row 4 - punpckhwd xm2, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddwd m3, [r5 + 1 * mmsize] - paddd m1, m3 %ifidn %1,sp paddd m0, m5 - paddd m1, m5 psrad m0, 12 - 
psrad m1, 12 %else psrad m0, 6 - psrad m1, 6 %endif - packssdw m0, m1 -%ifidn %1,sp vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - pshufd xm0, xm0, 11011000b + packssdw xm0, xm1 +%ifidn %1,sp + packuswb xm0, xm0 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 +%else movq [r2], xm0 movhps [r2 + r3], xm0 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x2 sp + FILTER_VER_CHROMA_S_AVX2_4x2 ss + +%macro FILTER_VER_CHROMA_S_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m5, [pd_526336] +%else + add r3d, r3d +%endif + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 +%ifidn %1,sp + paddd m0, m5 + psrad m0, 12 %else - vpermq m0, m0, 11011000b + psrad m0, 6 +%endif vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 + packssdw xm0, xm1 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb xm0, xm0 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 2 + pextrw [r2 + r4], xm0, 3 +%else + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + 2 * r3], xm0, 2 + pextrd [r2 + r4], xm0, 3 %endif RET %endmacro - FILTER_VER_CHROMA_S_AVX2_8x2 sp - FILTER_VER_CHROMA_S_AVX2_8x2 ss + FILTER_VER_CHROMA_S_AVX2_2x4 sp + FILTER_VER_CHROMA_S_AVX2_2x4 ss -%macro FILTER_VER_CHROMA_S_AVX2_8x6 1 +%macro 
FILTER_VER_CHROMA_S_AVX2_8x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d @@ -21328,85 +16522,98 @@ cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 punpckhwd xm3, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm3, 1 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m4, m6 - movu xm6, [r0 + r1 * 4] ; m6 = row 8 - punpckhwd xm3, xm1, xm6 - punpcklwd xm1, xm6 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 + pmaddwd m3, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 + lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 - vextracti128 xm5, m4, 1 - packuswb xm4, xm5 - pshufd xm4, xm4, 11011000b movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movhps [r2 + r3], xm4 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 +%endif lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 %endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x6 sp - FILTER_VER_CHROMA_S_AVX2_8x6 ss + packssdw m4, m5 -%macro FILTER_VER_CHROMA_S_AVX2_8xN 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, 
xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 %else - lea r5, [pw_ChromaCoeffV + r4] + psrad m6, 6 + psrad m1, 6 %endif - - lea r4, [r1 * 3] - sub r0, r1 + packssdw m6, m1 %ifidn %1,sp - mova m8, [pd_526336] + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 %else - add r3d, r3d + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm1 %endif - lea r6, [r3 * 3] -%rep %2 / 16 + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x8 sp + FILTER_VER_CHROMA_S_AVX2_8x8 ss + +%macro PROCESS_CHROMA_S_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -21425,48 +16632,44 @@ cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 + movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 + movu xm6, [r7 + r1 * 2] ; m6 = row 6 
+ punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 pmaddwd m5, [r5] - paddd m3, m1 %ifidn %1,sp - paddd m2, m8 - paddd m3, m8 + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + paddd m3, m9 + psrad m0, 12 + psrad m1, 12 psrad m2, 12 psrad m3, 12 %else + psrad m0, 6 + psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif + packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 @@ -21480,239 +16683,216 @@ cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 + vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 + movu [r2], xm0 + movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif - lea r2, [r2 + r3 * 4] - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm0, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm0, 1 + pmaddwd m0, m7, [r5 + 1 * mmsize] + paddd m5, m0 + pmaddwd m7, [r5] + movu xm0, [r7 + r1] ; m0 = row 9 + punpckhwd xm1, xm8, xm0 + punpcklwd xm8, xm0 + 
vinserti128 m8, m8, xm1, 1 + pmaddwd m1, m8, [r5 + 1 * mmsize] + paddd m6, m1 + pmaddwd m8, [r5] + movu xm1, [r7 + r1 * 2] ; m1 = row 10 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m7, m2 pmaddwd m0, [r5] - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] - %ifidn %1,sp - paddd m6, m8 - paddd m1, m8 + paddd m4, m9 + paddd m5, m9 + psrad m4, 12 + psrad m5, 12 + paddd m6, m9 + paddd m7, m9 psrad m6, 12 - psrad m1, 12 + psrad m7, 12 %else + psrad m4, 6 + psrad m5, 6 psrad m6, 6 - psrad m1, 6 + psrad m7, 6 %endif - packssdw m6, m1 + packssdw m4, m5 + packssdw m6, m7 + lea r8, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %endif - lea r2, [r2 + r3 * 4] - movu xm7, [r0 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 + movu xm2, [r7 + r4] ; m2 = row 11 + punpckhwd xm4, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm4, 1 + pmaddwd m4, m1, [r5 + 1 * mmsize] + paddd m8, m4 + pmaddwd m1, [r5] + lea r7, [r7 + r1 * 4] + movu xm4, [r7] ; m4 = row 12 + punpckhwd xm5, xm2, xm4 + punpcklwd xm2, xm4 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m0, m5 + pmaddwd m2, [r5] + movu xm5, [r7 + r1] ; m5 = 
row 13 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 14 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m2, m7 pmaddwd m5, [r5] - lea r0, [r0 + r1 * 4] - movu xm1, [r0] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] %ifidn %1,sp - paddd m0, m8 - paddd m2, m8 + paddd m8, m9 + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + psrad m8, 12 psrad m0, 12 + psrad m1, 12 psrad m2, 12 %else + psrad m8, 6 psrad m0, 6 + psrad m1, 6 psrad m2, 6 %endif - packssdw m0, m2 - - movu xm4, [r0 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m1, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m7, m6 - pmaddwd m4, [r5] -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 -%else - psrad m5, 6 - psrad m7, 6 -%endif - packssdw m5, m7 + packssdw m8, m0 + packssdw m1, m2 + lea r8, [r8 + r3 * 4] %ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 + packuswb m8, m1 + vpermd m8, m3, m8 + vextracti128 xm1, m8, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm1 %else - vpermq m0, m0, 11011000b - vpermq m5, m5, 11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r2], xm0 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm5 - movu [r2 + r6], xm6 + vpermq m8, m8, 11011000b + vpermq m1, m1, 11011000b + vextracti128 xm0, m8, 1 + vextracti128 xm2, m1, 1 + movu [r8], xm8 + movu 
[r8 + r3], xm0 + movu [r8 + r3 * 2], xm1 + movu [r8 + r6], xm2 %endif - lea r2, [r2 + r3 * 4] + lea r8, [r8 + r3 * 4] - movu xm6, [r0 + r4] ; m6 = row 15 - punpckhwd xm5, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 16 - punpckhwd xm5, xm6, xm0 - punpcklwd xm6, xm0 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 1 * mmsize] - paddd m4, m5 + movu xm7, [r7 + r4] ; m7 = row 15 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 1 * mmsize] + paddd m4, m2 pmaddwd m6, [r5] -%ifidn %1,sp - paddd m1, m8 - paddd m4, m8 - psrad m1, 12 - psrad m4, 12 -%else - psrad m1, 6 - psrad m4, 6 -%endif - packssdw m1, m4 + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm1, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm1, 1 + pmaddwd m1, m7, [r5 + 1 * mmsize] + paddd m5, m1 + pmaddwd m7, [r5] + movu xm1, [r7 + r1] ; m1 = row 17 + punpckhwd xm0, xm2, xm1 + punpcklwd xm2, xm1 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m6, m2 + movu xm0, [r7 + r1 * 2] ; m0 = row 18 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m7, m1 - movu xm5, [r0 + r1] ; m5 = row 17 - punpckhwd xm4, xm0, xm5 - punpcklwd xm0, xm5 - vinserti128 m0, m0, xm4, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m2, m0 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm0, xm5, xm4 - punpcklwd xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m6, m5 %ifidn %1,sp - paddd m2, m8 - paddd m6, m8 - psrad m2, 12 + paddd m4, m9 + paddd m5, m9 + paddd m6, m9 + paddd m7, m9 + psrad m4, 12 + psrad m5, 12 psrad m6, 12 + psrad m7, 12 %else - psrad m2, 6 + psrad m4, 6 + psrad m5, 6 psrad m6, 6 + psrad m7, 6 %endif - packssdw m2, m6 + packssdw m4, m5 + packssdw m6, m7 %ifidn %1,sp - packuswb m1, m2 - vpermd m1, 
m3, m1 - vextracti128 xm2, m1, 1 - movq [r2], xm1 - movhps [r2 + r3], xm1 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m1, m1, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm6, m1, 1 - vextracti128 xm4, m2, 1 - movu [r2], xm1 - movu [r2 + r3], xm6 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm4 -%endif - lea r2, [r2 + r3 * 4] -%endrep - RET + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 - FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 - FILTER_VER_CHROMA_S_AVX2_8xN sp, 64 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 64 - -%macro FILTER_VER_CHROMA_S_AVX2_Nx24 2 -%if ARCH_X86_64 == 1 +%macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d @@ -21723,7 +16903,6 @@ cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 %else lea r5, [pw_ChromaCoeffV + r4] %endif - lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp @@ -21743,40 +16922,162 @@ cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 add r0, 16 dec r9d jnz .loopW -%ifidn %1,sp - lea r2, [r8 + r3 * 4 - %2 + 8] + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64 + +%macro FILTER_VER_CHROMA_S_AVX2_NxN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, 
[pw_ChromaCoeffV] + add r5, r4 %else - lea r2, [r8 + r3 * 4 - 2 * %2 + 16] + lea r5, [pw_ChromaCoeffV + r4] %endif - lea r0, [r7 - 2 * %2 + 16] - mova m7, m9 - mov r9d, %2 / 8 -.loop: - PROCESS_CHROMA_S_AVX2_W8_8R %1 -%ifidn %1,sp + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %3,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %3 +%ifidn %3,sp add r2, 8 %else add r2, 16 %endif add r0, 16 + dec r10d + jnz .loopW + lea r0, [r7 - 2 * %1 + 16] +%ifidn %3,sp + lea r2, [r8 + r3 * 4 - %1 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] +%endif dec r9d - jnz .loop + jnz .loopH RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32 - FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16 + FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, ss + FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp + FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss + FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss -%macro FILTER_VER_CHROMA_S_AVX2_2x8 1 +%macro PROCESS_CHROMA_S_AVX2_W8_4R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, 
xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_8x4 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d - sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -21786,97 +17087,78 @@ cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 %endif lea r4, [r1 * 3] + sub r0, r1 %ifidn %1,sp - mova m6, [pd_526336] + mova m7, [pd_526336] %else add r3d, r3d %endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd 
xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movd xm1, [r0 + r4] - punpcklwd xm3, xm1 - lea r0, [r0 + 4 * r1] - movd xm2, [r0] - punpcklwd xm1, xm2 - punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] - vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] - movd xm1, [r0 + r1] - punpcklwd xm2, xm1 - movd xm5, [r0 + r1 * 2] - punpcklwd xm1, xm5 - punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] - vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] - pmaddwd m4, [r5] - pmaddwd m3, [r5 + 1 * mmsize] - paddd m4, m3 + + PROCESS_CHROMA_S_AVX2_W8_4R %1 + lea r4, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x4 sp + FILTER_VER_CHROMA_S_AVX2_8x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 %ifidn %1,sp - paddd m0, m6 - paddd m4, m6 - psrad m0, 12 - psrad m4, 12 + mova m9, [pd_526336] %else - psrad m0, 6 - psrad m4, 6 + add r3d, r3d %endif - packssdw m0, m4 - vextracti128 xm4, m0, 1 - lea r4, [r3 * 3] + lea r6, [r3 * 3] + PROCESS_CHROMA_S_AVX2_W8_16R %1 %ifidn %1,sp - packuswb xm0, xm4 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm0, 4 - pextrw [r2 + r4], xm0, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 2 - pextrw [r2 + r3], xm0, 3 - pextrw [r2 + 2 * r3], xm0, 6 - pextrw [r2 + r4], xm0, 7 + add 
r2, 8 %else - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + 2 * r3], xm4 - pextrd [r2 + r4], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + 2 * r3], xm4, 2 - pextrd [r2 + r4], xm4, 3 + add r2, 16 %endif + add r0, 16 + mova m7, m9 + PROCESS_CHROMA_AVX2_W4_16R %1 RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_2x8 sp - FILTER_VER_CHROMA_S_AVX2_2x8 ss + FILTER_VER_CHROMA_S_AVX2_12x16 sp + FILTER_VER_CHROMA_S_AVX2_12x16 ss -%macro FILTER_VER_CHROMA_S_AVX2_2x16 1 +%macro FILTER_VER_CHROMA_S_AVX2_12x32 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 +cglobal interp_4tap_vert_%1_12x32, 4, 9, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d - sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] @@ -21886,154 +17168,41 @@ cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m6, [pd_526336] + sub r0, r1 +%ifidn %1, sp + mova m9, [pd_526336] %else add r3d, r3d %endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movd xm1, [r0 + r4] - punpcklwd xm3, xm1 - lea r0, [r0 + 4 * r1] - movd xm2, [r0] - punpcklwd xm1, xm2 - punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] - vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] - movd xm1, [r0 + r1] - punpcklwd xm2, xm1 - movd xm5, [r0 + r1 * 2] - punpcklwd xm1, xm5 - punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] - vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] - pmaddwd m4, [r5] - pmaddwd m3, [r5 + 1 * 
mmsize] - paddd m4, m3 - movd xm1, [r0 + r4] - punpcklwd xm5, xm1 - lea r0, [r0 + 4 * r1] - movd xm3, [r0] - punpcklwd xm1, xm3 - punpcklqdq xm5, xm1 ; m5 = [12 11 11 10] - vinserti128 m2, m2, xm5, 1 ; m2 = [12 11 11 10 10 9 9 8] - movd xm1, [r0 + r1] - punpcklwd xm3, xm1 - movd xm7, [r0 + r1 * 2] - punpcklwd xm1, xm7 - punpcklqdq xm3, xm1 ; m3 = [14 13 13 12] - vinserti128 m5, m5, xm3, 1 ; m5 = [14 13 13 12 12 11 11 10] - pmaddwd m2, [r5] - pmaddwd m5, [r5 + 1 * mmsize] - paddd m2, m5 - movd xm5, [r0 + r4] - punpcklwd xm7, xm5 - lea r0, [r0 + 4 * r1] - movd xm1, [r0] - punpcklwd xm5, xm1 - punpcklqdq xm7, xm5 ; m7 = [16 15 15 14] - vinserti128 m3, m3, xm7, 1 ; m3 = [16 15 15 14 14 13 13 12] - movd xm5, [r0 + r1] - punpcklwd xm1, xm5 - movd xm8, [r0 + r1 * 2] - punpcklwd xm5, xm8 - punpcklqdq xm1, xm5 ; m1 = [18 17 17 16] - vinserti128 m7, m7, xm1, 1 ; m7 = [18 17 17 16 16 15 15 14] - pmaddwd m3, [r5] - pmaddwd m7, [r5 + 1 * mmsize] - paddd m3, m7 -%ifidn %1,sp - paddd m0, m6 - paddd m4, m6 - paddd m2, m6 - paddd m3, m6 - psrad m0, 12 - psrad m4, 12 - psrad m2, 12 - psrad m3, 12 + lea r6, [r3 * 3] +%rep 2 + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1, sp + add r2, 8 %else - psrad m0, 6 - psrad m4, 6 - psrad m2, 6 - psrad m3, 6 + add r2, 16 %endif - packssdw m0, m4 - packssdw m2, m3 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - vextracti128 xm2, m0, 1 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm2, 0 - pextrw [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 2 - pextrw [r2 + r3], xm0, 3 - pextrw [r2 + 2 * r3], xm2, 2 - pextrw [r2 + r4], xm2, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + 2 * r3], xm2, 4 - pextrw [r2 + r4], xm2, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 6 - pextrw [r2 + r3], xm0, 7 - pextrw [r2 + 2 * r3], xm2, 6 - pextrw [r2 + r4], xm2, 7 + add r0, 16 + mova m7, m9 + PROCESS_CHROMA_AVX2_W4_16R %1 + sub r0, 16 +%ifidn %1, sp + lea r2, [r2 + r3 * 4 - 8] %else 
- vextracti128 xm4, m0, 1 - vextracti128 xm3, m2, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + 2 * r3], xm4 - pextrd [r2 + r4], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + 2 * r3], xm4, 2 - pextrd [r2 + r4], xm4, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm2 - pextrd [r2 + r3], xm2, 1 - movd [r2 + 2 * r3], xm3 - pextrd [r2 + r4], xm3, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm2, 2 - pextrd [r2 + r3], xm2, 3 - pextrd [r2 + 2 * r3], xm3, 2 - pextrd [r2 + r4], xm3, 3 + lea r2, [r2 + r3 * 4 - 16] %endif +%endrep RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_2x16 sp - FILTER_VER_CHROMA_S_AVX2_2x16 ss + FILTER_VER_CHROMA_S_AVX2_12x32 sp + FILTER_VER_CHROMA_S_AVX2_12x32 ss -%macro FILTER_VER_CHROMA_S_AVX2_6x8 1 +%macro FILTER_VER_CHROMA_S_AVX2_16x12 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d @@ -22048,11 +17217,12 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp - mova m7, [pd_526336] + mova m8, [pd_526336] %else add r3d, r3d %endif - + lea r6, [r3 * 3] +%rep 2 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -22069,19 +17239,19 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] paddd m1, m5 + pmaddwd m3, [r5] %ifidn %1,sp - paddd m0, m7 - paddd m1, m7 + paddd m0, m8 + paddd m1, m8 psrad m0, 12 psrad m1, 12 %else @@ -22090,14 +17260,14 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 %endif packssdw m0, m1 - movu xm5, [r0 + r1] ; m5 = row 5 + movu xm5, [r7 + r1] ; m5 = row 5 
punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 + movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 @@ -22105,8 +17275,8 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp - paddd m2, m7 - paddd m3, m7 + paddd m2, m8 + paddd m3, m8 psrad m2, 12 psrad m3, 12 %else @@ -22114,42 +17284,36 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 psrad m3, 6 %endif packssdw m2, m3 - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm3, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm3, 1 - pmaddwd m3, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m3 - - lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 vextracti128 xm2, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r4], xm0, 3 - pextrw [r2 + r4 + 4], xm2, 6 -%else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm3 - pextrd [r2 + r4 + 8], xm3, 2 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 %endif - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 + lea r8, [r2 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 @@ -22157,8 +17321,8 @@ cglobal 
interp_4tap_vert_%1_6x8, 4, 6, 8 pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp - paddd m4, m7 - paddd m5, m7 + paddd m4, m8 + paddd m5, m8 psrad m4, 12 psrad m5, 12 %else @@ -22167,22 +17331,24 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 %endif packssdw m4, m5 - movu xm2, [r0 + r1] ; m2 = row 9 + movu xm2, [r7 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, xm5 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] %ifidn %1,sp - paddd m6, m7 - paddd m1, m7 + paddd m6, m8 + paddd m1, m8 psrad m6, 12 psrad m1, 12 %else @@ -22192,37 +17358,105 @@ cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 + vpermd m4, m3, m4 vextracti128 xm6, m4, 1 - movd [r2], xm4 - pextrw [r2 + 4], xm6, 0 - pextrd [r2 + r3], xm4, 1 - pextrw [r2 + r3 + 4], xm6, 2 - pextrd [r2 + r3 * 2], xm4, 2 - pextrw [r2 + r3 * 2 + 4], xm6, 4 - pextrd [r2 + r4], xm4, 3 - pextrw [r2 + r4 + 4], xm6, 6 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 %else - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r4], xm6 - vextracti128 xm5, m4, 1 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 - movd [r2 + 8], xm5 - pextrd [r2 + r3 + 8], xm5, 2 - movd [r2 + r3 * 2 + 8], xm1 - pextrd [r2 + r4 + 8], xm1, 2 + movu [r8], xm4 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm1 +%endif + lea r8, [r8 + r3 * 4] + + movu xm7, [r7 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + 
pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r7, [r7 + r1 * 4] + movu xm1, [r7] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r7 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 + movu xm2, [r7 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m7, m4 +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r8], xm0 + movhps [r8 + r3], xm0 + movq [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm5 + add r2, 8 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r8], xm0 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm5 + movu [r8 + r6], xm6 + add r2, 16 %endif + add r0, 16 +%endrep RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_6x8 sp - FILTER_VER_CHROMA_S_AVX2_6x8 ss + FILTER_VER_CHROMA_S_AVX2_16x12 sp + FILTER_VER_CHROMA_S_AVX2_16x12 ss -%macro FILTER_VER_CHROMA_S_AVX2_6x16 1 +%macro FILTER_VER_CHROMA_S_AVX2_8x12 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 +cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d @@ -22305,28 +17539,25 @@ cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 vextracti128 xm2, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], 
xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r6], xm0, 3 - pextrw [r2 + r6 + 4], xm2, 6 -%else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm3 - pextrd [r2 + r6 + 8], xm3, 2 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 %endif lea r2, [r2 + r3 * 4] + movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 @@ -22380,28 +17611,24 @@ cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 + vpermd m4, m3, m4 vextracti128 xm6, m4, 1 - movd [r2], xm4 - pextrw [r2 + 4], xm6, 0 - pextrd [r2 + r3], xm4, 1 - pextrw [r2 + r3 + 4], xm6, 2 - pextrd [r2 + r3 * 2], xm4, 2 - pextrw [r2 + r3 * 2 + 4], xm6, 4 - pextrd [r2 + r6], xm4, 3 - pextrw [r2 + r6 + 4], xm6, 6 -%else movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r6], xm6 - vextracti128 xm4, m4, 1 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 - movd [r2 + 8], xm4 - pextrd [r2 + r3 + 8], xm4, 2 - movd [r2 + r3 * 2 + 8], xm1 - pextrd [r2 + r6 + 8], xm1, 2 + movu [r2], xm4 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm1 %endif lea r2, [r2 + r3 * 4] + movu xm7, [r0 + r4] ; m7 = row 11 punpckhwd xm1, xm5, xm7 punpcklwd xm5, xm7 @@ -22432,16 +17659,14 @@ cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 punpckhwd xm2, xm1, xm4 punpcklwd xm1, xm4 vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m1, [r5] + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 movu xm2, [r0 + r1 * 2] ; m2 = row 14 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m7, m6 - pmaddwd m4, [r5] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m7, 
m4 %ifidn %1,sp paddd m5, m8 paddd m7, m8 @@ -22453,980 +17678,1344 @@ cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 %endif packssdw m5, m7 %ifidn %1,sp - packuswb m0, m5 - vextracti128 xm5, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm5, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm5, 2 - pextrd [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm5, 4 - pextrd [r2 + r6], xm0, 3 - pextrw [r2 + r6 + 4], xm5, 6 + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r2], xm0 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm5 + movu [r2 + r6], xm6 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x12 sp + FILTER_VER_CHROMA_S_AVX2_8x12 ss + +%macro FILTER_VER_CHROMA_S_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] %else + add r3d, r3d +%endif +%rep 2 + PROCESS_CHROMA_S_AVX2_W8_4R %1 + lea r6, [r3 * 3] +%ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 - vextracti128 xm0, m0, 1 - vextracti128 xm7, m5, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm7 - pextrd [r2 + r6 + 8], xm7, 2 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + add r2, 8 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + add r2, 16 %endif - lea r2, [r2 + r3 * 4] + lea r6, [4 * r1 - 16] + sub r0, r6 +%endrep + RET +%endmacro - movu xm6, [r0 + r4] ; m6 = row 15 - punpckhwd xm5, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m1, m5 + 
FILTER_VER_CHROMA_S_AVX2_16x4 sp + FILTER_VER_CHROMA_S_AVX2_16x4 ss + +%macro PROCESS_CHROMA_S_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 16 - punpckhwd xm5, xm6, xm0 - punpcklwd xm6, xm0 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m6, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] %ifidn %1,sp - paddd m1, m8 - paddd m4, m8 + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 psrad m1, 12 - psrad m4, 12 %else + psrad m0, 6 psrad m1, 6 - psrad m4, 6 %endif - packssdw m1, m4 + packssdw m0, m1 - movu xm5, [r0 + r1] ; m5 = row 17 - punpckhwd xm4, xm0, xm5 - punpcklwd xm0, xm5 - vinserti128 m0, m0, xm4, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m2, m0 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm0, xm5, xm4 - punpcklwd xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m6, m5 + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 %ifidn %1,sp - paddd m2, m8 - paddd m6, m8 + paddd m2, m7 + paddd m3, m7 psrad m2, 12 - psrad m6, 12 + psrad m3, 12 %else 
psrad m2, 6 - psrad m6, 6 + psrad m3, 6 %endif - packssdw m2, m6 + packssdw m2, m3 %ifidn %1,sp - packuswb m1, m2 - vextracti128 xm2, m1, 1 - movd [r2], xm1 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm1, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm1, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r6], xm1, 3 - pextrw [r2 + r6 + 4], xm2, 6 -%else - movq [r2], xm1 - movhps [r2 + r3], xm1 + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 - vextracti128 xm4, m1, 1 - vextracti128 xm6, m2, 1 - movd [r2 + 8], xm4 - pextrd [r2 + r3 + 8], xm4, 2 - movd [r2 + r3 * 2 + 8], xm6 - pextrd [r2 + r6 + 8], xm6, 2 -%endif - RET +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 %endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_6x16 sp - FILTER_VER_CHROMA_S_AVX2_6x16 ss - -;--------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W2_4R 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + lea r8, [r2 + r3 * 4] -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 
1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 %else - lea r5, [tab_ChromaCoeffV + r4] + psrad m4, 6 + psrad m5, 6 %endif + packssdw m4, m5 - mov r4d, (%2/4) - -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 - - psrad m0, 6 - psrad m2, 6 - - packssdw m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SS_W2_4R 2, 4 - FILTER_VER_CHROMA_SS_W2_4R 2, 8 - - FILTER_VER_CHROMA_SS_W2_4R 2, 16 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + movu xm2, [r7 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 %else - lea r5, [tab_ChromaCoeffV + r4] + psrad m6, 6 + psrad m1, 6 %endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm1 +%endif 
+%endmacro - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 - - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - psrad m0, 6 - - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1=[1+2+3+4] Row2 done - psrad m1, 6 - - packssdw m0, m1 - - movlps [r2], m0 - movhps [r2 + r3], m0 - - RET - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 +%macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] + lea r5, [pw_ChromaCoeffV] + add r5, r4 %else - lea r6, [tab_ChromaCoeffV + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif - mov r4d, %2/4 - -.loopH: - PROCESS_CHROMA_SP_W4_4R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 - - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - PROCESS_CHROMA_SP_W2_4R r6 - - psrad m0, 6 - psrad m2, 6 - - packssdw m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 2 * 4] - - dec r4d - 
jnz .loopH - + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep %2 / 8 + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 +%endrep RET +%endif %endmacro - FILTER_VER_CHROMA_SS_W6_H4 6, 8 - - FILTER_VER_CHROMA_SS_W6_H4 6, 16 - - -;---------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;---------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 + FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 +%macro FILTER_VER_CHROMA_S_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [pw_ChromaCoeffV] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif - mov r4d, %2/2 -.loopH: - PROCESS_CHROMA_SP_W8_2R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 - - movu [r2], m0 - movu [r2 + r3], m2 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SS_W8_H2 8, 2 - FILTER_VER_CHROMA_SS_W8_H2 8, 4 - FILTER_VER_CHROMA_SS_W8_H2 8, 6 - FILTER_VER_CHROMA_SS_W8_H2 8, 8 - FILTER_VER_CHROMA_SS_W8_H2 8, 16 - FILTER_VER_CHROMA_SS_W8_H2 8, 32 + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m5, [pd_526336] +%else + add r3d, r3d +%endif - FILTER_VER_CHROMA_SS_W8_H2 8, 12 - FILTER_VER_CHROMA_SS_W8_H2 8, 64 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + 
r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r0 + r1 * 4] ; m4 = row 4 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m1, m3 +%ifidn %1,sp + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 +%ifidn %1,sp + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 +%else + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 +%endif + RET +%endmacro -;----------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SS 2 -INIT_XMM sse2 -cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize + FILTER_VER_CHROMA_S_AVX2_8x2 sp + FILTER_VER_CHROMA_S_AVX2_8x2 ss - add r1d, r1d - add r3d, r3d - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 +%macro FILTER_VER_CHROMA_S_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] + lea r5, [pw_ChromaCoeffV] + add r5, r4 %else - lea r6, [tab_LumaCoeffV + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd 
m0, [r6 + 0 *16] ;m0=[0+1] Row1 + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 
1 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m4, m6 + movu xm6, [r0 + r1 * 4] ; m6 = row 8 + punpckhwd xm3, xm1, xm6 + punpcklwd xm1, xm6 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%endif + RET +%endmacro - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + FILTER_VER_CHROMA_S_AVX2_8x6 sp + FILTER_VER_CHROMA_S_AVX2_8x6 ss - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 +%macro FILTER_VER_CHROMA_S_AVX2_8xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end - psrad m0, 6 +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, 
r1 +%ifidn %1,sp + mova m8, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end - psrad m1, 6 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu 
[r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r2, [r2 + r3 * 4] - packssdw m0, m1 + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 - movlps [r2], m0 - movhps [r2 + r3], m0 + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end - psrad m2, 6 +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm1 +%endif + lea r2, [r2 + r3 * 4] - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end - psrad m3, 6 + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + 
pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 - packssdw m2, m3 + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m1, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m7, m6 + pmaddwd m4, [r5] +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r2], xm0 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm5 + movu [r2 + r6], xm6 +%endif + lea r2, [r2 + r3 * 4] - movlps [r2 + 2 * r3], m2 - lea r5, [3 * r3] - movhps [r2 + r5], m2 + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 16 + punpckhwd xm5, xm6, xm0 + punpcklwd xm6, xm0 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m6, [r5] +%ifidn %1,sp + paddd m1, m8 + paddd m4, m8 + psrad m1, 12 + psrad m4, 12 +%else + psrad m1, 6 + psrad m4, 6 +%endif + packssdw m1, m4 - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 + movu xm5, [r0 + r1] ; m5 = row 17 + punpckhwd xm4, xm0, xm5 + punpcklwd xm0, 
xm5 + vinserti128 m0, m0, xm4, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m2, m0 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm0, xm5, xm4 + punpcklwd xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m6, m5 +%ifidn %1,sp + paddd m2, m8 + paddd m6, m8 + psrad m2, 12 + psrad m6, 12 +%else + psrad m2, 6 + psrad m6, 6 +%endif + packssdw m2, m6 +%ifidn %1,sp + packuswb m1, m2 + vpermd m1, m3, m1 + vextracti128 xm2, m1, 1 + movq [r2], xm1 + movhps [r2 + r3], xm1 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m1, m1, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm6, m1, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm1 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm4 +%endif + lea r2, [r2 + r3 * 4] +%endrep + RET +%endif +%endmacro - dec r4d - jnz .loopW + FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 + FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 + FILTER_VER_CHROMA_S_AVX2_8xN sp, 64 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 64 - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] +%macro FILTER_VER_CHROMA_S_AVX2_Nx24 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d - dec dword [rsp] - jnz .loopH +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW +%ifidn %1,sp + lea r2, [r8 + r3 * 4 - %2 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %2 + 16] +%endif + lea r0, [r7 - 2 * %2 + 16] + mova m7, m9 + mov r9d, %2 / 8 +.loop: + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loop RET +%endif 
%endmacro - FILTER_VER_LUMA_SS 4, 4 - FILTER_VER_LUMA_SS 8, 8 - FILTER_VER_LUMA_SS 8, 4 - FILTER_VER_LUMA_SS 4, 8 - FILTER_VER_LUMA_SS 16, 16 - FILTER_VER_LUMA_SS 16, 8 - FILTER_VER_LUMA_SS 8, 16 - FILTER_VER_LUMA_SS 16, 12 - FILTER_VER_LUMA_SS 12, 16 - FILTER_VER_LUMA_SS 16, 4 - FILTER_VER_LUMA_SS 4, 16 - FILTER_VER_LUMA_SS 32, 32 - FILTER_VER_LUMA_SS 32, 16 - FILTER_VER_LUMA_SS 16, 32 - FILTER_VER_LUMA_SS 32, 24 - FILTER_VER_LUMA_SS 24, 32 - FILTER_VER_LUMA_SS 32, 8 - FILTER_VER_LUMA_SS 8, 32 - FILTER_VER_LUMA_SS 64, 64 - FILTER_VER_LUMA_SS 64, 32 - FILTER_VER_LUMA_SS 32, 64 - FILTER_VER_LUMA_SS 64, 48 - FILTER_VER_LUMA_SS 48, 64 - FILTER_VER_LUMA_SS 64, 16 - FILTER_VER_LUMA_SS 16, 64 + FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16 -%macro FILTER_VER_LUMA_AVX2_4x4 1 +%macro FILTER_VER_CHROMA_S_AVX2_2x8 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 mov r4d, r4m + shl r4d, 6 add r1d, r1d - shl r4d, 7 + sub r0, r1 %ifdef PIC - lea r5, [pw_LumaCoeffVer] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [pw_LumaCoeffVer + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 - %ifidn %1,sp mova m6, [pd_526336] %else add r3d, r3d %endif - - movq xm0, [r0] - movq xm1, [r0 + r1] + movd xm0, [r0] + movd xm1, [r0 + r1] punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] + movd xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] - movq xm4, [r0] + movd xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 
4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - pmaddwd m4, [r5 + 1 * mmsize] - paddd m0, m5 - paddd m2, m4 - movq xm3, [r0 + r4] + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 3 * mmsize] - pmaddwd m1, [r5 + 2 * mmsize] - paddd m0, m5 - paddd m2, m1 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + 2 * r1] + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movd xm1, [r0 + r4] punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] - pmaddwd m4, [r5 + 3 * mmsize] - paddd m2, m4 - + lea r0, [r0 + 4 * r1] + movd xm2, [r0] + punpcklwd xm1, xm2 + punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] + vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] + movd xm1, [r0 + r1] + punpcklwd xm2, xm1 + movd xm5, [r0 + r1 * 2] + punpcklwd xm1, xm5 + punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] + vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] + pmaddwd m4, [r5] + pmaddwd m3, [r5 + 1 * mmsize] + paddd m4, m3 %ifidn %1,sp paddd m0, m6 - paddd m2, m6 + paddd m4, m6 psrad m0, 12 - psrad m2, 12 + psrad m4, 12 %else psrad m0, 6 - psrad m2, 6 + psrad m4, 6 %endif - packssdw m0, m2 - vextracti128 xm2, m0, 1 + packssdw m0, m4 + vextracti128 xm4, m0, 1 lea r4, [r3 * 3] - %ifidn %1,sp - packuswb xm0, xm2 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r4], xm0, 3 + packuswb xm0, xm4 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 4 + pextrw [r2 + r4], xm0, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 2 + pextrw [r2 + r3], xm0, 3 + pextrw [r2 + 2 * r3], xm0, 6 + pextrw [r2 + r4], xm0, 7 %else - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 
2], xm0 - movhps [r2 + r4], xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + 2 * r3], xm4 + pextrd [r2 + r4], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + 2 * r3], xm4, 2 + pextrd [r2 + r4], xm4, 3 %endif RET %endmacro - FILTER_VER_LUMA_AVX2_4x4 sp - FILTER_VER_LUMA_AVX2_4x4 ss + FILTER_VER_CHROMA_S_AVX2_2x8 sp + FILTER_VER_CHROMA_S_AVX2_2x8 ss -%macro FILTER_VER_LUMA_AVX2_4x8 1 +%macro FILTER_VER_CHROMA_S_AVX2_2x16 1 +%if ARCH_X86_64 == 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 +cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 add r1d, r1d + sub r0, r1 %ifdef PIC - lea r5, [pw_LumaCoeffVer] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [pw_LumaCoeffVer + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif - lea r4, [r1 * 3] - sub r0, r4 - + lea r4, [r1 * 3] %ifidn %1,sp - mova m7, [pd_526336] + mova m6, [pd_526336] %else add r3d, r3d %endif - lea r6, [r3 * 3] - - movq xm0, [r0] - movq xm1, [r0 + r1] + movd xm0, [r0] + movd xm1, [r0 + r1] punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] + movd xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] - movq xm4, [r0] + movd xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movd xm1, [r0 + r4] punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = 
[6 5 5 4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - paddd m0, m5 - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 + lea r0, [r0 + 4 * r1] + movd xm2, [r0] + punpcklwd xm1, xm2 + punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] + vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] + movd xm1, [r0 + r1] + punpcklwd xm2, xm1 + movd xm5, [r0 + r1 * 2] + punpcklwd xm1, xm5 + punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] + vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m4, m3 + movd xm1, [r0 + r4] + punpcklwd xm5, xm1 lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 3 * mmsize] - paddd m0, m5 - pmaddwd m5, m1, [r5 + 2 * mmsize] + movd xm3, [r0] + punpcklwd xm1, xm3 + punpcklqdq xm5, xm1 ; m5 = [12 11 11 10] + vinserti128 m2, m2, xm5, 1 ; m2 = [12 11 11 10 10 9 9 8] + movd xm1, [r0 + r1] + punpcklwd xm3, xm1 + movd xm7, [r0 + r1 * 2] + punpcklwd xm1, xm7 + punpcklqdq xm3, xm1 ; m3 = [14 13 13 12] + vinserti128 m5, m5, xm3, 1 ; m5 = [14 13 13 12 12 11 11 10] + pmaddwd m2, [r5] + pmaddwd m5, [r5 + 1 * mmsize] paddd m2, m5 - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] - pmaddwd m3, m6, [r5 + 3 * mmsize] - paddd m2, m3 - pmaddwd m3, m6, [r5 + 2 * mmsize] - paddd m4, m3 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - + movd xm5, [r0 + r4] + punpcklwd xm7, xm5 + lea r0, [r0 + 4 * r1] + movd xm1, [r0] + punpcklwd xm5, xm1 + punpcklqdq xm7, xm5 ; m7 = [16 15 15 14] + vinserti128 m3, m3, xm7, 1 ; m3 = [16 15 15 14 14 13 13 12] + movd xm5, [r0 + r1] + punpcklwd xm1, xm5 + movd xm8, [r0 + r1 * 2] + punpcklwd xm5, xm8 + punpcklqdq xm1, xm5 ; m1 = [18 17 17 16] + vinserti128 m7, m7, xm1, 1 ; m7 = [18 17 17 16 16 15 15 14] + pmaddwd m3, [r5] + pmaddwd m7, [r5 + 1 * 
mmsize] + paddd m3, m7 %ifidn %1,sp - paddd m0, m7 - paddd m2, m7 + paddd m0, m6 + paddd m4, m6 + paddd m2, m6 + paddd m3, m6 psrad m0, 12 + psrad m4, 12 psrad m2, 12 + psrad m3, 12 %else psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 - - movq xm3, [r0 + r4] - punpcklwd xm5, xm3 - lea r0, [r0 + 4 * r1] - movq xm2, [r0] - punpcklwd xm3, xm2 - vinserti128 m5, m5, xm3, 1 ; m5 = [C B B A] - pmaddwd m3, m5, [r5 + 3 * mmsize] - paddd m4, m3 - pmaddwd m5, [r5 + 2 * mmsize] - paddd m1, m5 - movq xm3, [r0 + r1] - punpcklwd xm2, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m2, m2, xm3, 1 ; m2 = [E D D C] - pmaddwd m2, [r5 + 3 * mmsize] - paddd m1, m2 - -%ifidn %1,sp - paddd m4, m7 - paddd m1, m7 - psrad m4, 12 - psrad m1, 12 -%else psrad m4, 6 - psrad m1, 6 + psrad m2, 6 + psrad m3, 6 %endif - packssdw m4, m1 - + packssdw m0, m4 + packssdw m2, m3 + lea r4, [r3 * 3] %ifidn %1,sp - packuswb m0, m4 + packuswb m0, m2 vextracti128 xm2, m0, 1 - movd [r2], xm0 - movd [r2 + r3], xm2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r6], xm2, 1 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm2, 0 + pextrw [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 3 - pextrd [r2 + r6], xm2, 3 + pextrw [r2], xm0, 2 + pextrw [r2 + r3], xm0, 3 + pextrw [r2 + 2 * r3], xm2, 2 + pextrw [r2 + r4], xm2, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + 2 * r3], xm2, 4 + pextrw [r2 + r4], xm2, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 6 + pextrw [r2 + r3], xm0, 7 + pextrw [r2 + 2 * r3], xm2, 6 + pextrw [r2 + r4], xm2, 7 %else - vextracti128 xm2, m0, 1 - vextracti128 xm1, m4, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 + vextracti128 xm4, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + 2 * r3], xm4 + pextrd [r2 + r4], xm4, 1 lea r2, [r2 + r3 * 4] - movq 
[r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 -%endif - RET -%endmacro - - FILTER_VER_LUMA_AVX2_4x8 sp - FILTER_VER_LUMA_AVX2_4x8 ss - -%macro PROCESS_LUMA_AVX2_W4_16R 1 - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - paddd m0, m5 - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 3 * mmsize] - paddd m0, m5 - pmaddwd m5, m1, [r5 + 2 * mmsize] - paddd m2, m5 - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] - pmaddwd m3, m6, [r5 + 3 * mmsize] - paddd m2, m3 - pmaddwd m3, m6, [r5 + 2 * mmsize] - paddd m4, m3 - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m1, m3 - pmaddwd m6, [r5] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + 2 * r3], xm4, 2 + pextrd [r2 + r4], xm4, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm2 + pextrd [r2 + r3], xm2, 1 + movd [r2 + 2 * r3], xm3 + pextrd [r2 + r4], xm3, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm2, 2 + pextrd [r2 + r3], xm2, 3 + pextrd [r2 + 2 * r3], xm3, 2 + pextrd [r2 + r4], xm3, 3 +%endif + RET +%endif +%endmacro -%ifidn %1,sp - paddd m0, m7 - paddd m2, m7 - psrad m0, 12 - psrad m2, 12 + FILTER_VER_CHROMA_S_AVX2_2x16 sp + 
FILTER_VER_CHROMA_S_AVX2_2x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_6x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 %else - psrad m0, 6 - psrad m2, 6 + lea r5, [pw_ChromaCoeffV + r4] %endif - packssdw m0, m2 - vextracti128 xm2, m0, 1 + + lea r4, [r1 * 3] + sub r0, r1 %ifidn %1,sp - packuswb xm0, xm2 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r6], xm0, 3 + mova m7, [pd_526336] %else - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 + add r3d, r3d %endif - movq xm2, [r0 + r4] - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] - pmaddwd m2, m5, [r5 + 3 * mmsize] - paddd m4, m2 - pmaddwd m2, m5, [r5 + 2 * mmsize] - paddd m1, m2 - pmaddwd m2, m5, [r5 + 1 * mmsize] - paddd m6, m2 - pmaddwd m5, [r5] - movq xm2, [r0 + r1] - punpcklwd xm0, xm2 - movq xm3, [r0 + 2 * r1] - punpcklwd xm2, xm3 - vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m1, m2 - pmaddwd m2, m0, [r5 + 2 * mmsize] - paddd m6, m2 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m5, m2 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] - + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 %ifidn %1,sp - paddd m4, m7 + paddd m0, m7 
paddd m1, m7 - psrad m4, 12 + psrad m0, 12 psrad m1, 12 %else - psrad m4, 6 + psrad m0, 6 psrad m1, 6 %endif - packssdw m4, m1 - vextracti128 xm1, m4, 1 - lea r2, [r2 + r3 * 4] -%ifidn %1,sp - packuswb xm4, xm1 - movd [r2], xm4 - pextrd [r2 + r3], xm4, 2 - pextrd [r2 + r3 * 2], xm4, 1 - pextrd [r2 + r6], xm4, 3 -%else - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 -%endif + packssdw m0, m1 - movq xm4, [r0 + r4] - punpcklwd xm3, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] - punpcklwd xm4, xm1 - vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] - pmaddwd m4, m3, [r5 + 3 * mmsize] - paddd m6, m4 - pmaddwd m4, m3, [r5 + 2 * mmsize] - paddd m5, m4 - pmaddwd m4, m3, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m3, [r5] - movq xm4, [r0 + r1] - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] - pmaddwd m4, m1, [r5 + 3 * mmsize] - paddd m5, m4 - pmaddwd m4, m1, [r5 + 2 * mmsize] - paddd m0, m4 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m3, m1 - movq xm4, [r0 + r4] - punpcklwd xm2, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] - punpcklwd xm4, xm1 - vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5 + 2 * mmsize] - paddd m3, m2 - movq xm4, [r0 + r1] - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [22 21 21 20] - pmaddwd m1, [r5 + 3 * mmsize] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] paddd m3, m1 - %ifidn %1,sp - paddd m6, m7 - paddd m5, m7 - paddd m0, m7 + paddd m2, m7 paddd m3, m7 - psrad m6, 12 - psrad m5, 12 - psrad m0, 12 + psrad m2, 12 psrad m3, 
12 %else - psrad m6, 6 - psrad m5, 6 - psrad m0, 6 + psrad m2, 6 psrad m3, 6 %endif - packssdw m6, m5 - packssdw m0, m3 - lea r2, [r2 + r3 * 4] + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 + lea r4, [r3 * 3] %ifidn %1,sp - packuswb m6, m0 - vextracti128 xm0, m6, 1 - movd [r2], xm6 - movd [r2 + r3], xm0 - pextrd [r2 + r3 * 2], xm6, 1 - pextrd [r2 + r6], xm0, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm6, 2 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm6, 3 - pextrd [r2 + r6], xm0, 3 + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r4], xm0, 3 + pextrw [r2 + r4 + 4], xm2, 6 %else - vextracti128 xm5, m6, 1 - vextracti128 xm3, m0, 1 - movq [r2], xm6 - movq [r2 + r3], xm5 - movhps [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm5 - lea r2, [r2 + r3 * 4] movq [r2], xm0 - movq [r2 + r3], xm3 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm3 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm3 + pextrd [r2 + r4 + 8], xm3, 2 +%endif + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 %endif -%endmacro + packssdw m4, m5 -%macro FILTER_VER_LUMA_AVX2_4x16 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + 
punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 -%ifdef PIC - lea r5, [pw_LumaCoeffVer] - add r5, r4 +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 %else - lea r5, [pw_LumaCoeffVer + r4] + psrad m6, 6 + psrad m1, 6 %endif - - lea r4, [r1 * 3] - sub r0, r4 + packssdw m6, m1 %ifidn %1,sp - mova m7, [pd_526336] + packuswb m4, m6 + vextracti128 xm6, m4, 1 + movd [r2], xm4 + pextrw [r2 + 4], xm6, 0 + pextrd [r2 + r3], xm4, 1 + pextrw [r2 + r3 + 4], xm6, 2 + pextrd [r2 + r3 * 2], xm4, 2 + pextrw [r2 + r3 * 2 + 4], xm6, 4 + pextrd [r2 + r4], xm4, 3 + pextrw [r2 + r4 + 4], xm6, 6 %else - add r3d, r3d + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movd [r2 + 8], xm5 + pextrd [r2 + r3 + 8], xm5, 2 + movd [r2 + r3 * 2 + 8], xm1 + pextrd [r2 + r4 + 8], xm1, 2 %endif - lea r6, [r3 * 3] - PROCESS_LUMA_AVX2_W4_16R %1 RET %endmacro - FILTER_VER_LUMA_AVX2_4x16 sp - FILTER_VER_LUMA_AVX2_4x16 ss + FILTER_VER_CHROMA_S_AVX2_6x8 sp + FILTER_VER_CHROMA_S_AVX2_6x8 ss -%macro FILTER_VER_LUMA_S_AVX2_8x8 1 -INIT_YMM avx2 +%macro FILTER_VER_CHROMA_S_AVX2_6x16 1 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 add r1d, r1d %ifdef PIC - lea r5, [pw_LumaCoeffVer] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [pw_LumaCoeffVer + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 - + sub r0, r1 %ifidn %1,sp - mova m11, [pd_526336] + mova m8, [pd_526336] %else add r3d, r3d %endif - + lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -23443,687 +19032,763 @@ cglobal 
interp_8tap_vert_%1_8x8, 4, 6, 12 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] paddd m0, m4 + pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 + punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 2 * mmsize] - paddd m1, m7 - pmaddwd m7, m5, [r5 + 1 * mmsize] + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] - paddd m3, m7 - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 3 * mmsize] - paddd m0, m8 - pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m3, m1 +%ifidn %1,sp paddd m2, m8 - pmaddwd m8, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m8 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 3 * mmsize] - paddd m1, m9 - pmaddwd m9, m7, [r5 + 2 * mmsize] - paddd m3, m9 - pmaddwd m9, m7, [r5 + 1 * mmsize] - pmaddwd m7, [r5] - paddd m5, m9 - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m2, m10 - pmaddwd m10, m8, [r5 + 2 * mmsize] - pmaddwd m8, [r5 + 1 * mmsize] - paddd m4, m10 - paddd m6, m8 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm8, xm9, xm10 - 
punpcklwd xm9, xm10 - vinserti128 m9, m9, xm8, 1 - pmaddwd m8, m9, [r5 + 3 * mmsize] paddd m3, m8 - pmaddwd m8, m9, [r5 + 2 * mmsize] - pmaddwd m9, [r5 + 1 * mmsize] - paddd m5, m8 - paddd m7, m9 - movu xm8, [r0 + r4] ; m8 = row 11 - punpckhwd xm9, xm10, xm8 - punpcklwd xm10, xm8 - vinserti128 m10, m10, xm9, 1 - pmaddwd m9, m10, [r5 + 3 * mmsize] - pmaddwd m10, [r5 + 2 * mmsize] - paddd m4, m9 - paddd m6, m10 - - lea r4, [r3 * 3] -%ifidn %1,sp - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 - psrad m0, 12 - psrad m1, 12 psrad m2, 12 psrad m3, 12 %else - psrad m0, 6 - psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif - packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 - mova m1, [interp8_hps_shuf] - vpermd m0, m1, m0 vextracti128 xm2, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r6], xm0, 3 + pextrw [r2 + r6 + 4], xm2, 6 +%else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 + movhps [r2 + r6], xm2 + vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm3 + pextrd [r2 + r6 + 8], xm3, 2 %endif - + lea r2, [r2 + r3 * 4] + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 lea r0, [r0 + r1 * 4] - movu xm9, [r0] ; m9 = row 12 - punpckhwd xm3, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm3, 1 - pmaddwd m3, m8, [r5 + 3 * mmsize] - pmaddwd m8, [r5 + 2 * mmsize] - paddd m5, m3 - paddd m7, m8 - movu xm3, [r0 + r1] ; m3 = row 13 - punpckhwd xm0, xm9, xm3 - punpcklwd xm9, xm3 - vinserti128 m9, m9, xm0, 
1 - pmaddwd m9, [r5 + 3 * mmsize] - paddd m6, m9 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm9, xm3, xm0 - punpcklwd xm3, xm0 - vinserti128 m3, m3, xm9, 1 - pmaddwd m3, [r5 + 3 * mmsize] - paddd m7, m3 + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] %ifidn %1,sp - paddd m4, m11 - paddd m5, m11 - paddd m6, m11 - paddd m7, m11 - psrad m4, 12 - psrad m5, 12 + paddd m6, m8 + paddd m1, m8 psrad m6, 12 - psrad m7, 12 + psrad m1, 12 %else - psrad m4, 6 - psrad m5, 6 psrad m6, 6 - psrad m7, 6 + psrad m1, 6 %endif - packssdw m4, m5 - packssdw m6, m7 - lea r2, [r2 + r3 * 4] + packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 - vpermd m4, m1, m4 vextracti128 xm6, m4, 1 + movd [r2], xm4 + pextrw [r2 + 4], xm6, 0 + pextrd [r2 + r3], xm4, 1 + pextrw [r2 + r3 + 4], xm6, 2 + pextrd [r2 + r3 * 2], xm4, 2 + pextrw [r2 + r3 * 2 + 4], xm6, 4 + pextrd [r2 + r6], xm4, 3 + pextrw [r2 + r6 + 4], xm6, 6 +%else movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 - movhps [r2 + r4], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm7 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_S_AVX2_8x8 sp - FILTER_VER_LUMA_S_AVX2_8x8 ss - -%macro FILTER_VER_LUMA_S_AVX2_8xN 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 
-cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_LumaCoeffVer] - add r5, r4 -%else - lea r5, [pw_LumaCoeffVer + r4] + movhps [r2 + r6], xm6 + vextracti128 xm4, m4, 1 + vextracti128 xm1, m6, 1 + movd [r2 + 8], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + movd [r2 + r3 * 2 + 8], xm1 + pextrd [r2 + r6 + 8], xm1, 2 %endif - - lea r4, [r1 * 3] - sub r0, r4 + lea r2, [r2 + r3 * 4] + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] %ifidn %1,sp - mova m14, [pd_526336] + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 %else - add r3d, r3d + psrad m0, 6 + psrad m2, 6 %endif - lea r6, [r3 * 3] - lea r7, [r1 * 4] - mov r8d, %2 / 16 -.loopH: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 + packssdw m0, m2 + + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + paddd m5, m2 pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 + movu xm2, [r0 + r1 * 
2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 + paddd m7, m6 pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 2 * mmsize] - paddd m1, m7 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 3 * mmsize] - paddd m0, m8 - pmaddwd m8, m6, [r5 + 2 * mmsize] - paddd m2, m8 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 3 * mmsize] - paddd m1, m9 - pmaddwd m9, m7, [r5 + 2 * mmsize] - paddd m3, m9 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m2, m10 - pmaddwd m10, m8, [r5 + 2 * mmsize] - paddd m4, m10 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 3 * mmsize] - paddd m3, m11 - pmaddwd m11, m9, [r5 + 2 * mmsize] - paddd m5, m11 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 3 * mmsize] - paddd m4, m12 - pmaddwd m12, m10, [r5 + 2 * mmsize] - paddd m6, m12 - pmaddwd m12, m10, [r5 + 1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - 
punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 3 * mmsize] - paddd m5, m13 - pmaddwd m13, m11, [r5 + 2 * mmsize] - paddd m7, m13 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vextracti128 xm5, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm5, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm5, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm5, 4 + pextrd [r2 + r6], xm0, 3 + pextrw [r2 + r6 + 4], xm5, 6 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 + vextracti128 xm0, m0, 1 + vextracti128 xm7, m5, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm7 + pextrd [r2 + r6 + 8], xm7, 2 +%endif + lea r2, [r2 + r3 * 4] + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 16 + punpckhwd xm5, xm6, xm0 + punpcklwd xm6, xm0 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m6, [r5] %ifidn %1,sp - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 - psrad m0, 12 + paddd m1, m8 + paddd m4, m8 psrad m1, 12 - psrad m2, 12 - psrad m3, 12 psrad m4, 12 - psrad m5, 12 %else - psrad m0, 6 psrad m1, 6 - psrad m2, 6 - psrad m3, 6 psrad m4, 6 - psrad m5, 6 %endif - packssdw m0, m1 - packssdw m2, m3 - packssdw m4, m5 + packssdw m1, m4 + + movu xm5, [r0 + r1] ; m5 = row 17 + punpckhwd xm4, xm0, xm5 + punpcklwd xm0, xm5 + vinserti128 m0, m0, xm4, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m2, m0 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm0, xm5, xm4 + punpcklwd xm5, xm4 + 
vinserti128 m5, m5, xm0, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m6, m5 %ifidn %1,sp - packuswb m0, m2 - mova m1, [interp8_hps_shuf] - vpermd m0, m1, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 + paddd m2, m8 + paddd m6, m8 + psrad m2, 12 + psrad m6, 12 +%else + psrad m2, 6 + psrad m6, 6 +%endif + packssdw m2, m6 +%ifidn %1,sp + packuswb m1, m2 + vextracti128 xm2, m1, 1 + movd [r2], xm1 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm1, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r6], xm1, 3 + pextrw [r2 + r6 + 4], xm2, 6 +%else + movq [r2], xm1 + movhps [r2 + r3], xm1 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m2, 1 + movd [r2 + 8], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + movd [r2 + r3 * 2 + 8], xm6 + pextrd [r2 + r6 + 8], xm6, 2 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_6x16 sp + FILTER_VER_CHROMA_S_AVX2_6x16 ss + +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd 
[r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS_W2_4R 2, 4 + FILTER_VER_CHROMA_SS_W2_4R 2, 8 + + FILTER_VER_CHROMA_SS_W2_4R 2, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + psrad m0, 6 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + psrad m1, 6 + + packssdw m0, m1 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] %endif - movu xm13, [r0 + r1] ; m13 
= row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m0, m12, [r5 + 3 * mmsize] - paddd m6, m0 - pmaddwd m0, m12, [r5 + 2 * mmsize] - paddd m8, m0 - pmaddwd m0, m12, [r5 + 1 * mmsize] - paddd m10, m0 - pmaddwd m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm2, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm2, 1 - pmaddwd m2, m13, [r5 + 3 * mmsize] - paddd m7, m2 - pmaddwd m2, m13, [r5 + 2 * mmsize] - paddd m9, m2 - pmaddwd m2, m13, [r5 + 1 * mmsize] - paddd m11, m2 - pmaddwd m13, [r5] + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH + + RET +%endmacro -%ifidn %1,sp - paddd m6, m14 - paddd m7, m14 - psrad m6, 12 - psrad m7, 12 + FILTER_VER_CHROMA_SS_W6_H4 6, 8 + + FILTER_VER_CHROMA_SS_W6_H4 6, 16 + + +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - psrad m6, 6 - psrad m7, 6 + lea r5, [tab_ChromaCoeffV + r4] %endif - packssdw m6, m7 - lea r2, [r2 + r3 * 4] 
-%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m1, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movu [r2], m0 + movu [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS_W8_H2 8, 2 + FILTER_VER_CHROMA_SS_W8_H2 8, 4 + FILTER_VER_CHROMA_SS_W8_H2 8, 6 + FILTER_VER_CHROMA_SS_W8_H2 8, 8 + FILTER_VER_CHROMA_SS_W8_H2 8, 16 + FILTER_VER_CHROMA_SS_W8_H2 8, 32 + + FILTER_VER_CHROMA_SS_W8_H2 8, 12 + FILTER_VER_CHROMA_SS_W8_H2 8, 64 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SS 2 +INIT_XMM sse2 +cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] %else - vpermq m6, m6, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m4, 1 - vextracti128 xm7, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 + lea r6, [tab_LumaCoeffV + r4] %endif - movu xm6, [r0 + r4] ; m6 = row 15 - punpckhwd xm5, xm0, xm6 - punpcklwd xm0, xm6 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 3 * mmsize] - paddd m8, m5 - pmaddwd m5, m0, [r5 + 2 * mmsize] - paddd m10, m5 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m12, m5 - pmaddwd m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhwd xm3, xm6, xm2 - punpcklwd xm6, xm2 - vinserti128 m6, m6, xm3, 1 - pmaddwd m3, m6, [r5 + 3 * mmsize] - paddd m9, m3 - 
pmaddwd m3, m6, [r5 + 2 * mmsize] - paddd m11, m3 - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m13, m3 - pmaddwd m6, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m10, m4 - pmaddwd m4, m2, [r5 + 2 * mmsize] - paddd m12, m4 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddwd m2, m3, [r5 + 3 * mmsize] - paddd m11, m2 - pmaddwd m2, m3, [r5 + 2 * mmsize] - paddd m13, m2 - pmaddwd m3, [r5 + 1 * mmsize] - paddd m6, m3 - movu xm2, [r0 + r4] ; m2 = row 19 - punpckhwd xm7, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm7, 1 - pmaddwd m7, m4, [r5 + 3 * mmsize] - paddd m12, m7 - pmaddwd m4, [r5 + 2 * mmsize] - paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm7, [r0] ; m7 = row 20 - punpckhwd xm3, xm2, xm7 - punpcklwd xm2, xm7 - vinserti128 m2, m2, xm3, 1 - pmaddwd m3, m2, [r5 + 3 * mmsize] - paddd m13, m3 - pmaddwd m2, [r5 + 2 * mmsize] - paddd m6, m2 - movu xm3, [r0 + r1] ; m3 = row 21 - punpckhwd xm2, xm7, xm3 - punpcklwd xm7, xm3 - vinserti128 m7, m7, xm2, 1 - pmaddwd m7, [r5 + 3 * mmsize] - paddd m0, m7 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhwd xm7, xm3, xm2 - punpcklwd xm3, xm2 - vinserti128 m3, m3, xm7, 1 - pmaddwd m3, [r5 + 3 * mmsize] - paddd m6, m3 + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] 
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + psrad m0, 6 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + psrad m1, 6 + + packssdw m0, m1 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + psrad m2, 6 -%ifidn %1,sp - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 - paddd m12, m14 - paddd m13, m14 - paddd m0, m14 - paddd m6, m14 - psrad m8, 12 - psrad m9, 12 - psrad m10, 12 - psrad m11, 12 - psrad m12, 12 - psrad m13, 12 - psrad m0, 12 - psrad m6, 12 + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end + psrad m3, 6 + + packssdw m2, m3 + + movlps [r2 + 2 * r3], m2 + lea r5, [3 * r3] + movhps [r2 + r5], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_LUMA_SS 4, 4 + FILTER_VER_LUMA_SS 8, 8 + FILTER_VER_LUMA_SS 8, 4 + FILTER_VER_LUMA_SS 4, 8 + FILTER_VER_LUMA_SS 16, 16 + FILTER_VER_LUMA_SS 16, 8 + FILTER_VER_LUMA_SS 8, 16 + FILTER_VER_LUMA_SS 16, 12 + 
FILTER_VER_LUMA_SS 12, 16 + FILTER_VER_LUMA_SS 16, 4 + FILTER_VER_LUMA_SS 4, 16 + FILTER_VER_LUMA_SS 32, 32 + FILTER_VER_LUMA_SS 32, 16 + FILTER_VER_LUMA_SS 16, 32 + FILTER_VER_LUMA_SS 32, 24 + FILTER_VER_LUMA_SS 24, 32 + FILTER_VER_LUMA_SS 32, 8 + FILTER_VER_LUMA_SS 8, 32 + FILTER_VER_LUMA_SS 64, 64 + FILTER_VER_LUMA_SS 64, 32 + FILTER_VER_LUMA_SS 32, 64 + FILTER_VER_LUMA_SS 64, 48 + FILTER_VER_LUMA_SS 48, 64 + FILTER_VER_LUMA_SS 64, 16 + FILTER_VER_LUMA_SS 16, 64 + +%macro FILTER_VER_LUMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + shl r4d, 7 + +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 - psrad m0, 6 - psrad m6, 6 + lea r5, [pw_LumaCoeffVer + r4] %endif - packssdw m8, m9 - packssdw m10, m11 - packssdw m12, m13 - packssdw m0, m6 - lea r2, [r2 + r3 * 4] + + lea r4, [r1 * 3] + sub r0, r4 %ifidn %1,sp - packuswb m8, m10 - packuswb m12, m0 - vpermd m8, m1, m8 - vpermd m12, m1, m12 - vextracti128 xm10, m8, 1 - vextracti128 xm0, m12, 1 - movq [r2], xm8 - movhps [r2 + r3], xm8 - movq [r2 + r3 * 2], xm10 - movhps [r2 + r6], xm10 - lea r2, [r2 + r3 * 4] - movq [r2], xm12 - movhps [r2 + r3], xm12 - movq [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm0 + mova m6, [pd_526336] %else - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm6, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm6 -%endif - - lea r2, [r2 + r3 * 4] - sub r0, r7 - dec r8d - jnz .loopH - RET + add r3d, r3d %endif -%endmacro - - FILTER_VER_LUMA_S_AVX2_8xN sp, 16 - FILTER_VER_LUMA_S_AVX2_8xN sp, 32 - FILTER_VER_LUMA_S_AVX2_8xN ss, 16 - 
FILTER_VER_LUMA_S_AVX2_8xN ss, 32 -%macro PROCESS_LUMA_S_AVX2_W8_4R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] pmaddwd m4, [r5 + 1 * mmsize] + paddd m0, m5 paddd m2, m4 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m4, m5, [r5 + 2 * mmsize] - paddd m1, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - movu xm4, [r0 + r4] ; m4 = row 7 - punpckhwd xm5, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 3 * mmsize] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + pmaddwd m1, 
[r5 + 2 * mmsize] paddd m0, m5 - pmaddwd m6, [r5 + 2 * mmsize] - paddd m2, m6 - lea r0, [r0 + r1 * 4] - movu xm5, [r0] ; m5 = row 8 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 3 * mmsize] - paddd m1, m6 - pmaddwd m4, [r5 + 2 * mmsize] - paddd m3, m4 - movu xm6, [r0 + r1] ; m6 = row 9 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m5, [r5 + 3 * mmsize] - paddd m2, m5 - movu xm4, [r0 + r1 * 2] ; m4 = row 10 - punpckhwd xm5, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm5, 1 - pmaddwd m6, [r5 + 3 * mmsize] - paddd m3, m6 - -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 + paddd m2, m1 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + 2 * r1] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] + pmaddwd m4, [r5 + 3 * mmsize] + paddd m2, m4 + +%ifidn %1,sp + paddd m0, m6 + paddd m2, m6 psrad m0, 12 - psrad m1, 12 psrad m2, 12 - psrad m3, 12 %else psrad m0, 6 - psrad m1, 6 psrad m2, 6 - psrad m3, 6 %endif - packssdw m0, m1 - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m4, [interp8_hps_shuf] - vpermd m0, m4, m0 + packssdw m0, m2 vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 %else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 %endif + RET %endmacro -%macro FILTER_VER_LUMA_S_AVX2_8x4 1 + FILTER_VER_LUMA_AVX2_4x4 sp + FILTER_VER_LUMA_AVX2_4x4 ss + +%macro FILTER_VER_LUMA_AVX2_4x8 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 +cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -24137,390 +19802,355 @@ cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 lea r4, [r1 * 3] sub r0, r4 + %ifidn %1,sp mova 
m7, [pd_526336] %else add r3d, r3d %endif + lea r6, [r3 * 3] + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 - PROCESS_LUMA_S_AVX2_W8_4R %1 - lea r4, [r3 * 3] %ifidn %1,sp - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 + paddd m0, m7 + paddd m2, m7 + psrad m0, 12 + psrad m2, 12 %else - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movq xm3, [r0 + r4] + punpcklwd xm5, xm3 + lea r0, [r0 + 4 * r1] + movq xm2, [r0] + punpcklwd xm3, xm2 + vinserti128 m5, m5, xm3, 1 ; m5 = [C B B A] + pmaddwd m3, m5, [r5 + 3 * mmsize] + paddd m4, m3 + pmaddwd m5, [r5 + 2 * mmsize] + paddd m1, m5 + movq xm3, [r0 + r1] + punpcklwd xm2, xm3 + movq xm5, [r0 + 2 * 
r1] + punpcklwd xm3, xm5 + vinserti128 m2, m2, xm3, 1 ; m2 = [E D D C] + pmaddwd m2, [r5 + 3 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 +%else + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m4, m1 + +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r6], xm2, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 %endif RET %endmacro - FILTER_VER_LUMA_S_AVX2_8x4 sp - FILTER_VER_LUMA_S_AVX2_8x4 ss + FILTER_VER_LUMA_AVX2_4x8 sp + FILTER_VER_LUMA_AVX2_4x8 ss -%macro PROCESS_LUMA_AVX2_W8_16R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 +%macro PROCESS_LUMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - 
pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 2 * mmsize] - paddd m1, m7 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 3 * mmsize] - paddd m0, m8 - pmaddwd m8, m6, [r5 + 2 * mmsize] - paddd m2, m8 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 3 * mmsize] - paddd m1, m9 - pmaddwd m9, m7, [r5 + 2 * mmsize] - paddd m3, m9 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 
m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m2, m10 - pmaddwd m10, m8, [r5 + 2 * mmsize] - paddd m4, m10 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 3 * mmsize] - paddd m3, m11 - pmaddwd m11, m9, [r5 + 2 * mmsize] - paddd m5, m11 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 3 * mmsize] - paddd m4, m12 - pmaddwd m12, m10, [r5 + 2 * mmsize] - paddd m6, m12 - pmaddwd m12, m10, [r5 + 1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 3 * mmsize] - paddd m5, m13 - pmaddwd m13, m11, [r5 + 2 * mmsize] - paddd m7, m13 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] %ifidn %1,sp - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 + paddd m0, m7 + paddd m2, m7 psrad m0, 12 - psrad m1, 12 psrad m2, 12 - psrad m3, 12 - psrad m4, 12 - psrad m5, 12 %else psrad m0, 6 - psrad m1, 6 psrad m2, 6 - psrad m3, 6 - psrad m4, 6 - psrad m5, 6 %endif - packssdw m0, m1 - packssdw m2, m3 - packssdw m4, m5 -%ifidn %1,sp - packuswb m0, m2 - mova m5, [interp8_hps_shuf] - vpermd m0, m5, m0 + packssdw m0, m2 vextracti128 xm2, m0, 1 +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm0, 3 +%else movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 
xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 %endif - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m0, m12, [r5 + 3 * mmsize] - paddd m6, m0 - pmaddwd m0, m12, [r5 + 2 * mmsize] - paddd m8, m0 - pmaddwd m0, m12, [r5 + 1 * mmsize] - paddd m10, m0 - pmaddwd m12, [r5] - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m1, m13, [r5 + 3 * mmsize] - paddd m7, m1 - pmaddwd m1, m13, [r5 + 2 * mmsize] - paddd m9, m1 - pmaddwd m1, m13, [r5 + 1 * mmsize] - paddd m11, m1 - pmaddwd m13, [r5] + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 3 * mmsize] + paddd m4, m2 + pmaddwd m2, m5, [r5 + 2 * mmsize] + paddd m1, m2 + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m1, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m6, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] %ifidn %1,sp - paddd m6, m14 - paddd m7, m14 - psrad m6, 12 - psrad m7, 12 + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 %else - psrad m6, 6 - psrad m7, 6 + psrad m4, 6 + psrad m1, 6 %endif - packssdw m6, m7 - lea r8, [r2 + r3 * 4] - + packssdw m4, m1 + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] %ifidn %1,sp - packuswb m4, m6 - vpermd m4, m5, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 + packuswb xm4, xm1 + movd [r2], xm4 + pextrd [r2 + r3], xm4, 2 + pextrd [r2 + r3 * 2], xm4, 1 + pextrd [r2 + r6], xm4, 3 %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 
11011000b - vextracti128 xm1, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 %endif - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m8, m2 - pmaddwd m2, m0, [r5 + 2 * mmsize] - paddd m10, m2 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m12, m2 - pmaddwd m0, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m3, m1, [r5 + 3 * mmsize] - paddd m9, m3 - pmaddwd m3, m1, [r5 + 2 * mmsize] - paddd m11, m3 - pmaddwd m3, m1, [r5 + 1 * mmsize] - paddd m13, m3 - pmaddwd m1, [r5] - movu xm3, [r7 + r1] ; m3 = row 17 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m10, m4 - pmaddwd m4, m2, [r5 + 2 * mmsize] - paddd m12, m4 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm3, xm4 + movq xm4, [r0 + r4] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddwd m2, m3, [r5 + 3 * mmsize] - paddd m11, m2 - pmaddwd m2, m3, [r5 + 2 * mmsize] - paddd m13, m2 - pmaddwd m3, [r5 + 1 * mmsize] - paddd m1, m3 - movu xm2, [r7 + r4] ; m2 = row 19 - punpckhwd xm6, xm4, xm2 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 3 * mmsize] + paddd m6, m4 + pmaddwd m4, m3, [r5 + 2 * mmsize] + paddd m5, m4 + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 3 * mmsize] - paddd m12, m6 - pmaddwd m4, [r5 + 2 * mmsize] + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m4, 
m1, [r5 + 3 * mmsize] + paddd m5, m4 + pmaddwd m4, m1, [r5 + 2 * mmsize] + paddd m0, m4 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + movq xm4, [r0 + r4] + punpcklwd xm2, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] + pmaddwd m4, m2, [r5 + 3 * mmsize] paddd m0, m4 - lea r7, [r7 + r1 * 4] - movu xm6, [r7] ; m6 = row 20 - punpckhwd xm7, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 3 * mmsize] - paddd m13, m7 pmaddwd m2, [r5 + 2 * mmsize] - paddd m1, m2 - movu xm7, [r7 + r1] ; m7 = row 21 - punpckhwd xm2, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddwd m6, [r5 + 3 * mmsize] - paddd m0, m6 - movu xm2, [r7 + r1 * 2] ; m2 = row 22 - punpckhwd xm3, xm7, xm2 - punpcklwd xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddwd m7, [r5 + 3 * mmsize] - paddd m1, m7 + paddd m3, m2 + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [22 21 21 20] + pmaddwd m1, [r5 + 3 * mmsize] + paddd m3, m1 %ifidn %1,sp - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 - paddd m12, m14 - paddd m13, m14 - paddd m0, m14 - paddd m1, m14 - psrad m8, 12 - psrad m9, 12 - psrad m10, 12 - psrad m11, 12 - psrad m12, 12 - psrad m13, 12 + paddd m6, m7 + paddd m5, m7 + paddd m0, m7 + paddd m3, m7 + psrad m6, 12 + psrad m5, 12 psrad m0, 12 - psrad m1, 12 + psrad m3, 12 %else - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 + psrad m6, 6 + psrad m5, 6 psrad m0, 6 - psrad m1, 6 + psrad m3, 6 %endif - packssdw m8, m9 - packssdw m10, m11 - packssdw m12, m13 - packssdw m0, m1 - lea r8, [r8 + r3 * 4] + packssdw m6, m5 + packssdw m0, m3 + lea r2, [r2 + r3 * 4] %ifidn %1,sp - packuswb m8, m10 - packuswb m12, m0 - vpermd m8, m5, m8 - vpermd m12, m5, m12 - vextracti128 xm10, m8, 1 - vextracti128 xm0, m12, 1 - movq [r8], xm8 - movhps [r8 + r3], xm8 - movq [r8 + r3 * 
2], xm10 - movhps [r8 + r6], xm10 - lea r8, [r8 + r3 * 4] - movq [r8], xm12 - movhps [r8 + r3], xm12 - movq [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm0 + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movd [r2], xm6 + movd [r2 + r3], xm0 + pextrd [r2 + r3 * 2], xm6, 1 + pextrd [r2 + r6], xm0, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm6, 2 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm6, 3 + pextrd [r2 + r6], xm0, 3 %else - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - lea r8, [r8 + r3 * 4] - movu [r8], xm12 - movu [r8 + r3], xm13 - movu [r8 + r3 * 2], xm0 - movu [r8 + r6], xm1 + vextracti128 xm5, m6, 1 + vextracti128 xm3, m0, 1 + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 %endif %endmacro -%macro FILTER_VER_LUMA_AVX2_Nx16 2 +%macro FILTER_VER_LUMA_AVX2_4x16 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 +cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -24535,37 +20165,22 @@ cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp - mova m14, [pd_526336] + mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] - mov r9d, %2 / 8 -.loopW: - PROCESS_LUMA_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - dec r9d - jnz .loopW + PROCESS_LUMA_AVX2_W4_16R %1 RET -%endif %endmacro - FILTER_VER_LUMA_AVX2_Nx16 sp, 16 - FILTER_VER_LUMA_AVX2_Nx16 sp, 32 - FILTER_VER_LUMA_AVX2_Nx16 sp, 64 - FILTER_VER_LUMA_AVX2_Nx16 ss, 16 - FILTER_VER_LUMA_AVX2_Nx16 ss, 32 - FILTER_VER_LUMA_AVX2_Nx16 ss, 64 + FILTER_VER_LUMA_AVX2_4x16 
sp + FILTER_VER_LUMA_AVX2_4x16 ss -%macro FILTER_VER_LUMA_AVX2_NxN 3 +%macro FILTER_VER_LUMA_S_AVX2_8x8 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -24580,102 +20195,215 @@ cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 lea r4, [r1 * 3] sub r0, r4 -%ifidn %3,sp - mova m14, [pd_526336] +%ifidn %1,sp + mova m11, [pd_526336] %else add r3d, r3d %endif - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 8 -.loopW: - PROCESS_LUMA_AVX2_W8_16R %3 -%ifidn %3,sp - add r2, 8 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 
+ 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + pmaddwd m7, [r5] + paddd m5, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + pmaddwd m8, [r5 + 1 * mmsize] + paddd m4, m10 + paddd m6, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + pmaddwd m9, [r5 + 1 * mmsize] + paddd m5, m8 + paddd m7, m9 + movu xm8, [r0 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + pmaddwd m10, [r5 + 2 * mmsize] + paddd m4, m9 + paddd m6, m10 + + lea r4, [r3 * 3] +%ifidn %1,sp + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 %else - add r2, 16 + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 %endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 2 * %1 + 16] -%ifidn %3,sp - lea r2, [r8 + r3 * 4 - %1 + 8] + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m1, [interp8_hps_shuf] + vpermd m0, m1, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 %else - lea r2, [r8 + r3 * 4 - 2 * %1 + 16] -%endif - dec r9d - jnz .loopH - RET + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 %endif -%endmacro - 
- FILTER_VER_LUMA_AVX2_NxN 16, 32, sp - FILTER_VER_LUMA_AVX2_NxN 16, 64, sp - FILTER_VER_LUMA_AVX2_NxN 24, 32, sp - FILTER_VER_LUMA_AVX2_NxN 32, 32, sp - FILTER_VER_LUMA_AVX2_NxN 32, 64, sp - FILTER_VER_LUMA_AVX2_NxN 48, 64, sp - FILTER_VER_LUMA_AVX2_NxN 64, 32, sp - FILTER_VER_LUMA_AVX2_NxN 64, 48, sp - FILTER_VER_LUMA_AVX2_NxN 64, 64, sp - FILTER_VER_LUMA_AVX2_NxN 16, 32, ss - FILTER_VER_LUMA_AVX2_NxN 16, 64, ss - FILTER_VER_LUMA_AVX2_NxN 24, 32, ss - FILTER_VER_LUMA_AVX2_NxN 32, 32, ss - FILTER_VER_LUMA_AVX2_NxN 32, 64, ss - FILTER_VER_LUMA_AVX2_NxN 48, 64, ss - FILTER_VER_LUMA_AVX2_NxN 64, 32, ss - FILTER_VER_LUMA_AVX2_NxN 64, 48, ss - FILTER_VER_LUMA_AVX2_NxN 64, 64, ss - -%macro FILTER_VER_LUMA_S_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d -%ifdef PIC - lea r5, [pw_LumaCoeffVer] - add r5, r4 -%else - lea r5, [pw_LumaCoeffVer + r4] -%endif + lea r0, [r0 + r1 * 4] + movu xm9, [r0] ; m9 = row 12 + punpckhwd xm3, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddwd m3, m8, [r5 + 3 * mmsize] + pmaddwd m8, [r5 + 2 * mmsize] + paddd m5, m3 + paddd m7, m8 + movu xm3, [r0 + r1] ; m3 = row 13 + punpckhwd xm0, xm9, xm3 + punpcklwd xm9, xm3 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm9, xm3, xm0 + punpcklwd xm3, xm0 + vinserti128 m3, m3, xm9, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m7, m3 - lea r4, [r1 * 3] - sub r0, r4 %ifidn %1,sp - mova m14, [pd_526336] + paddd m4, m11 + paddd m5, m11 + paddd m6, m11 + paddd m7, m11 + psrad m4, 12 + psrad m5, 12 + psrad m6, 12 + psrad m7, 12 %else - add r3d, r3d + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 %endif - lea r6, [r3 * 3] - PROCESS_LUMA_AVX2_W8_16R %1 + packssdw m4, m5 + packssdw m6, m7 + lea r2, [r2 + r3 * 4] %ifidn %1,sp - add r2, 8 + packuswb m4, m6 + vpermd m4, m1, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + 
movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 %else - add r2, 16 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 %endif - add r0, 16 - mova m7, m14 - PROCESS_LUMA_AVX2_W4_16R %1 RET %endif %endmacro - FILTER_VER_LUMA_S_AVX2_12x16 sp - FILTER_VER_LUMA_S_AVX2_12x16 ss + FILTER_VER_LUMA_S_AVX2_8x8 sp + FILTER_VER_LUMA_S_AVX2_8x8 ss -%macro FILTER_VER_LUMA_S_AVX2_16x12 1 +%macro FILTER_VER_LUMA_S_AVX2_8xN 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 +cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -24695,8 +20423,9 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 add r3d, r3d %endif lea r6, [r3 * 3] - mov r9d, 2 -.loopW: + lea r7, [r1 * 4] + mov r8d, %2 / 16 +.loopH: movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -24715,15 +20444,15 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 + movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 @@ -24732,7 +20461,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 @@ -24741,7 +20470,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 + movu xm7, [r0 + r4] ; m7 = row 
7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 @@ -24752,8 +20481,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 @@ -24764,7 +20493,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m9, m7, [r5 + 1 * mmsize] paddd m5, m9 pmaddwd m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 + movu xm9, [r0 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 @@ -24775,7 +20504,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m10, m8, [r5 + 1 * mmsize] paddd m6, m10 pmaddwd m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm11, 1 @@ -24786,7 +20515,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m11, m9, [r5 + 1 * mmsize] paddd m7, m11 pmaddwd m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 + movu xm11, [r0 + r4] ; m11 = row 11 punpckhwd xm12, xm10, xm11 punpcklwd xm10, xm11 vinserti128 m10, m10, xm12, 1 @@ -24797,8 +20526,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 pmaddwd m12, m10, [r5 + 1 * mmsize] paddd m8, m12 pmaddwd m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 punpckhwd xm13, xm11, xm12 punpcklwd xm11, xm12 vinserti128 m11, m11, xm13, 1 @@ -24834,11 +20563,10 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 - %ifidn %1,sp packuswb m0, m2 - mova m5, [interp8_hps_shuf] - vpermd m0, m5, m0 + mova m1, [interp8_hps_shuf] + vpermd m0, m1, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 @@ -24855,7 +20583,7 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 movu [r2 + r6], xm3 %endif - movu xm13, [r7 + r1] 
; m13 = row 13 + movu xm13, [r0 + r1] ; m13 = row 13 punpckhwd xm0, xm12, xm13 punpcklwd xm12, xm13 vinserti128 m12, m12, xm0, 1 @@ -24863,18 +20591,20 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 paddd m6, m0 pmaddwd m0, m12, [r5 + 2 * mmsize] paddd m8, m0 - pmaddwd m12, [r5 + 1 * mmsize] - paddd m10, m12 - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm2, xm13, xm0 punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m1, m13, [r5 + 3 * mmsize] - paddd m7, m1 - pmaddwd m1, m13, [r5 + 2 * mmsize] - paddd m9, m1 - pmaddwd m13, [r5 + 1 * mmsize] - paddd m11, m13 + vinserti128 m13, m13, xm2, 1 + pmaddwd m2, m13, [r5 + 3 * mmsize] + paddd m7, m2 + pmaddwd m2, m13, [r5 + 2 * mmsize] + paddd m9, m2 + pmaddwd m2, m13, [r5 + 1 * mmsize] + paddd m11, m2 + pmaddwd m13, [r5] %ifidn %1,sp paddd m6, m14 @@ -24886,109 +20616,288 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 psrad m7, 6 %endif packssdw m6, m7 - lea r8, [r2 + r3 * 4] + lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 - vpermd m4, m5, m4 + vpermd m4, m1, m4 vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 %else - vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b + vpermq m4, m4, 11011000b vextracti128 xm1, m4, 1 vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 + movu [r2], xm4 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 %endif - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m8, m2 - pmaddwd m0, [r5 + 2 * mmsize] - paddd m10, m0 - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhwd xm3, xm1, xm2 - 
punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m3, m1, [r5 + 3 * mmsize] + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm0, xm6 + punpcklwd xm0, xm6 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 3 * mmsize] + paddd m8, m5 + pmaddwd m5, m0, [r5 + 2 * mmsize] + paddd m10, m5 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m12, m5 + pmaddwd m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhwd xm3, xm6, xm2 + punpcklwd xm6, xm2 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 3 * mmsize] paddd m9, m3 - pmaddwd m1, [r5 + 2 * mmsize] - paddd m11, m1 - movu xm3, [r7 + r1] ; m3 = row 17 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m11, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m13, m3 + pmaddwd m6, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddwd m2, [r5 + 3 * mmsize] - paddd m10, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhwd xm2, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm2, 1 + pmaddwd m2, m3, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m3, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m6, m3 + movu xm2, [r0 + r4] ; m2 = row 19 + punpckhwd xm7, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm7, 1 + pmaddwd m7, m4, [r5 + 3 * mmsize] + paddd m12, m7 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm7, [r0] ; m7 = row 20 + punpckhwd xm3, xm2, xm7 + punpcklwd xm2, xm7 + vinserti128 m2, m2, xm3, 1 + pmaddwd m3, m2, [r5 + 3 * mmsize] + paddd m13, m3 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m6, m2 + movu xm3, [r0 + r1] ; m3 = row 21 + punpckhwd xm2, xm7, xm3 + punpcklwd xm7, xm3 + vinserti128 m7, m7, xm2, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m0, m7 + movu xm2, [r0 + r1 * 2] ; m2 = row 
22 + punpckhwd xm7, xm3, xm2 + punpcklwd xm3, xm2 + vinserti128 m3, m3, xm7, 1 pmaddwd m3, [r5 + 3 * mmsize] - paddd m11, m3 + paddd m6, m3 %ifidn %1,sp paddd m8, m14 paddd m9, m14 paddd m10, m14 paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m6, m14 psrad m8, 12 psrad m9, 12 psrad m10, 12 psrad m11, 12 + psrad m12, 12 + psrad m13, 12 + psrad m0, 12 + psrad m6, 12 %else psrad m8, 6 psrad m9, 6 psrad m10, 6 psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m6, 6 %endif packssdw m8, m9 packssdw m10, m11 - lea r8, [r8 + r3 * 4] + packssdw m12, m13 + packssdw m0, m6 + lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m8, m10 - vpermd m8, m5, m8 + packuswb m12, m0 + vpermd m8, m1, m8 + vpermd m12, m1, m12 vextracti128 xm10, m8, 1 - movq [r8], xm8 - movhps [r8 + r3], xm8 - movq [r8 + r3 * 2], xm10 - movhps [r8 + r6], xm10 - add r2, 8 + vextracti128 xm0, m12, 1 + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm10 + movhps [r2 + r6], xm10 + lea r2, [r2 + r3 * 4] + movq [r2], xm12 + movhps [r2 + r3], xm12 + movq [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm0 %else vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - add r2, 16 + vextracti128 xm13, m12, 1 + vextracti128 xm6, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm6 %endif - add r0, 16 - dec r9d - jnz .loopW + + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loopH RET %endif %endmacro - FILTER_VER_LUMA_S_AVX2_16x12 sp - FILTER_VER_LUMA_S_AVX2_16x12 ss + FILTER_VER_LUMA_S_AVX2_8xN sp, 16 + FILTER_VER_LUMA_S_AVX2_8xN sp, 32 + FILTER_VER_LUMA_S_AVX2_8xN ss, 16 + FILTER_VER_LUMA_S_AVX2_8xN ss, 32 + +%macro 
PROCESS_LUMA_S_AVX2_W8_4R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m4, m5, [r5 + 2 * mmsize] + paddd m1, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + movu xm4, [r0 + r4] ; m4 = row 7 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m6, [r5 + 2 * mmsize] + paddd m2, m6 + lea r0, [r0 + r1 * 4] + movu xm5, [r0] ; m5 = row 8 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m3, m4 + movu xm6, [r0 + r1] ; m6 = row 9 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 3 * mmsize] + paddd m2, m5 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 + +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 12 + psrad m1, 12 + psrad 
m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m4, [interp8_hps_shuf] + vpermd m0, m4, m0 + vextracti128 xm2, m0, 1 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endif +%endmacro -%macro FILTER_VER_LUMA_S_AVX2_16x4 1 +%macro FILTER_VER_LUMA_S_AVX2_8x4 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize +cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -25007,34 +20916,27 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize %else add r3d, r3d %endif - mov dword [rsp], 2 -.loopW: + PROCESS_LUMA_S_AVX2_W8_4R %1 - lea r6, [r3 * 3] + lea r4, [r3 * 3] %ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 - add r2, 8 + movhps [r2 + r4], xm2 %else movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - add r2, 16 + movu [r2 + r4], xm3 %endif - lea r6, [8 * r1 - 16] - sub r0, r6 - dec dword [rsp] - jnz .loopW RET %endmacro - FILTER_VER_LUMA_S_AVX2_16x4 sp - FILTER_VER_LUMA_S_AVX2_16x4 ss + FILTER_VER_LUMA_S_AVX2_8x4 sp + FILTER_VER_LUMA_S_AVX2_8x4 ss -%macro PROCESS_LUMA_S_AVX2_W8_8R 1 +%macro PROCESS_LUMA_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -25110,43 +21012,51 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 - pmaddwd m8, [r5 + 1 * mmsize] - paddd m6, m8 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhwd xm8, xm9, xm10 + punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 - vinserti128 m9, m9, xm8, 1 - pmaddwd m8, m9, [r5 + 3 * mmsize] - paddd m3, m8 - pmaddwd m8, m9, [r5 + 2 * mmsize] - paddd m5, m8 - pmaddwd m9, [r5 + 1 * mmsize] - paddd m7, m9 - movu xm8, 
[r7 + r4] ; m8 = row 11 - punpckhwd xm9, xm10, xm8 - punpcklwd xm10, xm8 - vinserti128 m10, m10, xm9, 1 - pmaddwd m9, m10, [r5 + 3 * mmsize] - paddd m4, m9 - pmaddwd m10, [r5 + 2 * mmsize] - paddd m6, m10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] lea r7, [r7 + r1 * 4] - movu xm9, [r7] ; m9 = row 12 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m5, m10 - pmaddwd m8, [r5 + 2 * mmsize] - paddd m7, m8 + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] %ifidn %1,sp - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 - paddd m4, m11 - paddd m5, m11 + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 psrad m0, 12 psrad m1, 12 psrad m2, 12 @@ -25164,7 +21074,6 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 - %ifidn %1,sp packuswb m0, m2 mova m5, [interp8_hps_shuf] @@ -25185,22 +21094,32 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize movu [r2 + r6], xm3 %endif - movu xm10, [r7 + r1] ; m10 = row 13 - punpckhwd xm0, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm0, 1 - pmaddwd m9, [r5 + 3 * mmsize] - paddd m6, m9 + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd 
xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm10, xm0 - punpcklwd xm10, xm0 - vinserti128 m10, m10, xm1, 1 - pmaddwd m10, [r5 + 3 * mmsize] - paddd m7, m10 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] %ifidn %1,sp - paddd m6, m11 - paddd m7, m11 + paddd m6, m14 + paddd m7, m14 psrad m6, 12 psrad m7, 12 %else @@ -25219,21 +21138,163 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm5 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m3, m1, [r5 + 2 * mmsize] + paddd m11, m3 + pmaddwd m3, m1, [r5 + 1 * mmsize] + paddd m13, m3 + pmaddwd m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 
m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m2, m3, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m3, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m1, m3 + movu xm2, [r7 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhwd xm3, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 + +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 + psrad m12, 12 + psrad m13, 12 + psrad m0, 12 + psrad m1, 12 +%else + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + lea r8, [r8 + r3 * 4] + +%ifidn %1,sp + packuswb m8, m10 + packuswb m12, m0 + vpermd m8, m5, m8 + vpermd m12, m5, m12 + vextracti128 xm10, m8, 1 + vextracti128 xm0, m12, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm10 + movhps [r8 + r6], xm10 + lea r8, [r8 + r3 * 4] + movq [r8], xm12 + movhps [r8 + 
r3], xm12 + movq [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm0 +%else + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 %endif %endmacro -%macro FILTER_VER_LUMA_AVX2_Nx8 2 +%macro FILTER_VER_LUMA_AVX2_Nx16 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 +cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -25248,14 +21309,14 @@ cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp - mova m11, [pd_526336] + mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: - PROCESS_LUMA_S_AVX2_W8_8R %1 + PROCESS_LUMA_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else @@ -25268,15 +21329,17 @@ cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 %endif %endmacro - FILTER_VER_LUMA_AVX2_Nx8 sp, 32 - FILTER_VER_LUMA_AVX2_Nx8 sp, 16 - FILTER_VER_LUMA_AVX2_Nx8 ss, 32 - FILTER_VER_LUMA_AVX2_Nx8 ss, 16 + FILTER_VER_LUMA_AVX2_Nx16 sp, 16 + FILTER_VER_LUMA_AVX2_Nx16 sp, 32 + FILTER_VER_LUMA_AVX2_Nx16 sp, 64 + FILTER_VER_LUMA_AVX2_Nx16 ss, 16 + FILTER_VER_LUMA_AVX2_Nx16 ss, 32 + FILTER_VER_LUMA_AVX2_Nx16 ss, 64 -%macro FILTER_VER_LUMA_S_AVX2_32x24 1 +%macro FILTER_VER_LUMA_AVX2_NxN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -25290,2859 +21353,757 @@ cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 lea r4, [r1 * 3] sub r0, r4 -%ifidn %1,sp + +%ifidn %3,sp mova m14, [pd_526336] %else add r3d, r3d %endif + lea r6, [r3 * 3] - mov r9d, 4 + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, 
%1 / 8 .loopW: - PROCESS_LUMA_AVX2_W8_16R %1 -%ifidn %1,sp + PROCESS_LUMA_AVX2_W8_16R %3 +%ifidn %3,sp add r2, 8 %else add r2, 16 %endif add r0, 16 - dec r9d + dec r10d jnz .loopW - lea r9, [r1 * 4] - sub r7, r9 - lea r0, [r7 - 48] -%ifidn %1,sp - lea r2, [r8 + r3 * 4 - 24] -%else - lea r2, [r8 + r3 * 4 - 48] -%endif - mova m11, m14 - mov r9d, 4 -.loop: - PROCESS_LUMA_S_AVX2_W8_8R %1 -%ifidn %1,sp - add r2, 8 + sub r7, r11 + lea r0, [r7 - 2 * %1 + 16] +%ifidn %3,sp + lea r2, [r8 + r3 * 4 - %1 + 8] %else - add r2, 16 + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] %endif - add r0, 16 dec r9d - jnz .loop + jnz .loopH RET %endif %endmacro - FILTER_VER_LUMA_S_AVX2_32x24 sp - FILTER_VER_LUMA_S_AVX2_32x24 ss - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_32x32, 4,6,8 - mov r4d, r4m - add r3d, r3d - dec r0 - - ; check isRowExt - cmp r5m, byte 0 - - lea r5, [tab_ChromaCoeff] - vpbroadcastw m0, [r5 + r4 * 4 + 0] - vpbroadcastw m1, [r5 + r4 * 4 + 2] - mova m7, [pw_2000] - - ; register map - ; m0 - interpolate coeff Low - ; m1 - interpolate coeff High - ; m7 - constant pw_2000 - mov r4d, 32 - je .loop - sub r0, r1 - add r4d, 3 - -.loop: - ; Row 0 - movu m2, [r0] - movu m3, [r0 + 1] - punpckhbw m4, m2, m3 - punpcklbw m2, m3 - pmaddubsw m4, m0 - pmaddubsw m2, m0 - - movu m3, [r0 + 2] - movu m5, [r0 + 3] - punpckhbw m6, m3, m5 - punpcklbw m3, m5 - pmaddubsw m6, m1 - pmaddubsw m3, m1 - - paddw m4, m6 - paddw m2, m3 - psubw m4, m7 - psubw m2, m7 - vperm2i128 m3, m2, m4, 0x20 - vperm2i128 m5, m2, m4, 0x31 - movu [r2], m3 - movu [r2 + mmsize], m5 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET + 
FILTER_VER_LUMA_AVX2_NxN 16, 32, sp + FILTER_VER_LUMA_AVX2_NxN 16, 64, sp + FILTER_VER_LUMA_AVX2_NxN 24, 32, sp + FILTER_VER_LUMA_AVX2_NxN 32, 32, sp + FILTER_VER_LUMA_AVX2_NxN 32, 64, sp + FILTER_VER_LUMA_AVX2_NxN 48, 64, sp + FILTER_VER_LUMA_AVX2_NxN 64, 32, sp + FILTER_VER_LUMA_AVX2_NxN 64, 48, sp + FILTER_VER_LUMA_AVX2_NxN 64, 64, sp + FILTER_VER_LUMA_AVX2_NxN 16, 32, ss + FILTER_VER_LUMA_AVX2_NxN 16, 64, ss + FILTER_VER_LUMA_AVX2_NxN 24, 32, ss + FILTER_VER_LUMA_AVX2_NxN 32, 32, ss + FILTER_VER_LUMA_AVX2_NxN 32, 64, ss + FILTER_VER_LUMA_AVX2_NxN 48, 64, ss + FILTER_VER_LUMA_AVX2_NxN 64, 32, ss + FILTER_VER_LUMA_AVX2_NxN 64, 48, ss + FILTER_VER_LUMA_AVX2_NxN 64, 64, ss -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; +%macro FILTER_VER_LUMA_S_AVX2_12x16 1 INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_16x16, 4,7,6 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 mov r4d, r4m - mov r5d, r5m - add r3d, r3d + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, 16 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 
- psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2], m3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PS_16xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d %endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, %2 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - - vpermq m3, m3, 11011000b - movu [r2], m3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET -%endmacro - - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 32 - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 12 - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 8 - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 4 - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 24 - IPFILTER_CHROMA_PS_16xN_AVX2 16 , 64 - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;----------------------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PS_32xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r2, 16 %endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, %2 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - - vpermq m3, m3, 11011000b - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 16] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 24] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - - vpermq m3, m3, 11011000b - movu [r2 + 32], m3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET -%endmacro - - IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16 - IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24 - IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8 - IPFILTER_CHROMA_PS_32xN_AVX2 32 , 64 - IPFILTER_CHROMA_PS_32xN_AVX2 32 , 48 -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_4x4, 4,7,5 - mov r4d, r4m - mov r5d, r5m - 
add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - test r5d, r5d - je .label - sub r0 , r1 - -.label: - ; Row 0-1 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 2-3 - lea r0, [r0 + r1 * 2] - movu xm4, [r0] - vinserti128 m4, m4, [r0 + r1], 1 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, [pw_2000] - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm3 - movhps [r2 + r3], xm4 - - test r5d, r5d - jz .end - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - - ;Row 5-6 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 7 - lea r0, [r0 + r1 * 2] - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, [pw_2000] - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm3 -.end: - RET - -cglobal interp_4tap_horiz_ps_4x2, 4,7,5 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - test r5d, r5d - je .label - sub r0 , r1 - -.label: - ; Row 0-1 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - packssdw m3, m3 - psubw m3, [pw_2000] - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - - test r5d, r5d - 
jz .end - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - - ;Row 2-3 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 5 - lea r0, [r0 + r1 * 2] - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, [pw_2000] - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm3 -.end: + add r0, 16 + mova m7, m14 + PROCESS_LUMA_AVX2_W4_16R %1 RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; -%macro IPFILTER_CHROMA_PS_4xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - mov r4, %2 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - - -.loop: - sub r4d, 4 - ; Row 0-1 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 2-3 - lea r0, [r0 + r1 * 2] - movu xm4, [r0] - vinserti128 m4, m4, [r0 + r1], 1 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, [pw_2000] - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm3 - movhps [r2 + r3], xm4 - - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - - test r4d, r4d - jnz .loop - test r5d, r5d - jz .end - - ;Row 
5-6 - movu xm3, [r0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 7 - lea r0, [r0 + r1 * 2] - vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, [pw_2000] - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movq [r2+r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm3 -.end: - RET %endmacro - IPFILTER_CHROMA_PS_4xN_AVX2 4 , 8 - IPFILTER_CHROMA_PS_4xN_AVX2 4 , 16 -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; + FILTER_VER_LUMA_S_AVX2_12x16 sp + FILTER_VER_LUMA_S_AVX2_12x16 ss + +%macro FILTER_VER_LUMA_S_AVX2_16x12 1 INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_8x8, 4,7,6 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 mov r4d, r4m - mov r5d, r5m - add r3d, r3d + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - mov r6d, 4 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 1 - -.loop: - dec r6d - ; Row 0 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - - vpermq m3, m3, 11011000b - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movu [r2 + r3], xm4 - - lea r2, 
[r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - test r6d, r6d - jnz .loop - test r5d, r5d - je .end - - ;Row 11 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - packssdw m3, m3 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2], xm3 -.end: - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_4x2, 4,6,4 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d %endif + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + 
paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] - vbroadcasti128 m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - - ; Row 0-1 - movu xm2, [r0 - 1] - vinserti128 m2, m2, [r0 + r1 - 1], 1 - pshufb m2, m1 - pmaddubsw m2, m0 - pmaddwd m2, [pw_1] - - packssdw m2, m2 - pmulhrsw m2, [pw_512] - vextracti128 xm3, m2, 1 - packuswb xm2, xm3 - - movd [r2], xm2 - pextrd [r2+r3], xm2, 2 - RET - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PP_32xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_%1x%2, 4,6,7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] +%ifidn %1,sp + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 %endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, %2 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - movu [r2], m3 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET -%endmacro - - IPFILTER_CHROMA_PP_32xN_AVX2 32, 16 - IPFILTER_CHROMA_PP_32xN_AVX2 32, 24 - IPFILTER_CHROMA_PP_32xN_AVX2 32, 8 - IPFILTER_CHROMA_PP_32xN_AVX2 32, 64 - IPFILTER_CHROMA_PP_32xN_AVX2 32, 48 - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PP_8xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] +%ifidn %1,sp + packuswb m0, m2 + mova m5, [interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 %endif - movu m1, [tab_Tm] - vpbroadcastd m2, [pw_1] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - sub r0, 1 - mov r4d, %2 - -.loop: - sub r4d, 4 - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, [pw_512] - lea r0, [r0 + r1 * 2] - - ; Row 2 - vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - ; Row 3 - vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, [pw_512] - - packuswb m3, m4 - mova m5, [interp_4tap_8x8_horiz_shuf] - vpermd m3, m5, m3 - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movhps [r2 + r3], xm3 - lea r2, [r2 + r3 * 2] - movq [r2], xm4 - movhps [r2 + r3], 
xm4 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1*2] - test r4d, r4d - jnz .loop - RET -%endmacro - - IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16 - IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32 - IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4 - IPFILTER_CHROMA_PP_8xN_AVX2 8 , 64 - IPFILTER_CHROMA_PP_8xN_AVX2 8 , 12 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PP_4xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 - mov r4d, r4m + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m12, [r5 + 1 * mmsize] + paddd m10, m12 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m13, [r5 + 1 * mmsize] + paddd m11, m13 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] +%ifidn %1,sp + paddd m6, m14 + paddd m7, m14 + psrad m6, 12 + psrad m7, 12 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + psrad m6, 6 + psrad m7, 6 %endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] - vpbroadcastd m2, [pw_1] - vbroadcasti128 m1, [tab_Tm] - mov r4d, %2 - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - -.loop: - sub r4d, 4 - ; Row 0-1 - movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m3, m3, [r0 + r1], 1 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 2-3 - lea r0, [r0 + r1 * 2] - movu xm4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 
0] - vinserti128 m4, m4, [r0 + r1], 1 - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - pmulhrsw m3, [pw_512] - vextracti128 xm4, m3, 1 - packuswb xm3, xm4 - - movd [r2], xm3 - pextrd [r2+r3], xm3, 2 - lea r2, [r2 + r3 * 2] - pextrd [r2], xm3, 1 - pextrd [r2+r3], xm3, 3 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - test r4d, r4d - jnz .loop - RET -%endmacro - - IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8 - IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16 - -%macro IPFILTER_LUMA_PS_32xN_AVX2 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_%1x%2, 4, 7, 8 - mov r5d, r5m - mov r4d, r4m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 %else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r4d, %2 ;height - add r3d, r3d - vbroadcasti128 m2, [pw_1] - mova m7, [interp8_hps_shuf] - - ; register map - ; m0 - interpolate coeff - ; m1 , m6 - shuffle order table - ; m2 - pw_1 - - - sub r0, 3 - test r5d, r5d - jz .label - lea r6, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride - sub r0, r6 - add r4d, 7 - -.label: - lea r6, [pw_2000] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - - vbroadcasti128 m4, [r0 + 8] - pshufb m5, m4, m6 ;row 0 (col 12 to 15) - pshufb m4, m1 ;row 0 (col 8 to 11) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - - pmaddwd m3, m2 - pmaddwd m4, m2 - 
packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - - movu [r2], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 16] - pshufb m4, m3, m6 ; row 0 (col 20 to 23) - pshufb m3, m1 ; row 0 (col 16 to 19) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 24] - pshufb m5, m4, m6 ;row 0 (col 28 to 31) - pshufb m4, m1 ;row 0 (col 24 to 27) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - - movu [r2 + 32], m3 ;row 0 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endmacro - - IPFILTER_LUMA_PS_32xN_AVX2 32 , 32 - IPFILTER_LUMA_PS_32xN_AVX2 32 , 16 - IPFILTER_LUMA_PS_32xN_AVX2 32 , 24 - IPFILTER_LUMA_PS_32xN_AVX2 32 , 8 - IPFILTER_LUMA_PS_32xN_AVX2 32 , 64 -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8 - mov r5d, r5m - mov r4d, r4m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r4d, 64 ;height - add r3d, r3d - vbroadcasti128 m2, [pw_2000] - mova m7, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 , m6 - shuffle order table - ; m2 - pw_2000 - - sub r0, 3 - test r5d, r5d - jz .label - lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src)-r6 - add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) - -.label: - lea r6, [interp8_hps_shuf] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 8] - pshufb m5, m4, m6 ;row 0 (col 12 to 15) - pshufb m4, m1 ;row 0 (col 8 to 11) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, 
m7 - pmaddwd m5, m7 - packssdw m4, m5 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - mova m5, [r6] - vpermd m3, m5, m3 - psubw m3, m2 - movu [r2], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 16] - pshufb m4, m3, m6 ; row 0 (col 20 to 23) - pshufb m3, m1 ; row 0 (col 16 to 19) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 24] - pshufb m5, m4, m6 ;row 0 (col 28 to 31) - pshufb m4, m1 ;row 0 (col 24 to 27) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m7 - pmaddwd m5, m7 - packssdw m4, m5 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - mova m5, [r6] - vpermd m3, m5, m3 - psubw m3, m2 - movu [r2 + 32], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 32] - pshufb m4, m3, m6 ; row 0 (col 36 to 39) - pshufb m3, m1 ; row 0 (col 32 to 35) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 40] - pshufb m5, m4, m6 ;row 0 (col 44 to 47) - pshufb m4, m1 ;row 0 (col 40 to 43) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m7 - pmaddwd m5, m7 - packssdw m4, m5 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - mova m5, [r6] - vpermd m3, m5, m3 - psubw m3, m2 - movu [r2 + 64], m3 ;row 0 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m0, [r5 + 2 * mmsize] + paddd m10, m0 + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m1, [r5 + 2 * mmsize] + paddd m11, m1 + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m10, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 
m3, m3, xm2, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m11, m3 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_24x32, 4,6,8 - sub r0, 3 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 %else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 %endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] - lea r5, [tab_Tm] - - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 shuffle order table - ; m7 - pw_1 - - mov r4d, 32 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m3 - pshufb m4, [r5] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - - vbroadcasti128 m5, [r0 + 8] - pshufb m6, m5, m3 - pshufb m5, [r5] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - - vbroadcasti128 m2, [r0 + 16] - pshufb m5, m2, m3 - pshufb m2, [r5] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - - packssdw m2, m2 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - - movu [r2], xm4 - movq [r2 + 16], xm5 - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET + packssdw m8, m9 + packssdw m10, m11 + lea r8, [r8 + r3 * 4] -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_12x16, 4,6,8 - sub r0, 3 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8] - vpbroadcastd m1, [r5 + r4 * 8 + 4] +%ifidn %1,sp + packuswb m8, m10 + vpermd m8, m5, m8 + vextracti128 xm10, m8, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm10 + movhps [r8 + r6], 
xm10 + add r2, 8 %else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] - vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + add r2, 16 %endif - movu m3, [tab_Tm + 16] - vpbroadcastd m7, [pw_1] - lea r5, [tab_Tm] - - ; register map - ; m0 , m1 interpolate coeff - ; m2 , m2 shuffle order table - ; m7 - pw_1 - - mov r4d, 8 -.loop: - ; Row 0 - vbroadcasti128 m4, [r0] ;first 8 element - pshufb m5, m4, m3 - pshufb m4, [r5] - pmaddubsw m4, m0 - pmaddubsw m5, m1 - paddw m4, m5 - pmaddwd m4, m7 - - vbroadcasti128 m5, [r0 + 8] ; element 8 to 11 - pshufb m6, m5, m3 - pshufb m5, [r5] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - - packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] - pmulhrsw m4, [pw_512] - - ;Row 1 - vbroadcasti128 m2, [r0 + r1] - pshufb m5, m2, m3 - pshufb m2, [r5] - pmaddubsw m2, m0 - pmaddubsw m5, m1 - paddw m2, m5 - pmaddwd m2, m7 - - vbroadcasti128 m5, [r0 + r1 + 8] - pshufb m6, m5, m3 - pshufb m5, [r5] - pmaddubsw m5, m0 - pmaddubsw m6, m1 - paddw m5, m6 - pmaddwd m5, m7 - - packssdw m2, m5 - pmulhrsw m2, [pw_512] - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm5, m4, 1 - pshufd xm4, xm4, 11011000b - pshufd xm5, xm5, 11011000b - - movq [r2], xm4 - pextrd [r2+8], xm4, 2 - movq [r2 + r3], xm5 - pextrd [r2+r3+8], xm5, 2 - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop + add r0, 16 + dec r9d + jnz .loopW RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PP_16xN_AVX2 2 -INIT_YMM avx2 
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - - mova m6, [pw_512] - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, %2/2 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movu [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r4d - jnz .loop - RET %endmacro - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8 - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32 - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12 - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4 - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 64 - IPFILTER_CHROMA_PP_16xN_AVX2 16 , 24 - -%macro IPFILTER_LUMA_PS_64xN_AVX2 1 -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_64x%1, 4, 7, 8 - mov r5d, r5m - mov r4d, r4m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r4d, %1 ;height - add r3d, r3d - vbroadcasti128 m2, [pw_1] - mova m7, [interp8_hps_shuf] - - ; register map - ; m0 - interpolate coeff - ; m1 , m6 - shuffle order table - ; m2 - pw_2000 - - sub r0, 3 - test r5d, r5d - jz .label - lea r6, [r1 * 3] - sub r0, r6 ; r0(src)-r6 - 
add r4d, 7 ; blkheight += N - 1 - -.label: - lea r6, [pw_2000] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 8] - pshufb m5, m4, m6 ;row 0 (col 12 to 15) - pshufb m4, m1 ;row 0 (col 8 to 11) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - movu [r2], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 16] - pshufb m4, m3, m6 ; row 0 (col 20 to 23) - pshufb m3, m1 ; row 0 (col 16 to 19) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 24] - pshufb m5, m4, m6 ;row 0 (col 28 to 31) - pshufb m4, m1 ;row 0 (col 24 to 27) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - movu [r2 + 32], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 32] - pshufb m4, m3, m6 ; row 0 (col 36 to 39) - pshufb m3, m1 ; row 0 (col 32 to 35) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 40] - pshufb m5, m4, m6 ;row 0 (col 44 to 47) - pshufb m4, m1 ;row 0 (col 40 to 43) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - movu [r2 + 64], m3 ;row 0 - vbroadcasti128 m3, [r0 + 48] - pshufb m4, m3, m6 ; row 0 (col 52 to 55) - pshufb m3, m1 ; row 0 (col 48 to 51) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 56] - pshufb m5, m4, m6 ;row 0 (col 60 to 63) - pshufb 
m4, m1 ;row 0 (col 56 to 59) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 - vpermd m3, m7, m3 - psubw m3, [r6] - movu [r2 + 96], m3 ;row 0 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endmacro - - IPFILTER_LUMA_PS_64xN_AVX2 64 - IPFILTER_LUMA_PS_64xN_AVX2 48 - IPFILTER_LUMA_PS_64xN_AVX2 32 - IPFILTER_LUMA_PS_64xN_AVX2 16 + FILTER_VER_LUMA_S_AVX2_16x12 sp + FILTER_VER_LUMA_S_AVX2_16x12 ss -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 +%macro FILTER_VER_LUMA_S_AVX2_16x4 1 INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - mov r6d, %1/2 - dec r0 - test r5d, r5d - jz .loop - sub r0 , r1 - inc r6d - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movu [r2 + r3], xm4 - - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r6d 
- jnz .loop - test r5d, r5d - jz .end - - ;Row 11 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - packssdw m3, m3 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2], xm3 -.end: - RET -%endmacro - - IPFILTER_CHROMA_PS_8xN_AVX2 2 - IPFILTER_CHROMA_PS_8xN_AVX2 32 - IPFILTER_CHROMA_PS_8xN_AVX2 16 - IPFILTER_CHROMA_PS_8xN_AVX2 6 - IPFILTER_CHROMA_PS_8xN_AVX2 4 - IPFILTER_CHROMA_PS_8xN_AVX2 12 - IPFILTER_CHROMA_PS_8xN_AVX2 64 - -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - mova xm3, [pw_2000] - dec r0 - test r5d, r5d - jz .label - sub r0, r1 - -.label: - lea r6, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r6] - - vinserti128 m1, m1, xm2, 1 - pshufb m1, [interp4_hpp_shuf] - pmaddubsw m1, m0 - pmaddwd m1, [pw_1] - vextracti128 xm2, m1, 1 - packssdw xm1, xm2 - psubw xm1, xm3 - - lea r4, [r3 * 3] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 - - test r5d, r5d - jz .end - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - vinserti128 m1, m1, xm2, 1 - pshufb m1, [interp4_hpp_shuf] - pmaddubsw m1, m0 - pmaddwd m1, [pw_1] - vextracti128 xm2, m1, 1 - packssdw xm1, xm2 - psubw xm1, xm3 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 -.end: - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - vbroadcasti128 m6, [pw_2000] - test r5d, r5d - jz .label - sub r0, r1 - -.label: - mova m4, [interp4_hpp_shuf] - mova m5, [pw_1] - dec r0 - lea r4, [r1 * 3] - movq xm1, [r0] 
;row 0 - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - psubw m1, m6 - - lea r4, [r3 * 3] - vextracti128 xm2, m1, 1 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - movd [r2 + r3 * 2], xm2 - pextrd [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm1, 2 - pextrd [r2 + r3], xm1, 3 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r4], xm2, 3 - test r5d, r5d - jz .end - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - movq xm1, [r0] ;row 0 - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - vinserti128 m1, m1, xm2, 1 - pshufb m1, m4 - pmaddubsw m1, m0 - pmaddwd m1, m5 - packssdw m1, m1 - psubw m1, m6 - vextracti128 xm2, m1, 1 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - movd [r2 + r3 * 2], xm2 -.end: - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_12x16, 4, 6, 7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m7, [pd_526336] %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d %endif - - mova m6, [pw_512] - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 8 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r1 + 4] 
; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - pextrd [r2+8], xm3, 2 - movq [r2 + r3], xm4 - pextrd [r2 + r3 + 8],xm4, 2 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r4d - jnz .loop - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_24x32, 4,6,7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + mov dword [rsp], 2 +.loopW: + PROCESS_LUMA_S_AVX2_W8_4R %1 + lea r6, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + add r2, 8 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + add r2, 16 %endif - - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 32 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movq [r2 + 16], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop + lea r6, [8 * r1 - 16] + sub r0, r6 + dec dword [rsp] + jnz .loopW RET +%endmacro -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t 
dstStride, int coeffIdx, int isRowExt) -;-----------------------------------------------------------------------------------------------------------------------------; -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_6x8, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d + FILTER_VER_LUMA_S_AVX2_16x4 sp + FILTER_VER_LUMA_S_AVX2_16x4 ss -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] +%macro PROCESS_LUMA_S_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 
+ pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m8, [r5 + 1 * mmsize] + paddd m6, m8 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + paddd m5, m8 + pmaddwd m9, [r5 + 1 * mmsize] + paddd m7, m9 + movu xm8, [r7 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + paddd m4, m9 + pmaddwd m10, [r5 + 2 * mmsize] + paddd m6, m10 + lea r7, [r7 + r1 * 4] + movu xm9, [r7] ; m9 = row 12 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m5, m10 + pmaddwd m8, [r5 + 2 * mmsize] + paddd m7, m8 + +%ifidn %1,sp + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + paddd m4, m11 + paddd m5, m11 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 %endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - mov r6d, 8/2 - dec r0 - test r5d, r5d - jz .loop - sub r0 , r1 - inc r6d - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd 
m4, m2 - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - vextracti128 xm4, m3, 1 - movq [r2], xm3 - pextrd [r2 + 8], xm3, 2 - movq [r2 + r3], xm4 - pextrd [r2 + r3 + 8], xm4, 2 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r6d - jnz .loop - test r5d, r5d - jz .end - - ;Row 11 - vbroadcasti128 m3, [r0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - packssdw m3, m3 - psubw m3, m5 - vextracti128 xm4, m3, 1 - movq [r2], xm3 - movd [r2+8], xm4 -.end: - RET - -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_12x16, 6, 7, 8 - mov r5d, r5m - mov r4d, r4m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - add r3d, r3d - vbroadcasti128 m2, [pw_2000] - mov r4d, 16 - vbroadcasti128 m7, [pw_1] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - pw_2000 - - mova m5, [interp8_hps_shuf] - sub r0, 3 - test r5d, r5d - jz .loop - lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src)-r6 - add r4d, 7 -.loop: - - ; Row 0 - - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 - pshufb m3, m1 ; shuffled based on the col order tab_Lm - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m7 - packssdw m4, m4 - - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vpermd m3, m5, m3 - psubw m3, m2 - - vextracti128 xm4, m3, 1 - movu [r2], xm3 ;row 0 - movq [r2 + 16], xm4 ;row 1 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET - -INIT_YMM avx2 -cglobal interp_8tap_horiz_ps_24x32, 4, 7, 8 - mov r5d, r5m - mov r4d, r4m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] -%endif - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r4d, 32 ;height - add 
r3d, r3d - vbroadcasti128 m2, [pw_2000] - vbroadcasti128 m7, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 , m6 - shuffle order table - ; m2 - pw_2000 - - sub r0, 3 - test r5d, r5d - jz .label - lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src)-r6 - add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) - -.label: - lea r6, [interp8_hps_shuf] -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m6 ;row 1 (col 4 to 7) - pshufb m4, m1 ;row 1 (col 0 to 3) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m7 - pmaddwd m5, m7 - packssdw m4, m5 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - mova m5, [r6] - vpermd m3, m5, m3 - psubw m3, m2 - movu [r2], m3 ;row 0 - - vbroadcasti128 m3, [r0 + 16] - pshufb m4, m3, m6 - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - pmaddwd m3, m7 - pmaddwd m4, m7 - packssdw m3, m4 - mova m4, [r6] - vpermd m3, m4, m3 - psubw m3, m2 - movu [r2 + 32], xm3 ;row 0 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_24x32, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, 
[tab_ChromaCoeff + r4 * 4] -%endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, 32 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - packssdw m3, m3 - psubw m3, m5 - vpermq m3, m3, 11011000b - movu [r2 + 32], xm3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------------------------------------------------- -;macro FILTER_H8_W8_16N_AVX2 -;----------------------------------------------------------------------------------------------------------------------- -%macro FILTER_H8_W8_16N_AVX2 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m3, m6 ; row 0 (col 4 to 7) - pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) - pmaddubsw m3, m0 - pmaddubsw m4, m0 - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m4, m6 ;row 1 (col 4 to 7) - pshufb m4, m1 ;row 1 (col 0 to 3) - pmaddubsw m4, m0 - pmaddubsw m5, m0 - pmaddwd m4, m2 - pmaddwd m5, m2 - packssdw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - - pmaddwd m3, m2 - pmaddwd m4, m2 - packssdw m3, m4 ; all rows and col completed. 
- - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - psubw m3, m8 - - vextracti128 xm4, m3, 1 - mova [r4], xm3 - mova [r4 + 16], xm4 - %endmacro - -;----------------------------------------------------------------------------- -; void interp_8tap_hv_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_hv_pp_16x16, 4, 10, 15, 0-31*32 -%define stk_buf1 rsp - mov r4d, r4m - mov r5d, r5m -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] +%ifidn %1,sp + packuswb m0, m2 + mova m5, [interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 %else - vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 %endif - xor r6, r6 - mov r4, rsp - mova m6, [tab_Lm + 32] - mova m1, [tab_Lm] - mov r8, 16 ;height - vbroadcasti128 m8, [pw_2000] - vbroadcasti128 m2, [pw_1] - sub r0, 3 - lea r7, [r1 * 3] ; r7 = (N / 2 - 1) * srcStride - sub r0, r7 ; r0(src)-r7 - add r8, 7 - -.loopH: - FILTER_H8_W8_16N_AVX2 - add r0, r1 - add r4, 32 - inc r6 - cmp r6, 16+7 - jnz .loopH - -; vertical phase - xor r6, r6 - xor r1, r1 -.loopV: - -;load necessary variables - mov r4d, r5d ;coeff here for vertical is r5m - shl r4d, 7 - mov r1d, 16 - add r1d, r1d - - ; load intermedia buffer - mov r0, stk_buf1 - - ; register mapping - ; r0 - src - ; r5 - coeff - ; r6 - loop_i + movu xm10, [r7 + r1] ; m10 = row 13 + punpckhwd xm0, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm10, xm0 + punpcklwd xm10, xm0 + vinserti128 m10, m10, 
xm1, 1 + pmaddwd m10, [r5 + 3 * mmsize] + paddd m7, m10 -; load coeff table -%ifdef PIC - lea r5, [pw_LumaCoeffVer] - add r5, r4 +%ifidn %1,sp + paddd m6, m11 + paddd m7, m11 + psrad m6, 12 + psrad m7, 12 %else - lea r5, [pw_LumaCoeffVer + r4] + psrad m6, 6 + psrad m7, 6 %endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] - lea r4, [r1*3] - mova m14, [pd_526336] - lea r6, [r3 * 3] - mov r9d, 16 / 8 - -.loopW: - PROCESS_LUMA_AVX2_W8_16R sp - add r2, 8 - add r0, 16 - dec r9d - jnz .loopW - RET +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %endif +%endmacro +%macro FILTER_VER_LUMA_AVX2_Nx8 2 INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_12x32, 4, 6, 7 - mov r4d, r4m +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m6, [pw_512] - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 16 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] 
- pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movq [r2], xm3 - pextrd [r2+8], xm3, 2 - movq [r2 + r3], xm4 - pextrd [r2 + r3 + 8],xm4, 2 - lea r2, [r2 + r3 * 2] - lea r0, [r0 + r1 * 2] - dec r4d - jnz .loop - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_24x64, 4,6,7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m11, [pd_526336] %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d %endif - - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 64 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, 11011000b - - vextracti128 xm4, m3, 1 - movu [r2], xm3 - movq [r2 + 16], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET - - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - mova m4, [interp4_hpp_shuf] - mova m5, [pw_1] - dec r0 - lea r4, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, 
m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 - - lea r4, [r3 * 3] - pextrw [r2], xm1, 0 - pextrw [r2 + r3], xm1, 1 - pextrw [r2 + r3 * 2], xm1, 4 - pextrw [r2 + r4], xm1, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm1, 2 - pextrw [r2 + r3], xm1, 3 - pextrw [r2 + r3 * 2], xm1, 6 - pextrw [r2 + r4], xm1, 7 - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - - lea r4, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 - - lea r4, [r3 * 3] - pextrw [r2], xm1, 0 - pextrw [r2 + r3], xm1, 1 - pextrw [r2 + r3 * 2], xm1, 4 - pextrw [r2 + r4], xm1, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm1, 2 - pextrw [r2 + r3], xm1, 3 - pextrw [r2 + r3 * 2], xm1, 6 - pextrw [r2 + r4], xm1, 7 - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro IPFILTER_CHROMA_PP_64xN_AVX2 1 -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_64x%1, 4,6,7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r2, 16 %endif - - mova m1, [interp4_horiz_shuf1] - 
vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, %1 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - packuswb m3, m4 - vpermq m3, m3, 11011000b - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 36] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 48] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 52] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - packuswb m3, m4 - vpermq m3, m3, 11011000b - movu [r2 + 32], m3 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro - IPFILTER_CHROMA_PP_64xN_AVX2 64 - IPFILTER_CHROMA_PP_64xN_AVX2 32 - IPFILTER_CHROMA_PP_64xN_AVX2 48 - IPFILTER_CHROMA_PP_64xN_AVX2 16 + FILTER_VER_LUMA_AVX2_Nx8 sp, 32 + FILTER_VER_LUMA_AVX2_Nx8 sp, 16 + FILTER_VER_LUMA_AVX2_Nx8 ss, 32 + FILTER_VER_LUMA_AVX2_Nx8 ss, 16 -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_48x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_S_AVX2_32x24 1 INIT_YMM avx2 -cglobal 
interp_4tap_horiz_pp_48x64, 4,6,7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m1, [interp4_horiz_shuf1] - vpbroadcastd m2, [pw_1] - mova m6, [pw_512] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 - mov r4d, 64 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, q3120 - - movu [r2], m3 - - vbroadcasti128 m3, [r0 + mmsize] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + mmsize + 4] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - vbroadcasti128 m4, [r0 + mmsize + 16] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - vbroadcasti128 m5, [r0 + mmsize + 20] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vpermq m3, m3, q3120 - movu [r2 + mmsize], xm3 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_48x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;-----------------------------------------------------------------------------------------------------------------------------; - -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_48x64, 4,7,6 - mov r4d, r4m - mov r5d, r5m + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] +%endif + lea r6, [r3 * 3] + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add r2, 16 %endif - - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, 64 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - -.loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, q3120 - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, q3120 - movu [r2 + 32], m3 - - vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 40] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, q3120 - movu [r2 + 64], m3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------------------------------------------------------- -; void 
interp_4tap_horiz_ps_24x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_24x64, 4,7,6 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - vbroadcasti128 m2, [pw_1] - vbroadcasti128 m5, [pw_2000] - mova m1, [tab_Tm] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - mov r6d, 64 - dec r0 - test r5d, r5d - je .loop - sub r0 , r1 - add r6d , 3 - + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 48] +%ifidn %1,sp + lea r2, [r8 + r3 * 4 - 24] +%else + lea r2, [r8 + r3 * 4 - 48] +%endif + mova m11, m14 + mov r9d, 4 .loop: - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - psubw m3, m5 - vpermq m3, m3, q3120 - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - packssdw m3, m3 - psubw m3, m5 - vpermq m3, m3, q3120 - movu [r2 + 32], xm3 - - add r2, r3 - add r0, r1 - dec r6d - jnz .loop - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7 - mov r4d, r4m - mov r5d, r5m - add r3d, r3d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastd m0, [r6 + r4 * 4] -%else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] -%endif - vbroadcasti128 m6, [pw_2000] - test r5d, r5d - jz .label - sub r0, r1 - -.label: - mova m4, [interp4_hps_shuf] - mova m5, [pw_1] - dec r0 - lea r4, [r1 * 3] - movq xm1, [r0] ;row 0 - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] 
- movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - psubw m1, m6 - - lea r4, [r3 * 3] - vextracti128 xm2, m1, 1 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - movd [r2 + r3 * 2], xm2 - pextrd [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm1, 2 - pextrd [r2 + r3], xm1, 3 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r4], xm2, 3 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - lea r4, [r1 * 3] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m1, m1, xm2, 1 - lea r0, [r0 + r1 * 4] - movq xm3, [r0] - movhps xm3, [r0 + r1] - movq xm2, [r0 + r1 * 2] - movhps xm2, [r0 + r4] - vinserti128 m3, m3, xm2, 1 - - pshufb m1, m4 - pshufb m3, m4 - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddwd m1, m5 - pmaddwd m3, m5 - packssdw m1, m3 - psubw m1, m6 - - lea r4, [r3 * 3] - vextracti128 xm2, m1, 1 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - movd [r2 + r3 * 2], xm2 - pextrd [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm1, 2 - pextrd [r2 + r3], xm1, 3 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r4], xm2, 3 - - test r5d, r5d - jz .end - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - movq xm1, [r0] - movhps xm1, [r0 + r1] - movq xm2, [r0 + r1 * 2] - vinserti128 m1, m1, xm2, 1 - pshufb m1, m4 - pmaddubsw m1, m0 - pmaddwd m1, m5 - packssdw m1, m1 - psubw m1, m6 - vextracti128 xm2, m1, 1 - - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - movd [r2 + r3 * 2], xm2 -.end: - RET - -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_6x16, 4, 6, 7 - mov r4d, r4m - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastd m0, [r5 + r4 * 4] + PROCESS_LUMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 %else - vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] + add 
r2, 16 %endif - - mova m1, [tab_Tm] - mova m2, [pw_1] - mova m6, [pw_512] - lea r4, [r1 * 3] - lea r5, [r3 * 3] - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - constant word 1 - - dec r0 -%rep 4 - ; Row 0 - vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m3, m1 - pmaddubsw m3, m0 - pmaddwd m3, m2 - - ; Row 1 - vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - packssdw m3, m4 - pmulhrsw m3, m6 - - ; Row 2 - vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m4, m1 - pmaddubsw m4, m0 - pmaddwd m4, m2 - - ; Row 3 - vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - pshufb m5, m1 - pmaddubsw m5, m0 - pmaddwd m5, m2 - packssdw m4, m5 - pmulhrsw m4, m6 - - packuswb m3, m4 - vextracti128 xm4, m3, 1 - movd [r2], xm3 - pextrw [r2 + 4], xm4, 0 - pextrd [r2 + r3], xm3, 1 - pextrw [r2 + r3 + 4], xm4, 2 - pextrd [r2 + r3 * 2], xm3, 2 - pextrw [r2 + r3 * 2 + 4], xm4, 4 - pextrd [r2 + r5], xm3, 3 - pextrw [r2 + r5 + 4], xm4, 6 - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] -%endrep + add r0, 16 + dec r9d + jnz .loop RET +%endif +%endmacro + + FILTER_VER_LUMA_S_AVX2_32x24 sp + FILTER_VER_LUMA_S_AVX2_32x24 ss From 0e13e211e62c12a67a3c3b2365b555742ff012fc Mon Sep 17 00:00:00 2001 From: Aruna Matheswaran Date: Sat, 27 Jan 2018 18:45:21 +0530 Subject: [PATCH 46/51] rc: fix inconsistency in pass 2 when weightp and cutree are enabled. 
--- source/encoder/slicetype.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp index fe2e51aab9..544e6c645f 100644 --- a/source/encoder/slicetype.cpp +++ b/source/encoder/slicetype.cpp @@ -154,7 +154,7 @@ void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param) int blockXY = 0; int blockX = 0, blockY = 0; double strength = 0.f; - if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) + if ((param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) || (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame))) { /* Need to init it anyways for CU tree */ int cuCount = blockCount; @@ -982,11 +982,8 @@ void PreLookaheadGroup::processTasks(int workerThreadID) ProfileLookaheadTime(m_lookahead.m_preLookaheadElapsedTime, m_lookahead.m_countPreLookahead); ProfileScopeEvent(prelookahead); m_lock.release(); - preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc); - if (m_lookahead.m_param->rc.bStatRead && m_lookahead.m_param->rc.cuTree && IS_REFERENCED(preFrame)) - /* cu-tree offsets were read from stats file */; - else if (m_lookahead.m_bAdaptiveQuant) + if (m_lookahead.m_bAdaptiveQuant) tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param); tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize); preFrame->m_lowresInit = true; From c56fd20ff467e29d4b3e4e2a97cb854a469054d0 Mon Sep 17 00:00:00 2001 From: Radhakrishnan Date: Wed, 31 Jan 2018 15:13:30 +0530 Subject: [PATCH 47/51] x86: Split ipfilter8 kernals part3 This patch adds infra and moves the vertical 4tap ipfilter kernals from ipfilter8.asm to new source file v4-ipfilter8.asm to reduce compile time of x265. 
--- source/common/CMakeLists.txt | 2 +- source/common/x86/ipfilter8.asm | 22286 ++++++--------------------- source/common/x86/v4-ipfilter8.asm | 12799 +++++++++++++++ 3 files changed, 17566 insertions(+), 17521 deletions(-) create mode 100644 source/common/x86/v4-ipfilter8.asm diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index ab980a223e..8261fac0c3 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -63,7 +63,7 @@ if(ENABLE_ASSEMBLY AND X86) if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) else() - set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) + set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm v4-ipfilter8.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() if(NOT X64) set(A_SRCS ${A_SRCS} pixel-32.asm) diff --git a/source/common/x86/ipfilter8.asm b/source/common/x86/ipfilter8.asm index 223917b0f3..28ef118ee3 100644 --- a/source/common/x86/ipfilter8.asm +++ b/source/common/x86/ipfilter8.asm @@ -33,9 +33,6 @@ const tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15 -const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9 - times 2 db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13 - const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 dd 2, 3, 3, 4, 4, 5, 5, 6 @@ -44,103 +41,8 @@ const tab_Lm, db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 -const tab_Vm, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 - db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 - -const tab_Cm, db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 - const pd_526336, times 8 dd 8192*64+2048 -const tab_ChromaCoeff, db 0, 64, 0, 0 - db -2, 58, 10, -2 - db -4, 
54, 16, -2 - db -6, 46, 28, -4 - db -4, 36, 36, -4 - db -4, 28, 46, -6 - db -2, 16, 54, -4 - db -2, 10, 58, -2 - -const tabw_ChromaCoeff, dw 0, 64, 0, 0 - dw -2, 58, 10, -2 - dw -4, 54, 16, -2 - dw -6, 46, 28, -4 - dw -4, 36, 36, -4 - dw -4, 28, 46, -6 - dw -2, 16, 54, -4 - dw -2, 10, 58, -2 - -const tab_ChromaCoeff_V, times 8 db 0, 64 - times 8 db 0, 0 - - times 8 db -2, 58 - times 8 db 10, -2 - - times 8 db -4, 54 - times 8 db 16, -2 - - times 8 db -6, 46 - times 8 db 28, -4 - - times 8 db -4, 36 - times 8 db 36, -4 - - times 8 db -4, 28 - times 8 db 46, -6 - - times 8 db -2, 16 - times 8 db 54, -4 - - times 8 db -2, 10 - times 8 db 58, -2 - -const tab_ChromaCoeffV, times 4 dw 0, 64 - times 4 dw 0, 0 - - times 4 dw -2, 58 - times 4 dw 10, -2 - - times 4 dw -4, 54 - times 4 dw 16, -2 - - times 4 dw -6, 46 - times 4 dw 28, -4 - - times 4 dw -4, 36 - times 4 dw 36, -4 - - times 4 dw -4, 28 - times 4 dw 46, -6 - - times 4 dw -2, 16 - times 4 dw 54, -4 - - times 4 dw -2, 10 - times 4 dw 58, -2 - -const pw_ChromaCoeffV, times 8 dw 0, 64 - times 8 dw 0, 0 - - times 8 dw -2, 58 - times 8 dw 10, -2 - - times 8 dw -4, 54 - times 8 dw 16, -2 - - times 8 dw -6, 46 - times 8 dw 28, -4 - - times 8 dw -4, 36 - times 8 dw 36, -4 - - times 8 dw -4, 28 - times 8 dw 46, -6 - - times 8 dw -2, 16 - times 8 dw 54, -4 - - times 8 dw -2, 10 - times 8 dw 58, -2 - const tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0 db -1, 4, -10, 58, 17, -5, 1, 0 db -1, 4, -11, 40, 40, -11, 4, -1 @@ -231,30 +133,6 @@ const tab_LumaCoeffVer_32, times 16 db 0, 0 times 16 db 58, -10 times 16 db 4, -1 -const tab_ChromaCoeffVer_32, times 16 db 0, 64 - times 16 db 0, 0 - - times 16 db -2, 58 - times 16 db 10, -2 - - times 16 db -4, 54 - times 16 db 16, -2 - - times 16 db -6, 46 - times 16 db 28, -4 - - times 16 db -4, 36 - times 16 db 36, -4 - - times 16 db -4, 28 - times 16 db 46, -6 - - times 16 db -2, 16 - times 16 db 54, -4 - - times 16 db -2, 10 - times 16 db 58, -2 - const tab_c_64_n64, times 8 db 64, -64 
const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 @@ -723,4416 +601,2567 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 8, 11 FILTER_VER_LUMA_sse2 64, 64, ps %endif -%macro WORD_TO_DOUBLE 1 -%if ARCH_X86_64 - punpcklbw %1, m8 -%else - punpcklbw %1, %1 - psrlw %1, 8 -%endif -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W2_H4_sse2 2 -INIT_XMM sse2 -%if ARCH_X86_64 -cglobal interp_4tap_vert_%1_2x%2, 4, 6, 9 - pxor m8, m8 -%else -cglobal interp_4tap_vert_%1_2x%2, 4, 6, 8 -%endif - mov r4d, r4m - sub r0, r1 - -%ifidn %1,pp - mova m1, [pw_32] -%elifidn %1,ps - mova m1, [pw_2000] - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movh m0, [r5 + r4 * 8] -%else - movh m0, [tabw_ChromaCoeff + r4 * 8] -%endif - - punpcklqdq m0, m0 - lea r5, [3 * r1] - -%assign x 1 -%rep %2/4 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklwd m2, m6 - - WORD_TO_DOUBLE m2 - pmaddwd m2, m0 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklwd m3, m7 - - WORD_TO_DOUBLE m3 - pmaddwd m3, m0 - - packssdw m2, m3 - pshuflw m3, m2, q2301 - pshufhw m3, m3, q2301 - paddw m2, m3 - - movd m7, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklwd m4, m3 - - WORD_TO_DOUBLE m4 - pmaddwd m4, m0 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklwd m5, m7 - - WORD_TO_DOUBLE m5 - pmaddwd m5, m0 - - packssdw m4, m5 - pshuflw m5, m4, q2301 - pshufhw m5, m5, q2301 - paddw m4, m5 - -%ifidn %1,pp - psrld m2, 16 - psrld m4, 16 - packssdw m2, m4 - paddw m2, m1 - psraw m2, 6 - packuswb m2, m2 - -%if ARCH_X86_64 - movq r4, m2 - mov [r2], r4w - shr r4, 16 - mov [r2 + r3], r4w - lea r2, [r2 + 2 
* r3] - shr r4, 16 - mov [r2], r4w - shr r4, 16 - mov [r2 + r3], r4w -%else - movd r4, m2 - mov [r2], r4w - shr r4, 16 - mov [r2 + r3], r4w - lea r2, [r2 + 2 * r3] - psrldq m2, 4 - movd r4, m2 - mov [r2], r4w - shr r4, 16 - mov [r2 + r3], r4w -%endif -%elifidn %1,ps - psrldq m2, 2 - psrldq m4, 2 - pshufd m2, m2, q3120 - pshufd m4, m4, q3120 - psubw m4, m1 - psubw m2, m1 - - movd [r2], m2 - psrldq m2, 4 - movd [r2 + r3], m2 - lea r2, [r2 + 2 * r3] - movd [r2], m4 - psrldq m4, 4 - movd [r2 + r3], m4 -%endif - -%if x < %2/4 - lea r2, [r2 + 2 * r3] -%endif -%assign x x+1 -%endrep - RET +%macro FILTER_P2S_2_4_sse2 1 + movd m2, [r0 + %1] + movd m3, [r0 + r1 + %1] + punpcklwd m2, m3 + movd m3, [r0 + r1 * 2 + %1] + movd m4, [r0 + r4 + %1] + punpcklwd m3, m4 + punpckldq m2, m3 + punpcklbw m2, m0 + psllw m2, 6 + psubw m2, m1 + movd [r2 + r3 * 0 + %1 * 2], m2 + psrldq m2, 4 + movd [r2 + r3 * 1 + %1 * 2], m2 + psrldq m2, 4 + movd [r2 + r3 * 2 + %1 * 2], m2 + psrldq m2, 4 + movd [r2 + r5 + %1 * 2], m2 %endmacro - FILTER_V4_W2_H4_sse2 pp, 4 - FILTER_V4_W2_H4_sse2 pp, 8 - FILTER_V4_W2_H4_sse2 pp, 16 - - FILTER_V4_W2_H4_sse2 ps, 4 - FILTER_V4_W2_H4_sse2 ps, 8 - FILTER_V4_W2_H4_sse2 ps, 16 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V2_W4_H4_sse2 1 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 8 - mov r4d, r4m - sub r0, r1 - pxor m7, m7 - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movh m0, [r5 + r4 * 8] -%else - movh m0, [tabw_ChromaCoeff + r4 * 8] -%endif +%macro FILTER_P2S_4_4_sse2 1 + movd m2, [r0 + %1] + movd m3, [r0 + r1 + %1] + movd m4, [r0 + r1 * 2 + %1] + movd m5, [r0 + r4 + %1] + punpckldq m2, m3 + punpcklbw m2, m0 + punpckldq m4, m5 + punpcklbw m4, m0 + psllw m2, 6 + psllw m4, 6 + psubw m2, m1 + psubw m4, m1 + movh [r2 + r3 
* 0 + %1 * 2], m2 + movh [r2 + r3 * 2 + %1 * 2], m4 + movhps [r2 + r3 * 1 + %1 * 2], m2 + movhps [r2 + r5 + %1 * 2], m4 +%endmacro - lea r5, [r0 + 2 * r1] - punpcklqdq m0, m0 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r5] - movd m5, [r5 + r1] - - punpcklbw m2, m3 - punpcklbw m1, m4, m5 - punpcklwd m2, m1 - - movhlps m6, m2 - punpcklbw m2, m7 - punpcklbw m6, m7 - pmaddwd m2, m0 - pmaddwd m6, m0 - packssdw m2, m6 - - movd m1, [r0 + 4 * r1] - - punpcklbw m3, m4 - punpcklbw m5, m1 - punpcklwd m3, m5 - - movhlps m6, m3 - punpcklbw m3, m7 - punpcklbw m6, m7 - pmaddwd m3, m0 - pmaddwd m6, m0 - packssdw m3, m6 - - pshuflw m4, m2, q2301 - pshufhw m4, m4, q2301 - paddw m2, m4 - pshuflw m5, m3, q2301 - pshufhw m5, m5, q2301 - paddw m3, m5 - -%ifidn %1, pp - psrld m2, 16 - psrld m3, 16 - packssdw m2, m3 - - paddw m2, [pw_32] - psraw m2, 6 - packuswb m2, m2 - - movd [r2], m2 - psrldq m2, 4 - movd [r2 + r3], m2 -%elifidn %1, ps - psrldq m2, 2 - psrldq m3, 2 - pshufd m2, m2, q3120 - pshufd m3, m3, q3120 - punpcklqdq m2, m3 - - add r3d, r3d - psubw m2, [pw_2000] - movh [r2], m2 - movhps [r2 + r3], m2 -%endif - RET +%macro FILTER_P2S_4_2_sse2 0 + movd m2, [r0] + movd m3, [r0 + r1] + punpckldq m2, m3 + punpcklbw m2, m0 + psllw m2, 6 + psubw m2, [pw_8192] + movh [r2], m2 + movhps [r2 + r3 * 2], m2 +%endmacro +%macro FILTER_P2S_8_4_sse2 1 + movh m2, [r0 + %1] + movh m3, [r0 + r1 + %1] + movh m4, [r0 + r1 * 2 + %1] + movh m5, [r0 + r4 + %1] + punpcklbw m2, m0 + punpcklbw m3, m0 + punpcklbw m5, m0 + punpcklbw m4, m0 + psllw m2, 6 + psllw m3, 6 + psllw m5, 6 + psllw m4, 6 + psubw m2, m1 + psubw m3, m1 + psubw m4, m1 + psubw m5, m1 + movu [r2 + r3 * 0 + %1 * 2], m2 + movu [r2 + r3 * 1 + %1 * 2], m3 + movu [r2 + r3 * 2 + %1 * 2], m4 + movu [r2 + r5 + %1 * 2], m5 %endmacro - FILTER_V2_W4_H4_sse2 pp - FILTER_V2_W4_H4_sse2 ps +%macro FILTER_P2S_8_2_sse2 1 + movh m2, [r0 + %1] + movh m3, [r0 + r1 + %1] + punpcklbw m2, m0 + punpcklbw m3, m0 + psllw m2, 6 + psllw m3, 6 + psubw m2, m1 + 
psubw m3, m1 + movu [r2 + r3 * 0 + %1 * 2], m2 + movu [r2 + r3 * 1 + %1 * 2], m3 +%endmacro ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W4_H4_sse2 2 +%macro FILTER_PIX_TO_SHORT_sse2 2 INIT_XMM sse2 -%if ARCH_X86_64 -cglobal interp_4tap_vert_%1_4x%2, 4, 6, 9 - pxor m8, m8 -%else -cglobal interp_4tap_vert_%1_4x%2, 4, 6, 8 +cglobal filterPixelToShort_%1x%2, 4, 6, 6 + pxor m0, m0 +%if %2 == 2 +%if %1 == 4 + FILTER_P2S_4_2_sse2 +%elif %1 == 8 + add r3d, r3d + mova m1, [pw_8192] + FILTER_P2S_8_2_sse2 0 %endif - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tabw_ChromaCoeff] - movh m0, [r5 + r4 * 8] %else - movh m0, [tabw_ChromaCoeff + r4 * 8] -%endif - -%ifidn %1,pp - mova m1, [pw_32] -%elifidn %1,ps - add r3d, r3d - mova m1, [pw_2000] -%endif - - lea r5, [3 * r1] - lea r4, [3 * r3] - punpcklqdq m0, m0 - -%assign x 1 + add r3d, r3d + mova m1, [pw_8192] + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%assign y 1 %rep %2/4 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklwd m2, m6 - - movhlps m6, m2 - WORD_TO_DOUBLE m2 - WORD_TO_DOUBLE m6 - pmaddwd m2, m0 - pmaddwd m6, m0 - packssdw m2, m6 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklwd m3, m7 - - movhlps m7, m3 - WORD_TO_DOUBLE m3 - WORD_TO_DOUBLE m7 - pmaddwd m3, m0 - pmaddwd m7, m0 - packssdw m3, m7 - - pshuflw m7, m2, q2301 - pshufhw m7, m7, q2301 - paddw m2, m7 - pshuflw m7, m3, q2301 - pshufhw m7, m7, q2301 - paddw m3, m7 - -%ifidn %1,pp - psrld m2, 16 - psrld m3, 16 - packssdw m2, m3 - paddw m2, m1 - psraw m2, 6 -%elifidn %1,ps - psrldq m2, 2 - psrldq 
m3, 2 - pshufd m2, m2, q3120 - pshufd m3, m3, q3120 - punpcklqdq m2, m3 - - psubw m2, m1 - movh [r2], m2 - movhps [r2 + r3], m2 -%endif - - movd m7, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklwd m4, m3 - - movhlps m3, m4 - WORD_TO_DOUBLE m4 - WORD_TO_DOUBLE m3 - pmaddwd m4, m0 - pmaddwd m3, m0 - packssdw m4, m3 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklwd m5, m7 - - movhlps m3, m5 - WORD_TO_DOUBLE m5 - WORD_TO_DOUBLE m3 - pmaddwd m5, m0 - pmaddwd m3, m0 - packssdw m5, m3 - - pshuflw m7, m4, q2301 - pshufhw m7, m7, q2301 - paddw m4, m7 - pshuflw m7, m5, q2301 - pshufhw m7, m7, q2301 - paddw m5, m7 - -%ifidn %1,pp - psrld m4, 16 - psrld m5, 16 - packssdw m4, m5 - - paddw m4, m1 - psraw m4, 6 - packuswb m2, m4 - - movd [r2], m2 - psrldq m2, 4 - movd [r2 + r3], m2 - psrldq m2, 4 - movd [r2 + 2 * r3], m2 - psrldq m2, 4 - movd [r2 + r4], m2 -%elifidn %1,ps - psrldq m4, 2 - psrldq m5, 2 - pshufd m4, m4, q3120 - pshufd m5, m5, q3120 - punpcklqdq m4, m5 - psubw m4, m1 - movh [r2 + 2 * r3], m4 - movhps [r2 + r4], m4 -%endif - -%if x < %2/4 - lea r2, [r2 + 4 * r3] +%assign x 0 +%rep %1/8 + FILTER_P2S_8_4_sse2 x +%if %2 == 6 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + FILTER_P2S_8_2_sse2 x %endif - -%assign x x+1 +%assign x x+8 %endrep - RET - -%endmacro - - FILTER_V4_W4_H4_sse2 pp, 4 - FILTER_V4_W4_H4_sse2 pp, 8 - FILTER_V4_W4_H4_sse2 pp, 16 - FILTER_V4_W4_H4_sse2 pp, 32 - - FILTER_V4_W4_H4_sse2 ps, 4 - FILTER_V4_W4_H4_sse2 ps, 8 - FILTER_V4_W4_H4_sse2 ps, 16 - FILTER_V4_W4_H4_sse2 ps, 32 - -;----------------------------------------------------------------------------- -;void interp_4tap_vert_%1_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W6_H4_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_6x%2, 4, 7, 10 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifdef PIC - lea 
r5, [tab_ChromaCoeffV] - mova m6, [r5 + r4] - mova m5, [r5 + r4 + 16] -%else - mova m6, [tab_ChromaCoeffV + r4] - mova m5, [tab_ChromaCoeffV + r4 + 16] +%rep (%1 % 8)/4 + FILTER_P2S_4_4_sse2 x +%assign x x+4 +%endrep +%rep (%1 % 4)/2 + FILTER_P2S_2_4_sse2 x +%endrep +%if y < %2/4 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%assign y y+1 %endif - -%ifidn %1,pp - mova m4, [pw_32] -%elifidn %1,ps - mova m4, [pw_2000] - add r3d, r3d +%endrep %endif - lea r5, [3 * r1] - -%assign x 1 -%rep %2/4 - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - movhlps m7, m0 - punpcklbw m0, m9 - punpcklbw m7, m9 - pmaddwd m0, m6 - pmaddwd m7, m6 - packssdw m0, m7 - - movhlps m8, m2 - movq m7, m2 - punpcklbw m8, m9 - punpcklbw m7, m9 - pmaddwd m8, m5 - pmaddwd m7, m5 - packssdw m7, m8 - - paddw m0, m7 +RET +%endmacro -%ifidn %1,pp - paddw m0, m4 - psraw m0, 6 - packuswb m0, m0 - - movd [r2], m0 - pextrw r6d, m0, 2 - mov [r2 + 4], r6w -%elifidn %1,ps - psubw m0, m4 - movh [r2], m0 - pshufd m0, m0, 2 - movd [r2 + 8], m0 -%endif + FILTER_PIX_TO_SHORT_sse2 2, 4 + FILTER_PIX_TO_SHORT_sse2 2, 8 + FILTER_PIX_TO_SHORT_sse2 2, 16 + FILTER_PIX_TO_SHORT_sse2 4, 2 + FILTER_PIX_TO_SHORT_sse2 4, 4 + FILTER_PIX_TO_SHORT_sse2 4, 8 + FILTER_PIX_TO_SHORT_sse2 4, 16 + FILTER_PIX_TO_SHORT_sse2 4, 32 + FILTER_PIX_TO_SHORT_sse2 6, 8 + FILTER_PIX_TO_SHORT_sse2 6, 16 + FILTER_PIX_TO_SHORT_sse2 8, 2 + FILTER_PIX_TO_SHORT_sse2 8, 4 + FILTER_PIX_TO_SHORT_sse2 8, 6 + FILTER_PIX_TO_SHORT_sse2 8, 8 + FILTER_PIX_TO_SHORT_sse2 8, 12 + FILTER_PIX_TO_SHORT_sse2 8, 16 + FILTER_PIX_TO_SHORT_sse2 8, 32 + FILTER_PIX_TO_SHORT_sse2 8, 64 + FILTER_PIX_TO_SHORT_sse2 12, 16 + FILTER_PIX_TO_SHORT_sse2 12, 32 + FILTER_PIX_TO_SHORT_sse2 16, 4 + FILTER_PIX_TO_SHORT_sse2 16, 8 + FILTER_PIX_TO_SHORT_sse2 16, 12 + FILTER_PIX_TO_SHORT_sse2 16, 16 + FILTER_PIX_TO_SHORT_sse2 16, 24 + FILTER_PIX_TO_SHORT_sse2 16, 32 + FILTER_PIX_TO_SHORT_sse2 
16, 64 + FILTER_PIX_TO_SHORT_sse2 24, 32 + FILTER_PIX_TO_SHORT_sse2 24, 64 + FILTER_PIX_TO_SHORT_sse2 32, 8 + FILTER_PIX_TO_SHORT_sse2 32, 16 + FILTER_PIX_TO_SHORT_sse2 32, 24 + FILTER_PIX_TO_SHORT_sse2 32, 32 + FILTER_PIX_TO_SHORT_sse2 32, 48 + FILTER_PIX_TO_SHORT_sse2 32, 64 + FILTER_PIX_TO_SHORT_sse2 48, 64 + FILTER_PIX_TO_SHORT_sse2 64, 16 + FILTER_PIX_TO_SHORT_sse2 64, 32 + FILTER_PIX_TO_SHORT_sse2 64, 48 + FILTER_PIX_TO_SHORT_sse2 64, 64 + +%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst + movu %1, %7 + pshufb %2, %1, [tab_Lm + 0] + pmaddubsw %2, %5 + pshufb %3, %1, [tab_Lm + 16] + pmaddubsw %3, %5 + phaddw %2, %3 + pshufb %4, %1, [tab_Lm + 32] + pmaddubsw %4, %5 + pshufb %1, %1, [tab_Lm + 48] + pmaddubsw %1, %5 + phaddw %4, %1 + phaddw %2, %4 + %if %0 == 8 + pmulhrsw %2, %6 + packuswb %2, %2 + movh %8, %2 + %endif +%endmacro + +;----------------------------------------------------------------------------- +; Interpolate HV +;----------------------------------------------------------------------------- +%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] + mova %5, [r0 + (%6 + 0) * 16] + mova %1, [r0 + (%6 + 1) * 16] + mova %2, [r0 + (%6 + 2) * 16] + punpcklwd %3, %5, %1 + punpckhwd %5, %1 + pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 + pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] + punpcklwd %4, %1, %2 + punpckhwd %1, %2 + pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 + pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] +%endmacro ; FILTER_HV8_START - lea r0, [r0 + 4 * r1] +%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] + mova %8, [r0 + (%9 + 0) * 16] + mova %1, [r0 + (%9 + 1) * 16] + punpcklwd %7, %2, %8 + punpckhwd %2, %8 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd %2, [r5 + %10 * 16] + paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 + paddd %5, %2 ; R0 = H[0+1+2+3] + punpcklwd %7, %8, %1 + punpckhwd %8, %1 + pmaddwd %7, [r5 + %10 * 16] + pmaddwd 
%8, [r5 + %10 * 16] + paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 + paddd %6, %8 ; R1 = H[1+2+3+4] +%endmacro ; FILTER_HV8_MID - movq m0, [r0] - punpcklbw m3, m0 +; Round and Saturate +%macro FILTER_HV8_END 4 ; output in [1, 3] + paddd %1, [pd_526336] + paddd %2, [pd_526336] + paddd %3, [pd_526336] + paddd %4, [pd_526336] + psrad %1, 12 + psrad %2, 12 + psrad %3, 12 + psrad %4, 12 + packssdw %1, %2 + packssdw %3, %4 - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m6 - pmaddwd m8, m6 - packssdw m1, m8 + ; TODO: is merge better? I think this way is short dependency link + packuswb %1, %3 +%endmacro ; FILTER_HV8_END - movhlps m8, m3 - movq m7, m3 - punpcklbw m8, m9 - punpcklbw m7, m9 - pmaddwd m8, m5 - pmaddwd m7, m5 - packssdw m7, m8 +;----------------------------------------------------------------------------- +; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 +%define coef m7 +%define stk_buf rsp - paddw m1, m7 + mov r4d, r4m + mov r5d, r5m -%ifidn %1,pp - paddw m1, m4 - psraw m1, 6 - packuswb m1, m1 - - movd [r2 + r3], m1 - pextrw r6d, m1, 2 - mov [r2 + r3 + 4], r6w -%elifidn %1,ps - psubw m1, m4 - movh [r2 + r3], m1 - pshufd m1, m1, 2 - movd [r2 + r3 + 8], m1 +%ifdef PIC + lea r6, [tab_LumaCoeff] + movh coef, [r6 + r4 * 8] +%else + movh coef, [tab_LumaCoeff + r4 * 8] %endif + punpcklqdq coef, coef + + ; move to row -3 + lea r6, [r1 + r1 * 2] + sub r0, r6 - movq m1, [r0 + r1] - punpcklbw m7, m0, m1 + xor r6, r6 + mov r4, rsp - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m6 - pmaddwd m8, m6 - packssdw m2, m8 +.loopH: + FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3] + psubw m1, [pw_2000] + mova [r4], m1 - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 + 
add r0, r1 + add r4, 16 + inc r6 + cmp r6, 8+7 + jnz .loopH - paddw m2, m7 - lea r2, [r2 + 2 * r3] + ; ready to phase V + ; Here all of mN is free -%ifidn %1,pp - paddw m2, m4 - psraw m2, 6 - packuswb m2, m2 - movd [r2], m2 - pextrw r6d, m2, 2 - mov [r2 + 4], r6w -%elifidn %1,ps - psubw m2, m4 - movh [r2], m2 - pshufd m2, m2, 2 - movd [r2 + 8], m2 -%endif + ; load coeff table + shl r5, 6 + lea r6, [tab_LumaCoeffV] + lea r5, [r5 + r6] - movq m2, [r0 + 2 * r1] - punpcklbw m1, m2 + ; load intermedia buffer + mov r0, stk_buf - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m6 - pmaddwd m8, m6 - packssdw m3, m8 + ; register mapping + ; r0 - src + ; r5 - coeff + ; r6 - loop_i - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m5 - pmaddwd m8, m5 - packssdw m1, m8 + ; let's go + xor r6, r6 - paddw m3, m1 + ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache +.loopV: -%ifidn %1,pp - paddw m3, m4 - psraw m3, 6 - packuswb m3, m3 - - movd [r2 + r3], m3 - pextrw r6d, m3, 2 - mov [r2 + r3 + 4], r6w -%elifidn %1,ps - psubw m3, m4 - movh [r2 + r3], m3 - pshufd m3, m3, 2 - movd [r2 + r3 + 8], m3 -%endif + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 + FILTER_HV8_END m3, m0, m4, m1 -%if x < %2/4 - lea r2, [r2 + 2 * r3] -%endif + movh [r2], m3 + movhps [r2 + r3], m3 -%assign x x+1 -%endrep - RET + lea r0, [r0 + 16 * 2] + lea r2, [r2 + r3 * 2] -%endmacro + inc r6 + cmp r6, 8/2 + jnz .loopV -%if ARCH_X86_64 - FILTER_V4_W6_H4_sse2 pp, 8 - FILTER_V4_W6_H4_sse2 pp, 16 - FILTER_V4_W6_H4_sse2 ps, 8 - FILTER_V4_W6_H4_sse2 ps, 16 -%endif + RET ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void 
interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W8_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_8x%2, 4, 7, 12 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifidn %1,pp - mova m4, [pw_32] -%elifidn %1,ps - mova m4, [pw_2000] - add r3d, r3d -%endif +INIT_XMM sse3 +cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 + mov r4d, r4m + mov r5d, r5m + add r4d, r4d + pxor m6, m6 %ifdef PIC - lea r6, [tab_ChromaCoeffV] - mova m6, [r6 + r4] - mova m5, [r6 + r4 + 16] + lea r6, [tabw_LumaCoeff] + mova m3, [r6 + r4 * 8] %else - mova m6, [tab_ChromaCoeffV + r4] - mova m5, [tab_ChromaCoeffV + r4 + 16] + mova m3, [tabw_LumaCoeff + r4 * 8] %endif - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] - - punpcklbw m0, m1 - punpcklbw m7, m2, m3 - - movhlps m8, m0 - punpcklbw m0, m9 - punpcklbw m8, m9 - pmaddwd m0, m6 - pmaddwd m8, m6 - packssdw m0, m8 + ; move to row -3 + lea r6, [r1 + r1 * 2] + sub r0, r6 - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 + mov r4, rsp - paddw m0, m7 +%assign x 0 ;needed for FILTER_H8_W8_sse2 macro +%assign y 1 +%rep 15 + FILTER_H8_W8_sse2 + psubw m1, [pw_2000] + mova [r4], m1 -%ifidn %1,pp - paddw m0, m4 - psraw m0, 6 -%elifidn %1,ps - psubw m0, m4 - movu [r2], m0 +%if y < 15 + add r0, r1 + add r4, 16 %endif +%assign y y+1 +%endrep + + ; ready to phase V + ; Here all of mN is free - movq m11, [r0 + 4 * r1] + ; load coeff table + shl r5, 6 + lea r6, [tab_LumaCoeffV] + lea r5, [r5 + r6] - punpcklbw m1, m2 - punpcklbw m7, m3, m11 + ; load intermedia buffer + mov r0, rsp - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m6 - pmaddwd m8, m6 - packssdw m1, m8 + ; register mapping + ; r0 - src + ; r5 - coeff - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, 
m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 + ; let's go +%assign y 1 +%rep 4 + FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 + FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 + FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 + FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 + FILTER_HV8_END m3, m0, m4, m1 - paddw m1, m7 + movh [r2], m3 + movhps [r2 + r3], m3 -%ifidn %1,pp - paddw m1, m4 - psraw m1, 6 - packuswb m1, m0 - - movhps [r2], m1 - movh [r2 + r3], m1 -%elifidn %1,ps - psubw m1, m4 - movu [r2 + r3], m1 +%if y < 4 + lea r0, [r0 + 16 * 2] + lea r2, [r2 + r3 * 2] %endif -%if %2 == 2 ;end of 8x2 +%assign y y+1 +%endrep RET -%else - lea r6, [r0 + 4 * r1] - movq m1, [r6 + r1] - - punpcklbw m2, m3 - punpcklbw m7, m11, m1 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m6 - pmaddwd m8, m6 - packssdw m2, m8 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_2xN 1 +INIT_XMM sse4 +cglobal filterPixelToShort_2x%1, 3, 4, 3 + mov r3d, r3m + add r3d, r3d - paddw m2, m7 + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] -%ifidn %1,pp - paddw m2, m4 - psraw m2, 6 -%elifidn %1,ps - psubw m2, m4 - movu [r2 + 2 * r3], m2 -%endif - - movq m10, [r6 + 2 * r1] - - punpcklbw m3, m11 - punpcklbw m7, m1, m10 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m6 - pmaddwd m8, m6 - packssdw m3, m8 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 +%rep %1/2 + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m1 + pmaddubsw m0, m2 - paddw m3, m7 - lea r5, [r2 + 2 * r3] + movd [r2 + r3 * 0], m0 + pextrd [r2 + r3 * 1], m0, 2 -%ifidn 
%1,pp - paddw m3, m4 - psraw m3, 6 - packuswb m3, m2 - - movhps [r2 + 2 * r3], m3 - movh [r5 + r3], m3 -%elifidn %1,ps - psubw m3, m4 - movu [r5 + r3], m3 -%endif -%if %2 == 4 ;end of 8x4 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] +%endrep RET +%endmacro + P2S_H_2xN 4 + P2S_H_2xN 8 + P2S_H_2xN 16 -%else - lea r6, [r6 + 2 * r1] - movq m3, [r6 + r1] - - punpcklbw m11, m1 - punpcklbw m7, m10, m3 - - movhlps m8, m11 - punpcklbw m11, m9 - punpcklbw m8, m9 - pmaddwd m11, m6 - pmaddwd m8, m6 - packssdw m11, m8 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m5 - pmaddwd m8, m5 - packssdw m7, m8 - - paddw m11, m7 - -%ifidn %1, pp - paddw m11, m4 - psraw m11, 6 -%elifidn %1,ps - psubw m11, m4 - movu [r2 + 4 * r3], m11 -%endif - - movq m7, [r0 + 8 * r1] - - punpcklbw m1, m10 - punpcklbw m3, m7 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_4xN 1 +INIT_XMM sse4 +cglobal filterPixelToShort_4x%1, 3, 6, 4 + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m6 - pmaddwd m8, m6 - packssdw m1, m8 + ; load constant + mova m2, [pb_128] + mova m3, [tab_c_64_n64] - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m5 - pmaddwd m8, m5 - packssdw m3, m8 +%assign x 0 +%rep %1/4 + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m2 + pmaddubsw m0, m3 - paddw m1, m3 - lea r5, [r2 + 4 * r3] + movd m1, [r0 + r1 * 2] + pinsrd m1, [r0 + r5], 1 + punpcklbw m1, m2 + pmaddubsw m1, m3 -%ifidn %1,pp - paddw m1, m4 - psraw m1, 6 - packuswb m1, m11 - - movhps [r2 + 4 * r3], m1 - movh [r5 + r3], m1 -%elifidn %1,ps - psubw m1, m4 - movu [r5 + r3], m1 + movq [r2 + r3 * 0], m0 + movq [r2 + r3 * 2], m1 + movhps [r2 + r3 * 1], m0 + movhps [r2 + r4], m1 
+%assign x x+1 +%if (x != %1/4) + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] %endif -%if %2 == 6 +%endrep RET - -%else - %error INVALID macro argument, only 2, 4 or 6! -%endif -%endif -%endif %endmacro - -%if ARCH_X86_64 - FILTER_V4_W8_sse2 pp, 2 - FILTER_V4_W8_sse2 pp, 4 - FILTER_V4_W8_sse2 pp, 6 - FILTER_V4_W8_sse2 ps, 2 - FILTER_V4_W8_sse2 ps, 4 - FILTER_V4_W8_sse2 ps, 6 -%endif + P2S_H_4xN 4 + P2S_H_4xN 8 + P2S_H_4xN 16 + P2S_H_4xN 32 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W8_H8_H16_H32_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_8x%2, 4, 6, 11 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m6, [r5 + r4] - mova m5, [r5 + r4 + 16] -%else - mova m6, [tab_ChromaCoeff + r4] - mova m5, [tab_ChromaCoeff + r4 + 16] -%endif - -%ifidn %1,pp - mova m4, [pw_32] -%elifidn %1,ps - mova m4, [pw_2000] - add r3d, r3d -%endif +%macro P2S_H_6xN 1 +INIT_XMM sse4 +cglobal filterPixelToShort_6x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] - lea r5, [r1 * 3] + ; load height + mov r6d, %1/4 -%assign x 1 -%rep %2/4 - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - movhlps m7, m0 - punpcklbw m0, m9 - punpcklbw m7, m9 - pmaddwd m0, m6 - pmaddwd m7, m6 - packssdw m0, m7 - - movhlps m8, m2 - movq m7, m2 - punpcklbw m8, m9 - punpcklbw m7, m9 - pmaddwd m8, m5 - pmaddwd m7, m5 - packssdw m7, m8 - - paddw m0, m7 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] -%ifidn %1,pp - paddw m0, m4 - psraw m0, 6 -%elifidn %1,ps - psubw m0, m4 - movu [r2], m0 
-%endif +.loop: + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - lea r0, [r0 + 4 * r1] - movq m10, [r0] - punpcklbw m3, m10 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m6 - pmaddwd m8, m6 - packssdw m1, m8 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movhlps m8, m3 - movq m7, m3 - punpcklbw m8, m9 - punpcklbw m7, m9 - pmaddwd m8, m5 - pmaddwd m7, m5 - packssdw m7, m8 + movh m3, [r0 + r4] + punpcklbw m3, m4 + pmaddubsw m3, m5 - paddw m1, m7 + movh [r2 + r3 * 0], m0 + pextrd [r2 + r3 * 0 + 8], m0, 2 + movh [r2 + r3 * 1], m1 + pextrd [r2 + r3 * 1 + 8], m1, 2 + movh [r2 + r3 * 2], m2 + pextrd [r2 + r3 * 2 + 8], m2, 2 + movh [r2 + r5], m3 + pextrd [r2 + r5 + 8], m3, 2 -%ifidn %1,pp - paddw m1, m4 - psraw m1, 6 - - packuswb m0, m1 - movh [r2], m0 - movhps [r2 + r3], m0 -%elifidn %1,ps - psubw m1, m4 - movu [r2 + r3], m1 -%endif + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - movq m1, [r0 + r1] - punpcklbw m10, m1 + dec r6d + jnz .loop + RET +%endmacro + P2S_H_6xN 8 + P2S_H_6xN 16 - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m6 - pmaddwd m8, m6 - packssdw m2, m8 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_8xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_8x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m5 - pmaddwd m8, m5 - packssdw m10, m8 + ; load height + mov r4d, %1/4 - paddw m2, m10 - lea r2, [r2 + 2 * r3] + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] -%ifidn %1,pp - paddw m2, m4 - psraw m2, 6 -%elifidn %1,ps - psubw m2, m4 - movu [r2], m2 -%endif +.loop: + movh m0, [r0] + punpcklbw m0, m4 + 
pmaddubsw m0, m5 - movq m7, [r0 + 2 * r1] - punpcklbw m1, m7 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m6 - pmaddwd m8, m6 - packssdw m3, m8 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movhlps m8, m1 - punpcklbw m1, m9 - punpcklbw m8, m9 - pmaddwd m1, m5 - pmaddwd m8, m5 - packssdw m1, m8 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - paddw m3, m1 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6 ], m3 -%ifidn %1,pp - paddw m3, m4 - psraw m3, 6 - - packuswb m2, m3 - movh [r2], m2 - movhps [r2 + r3], m2 -%elifidn %1,ps - psubw m3, m4 - movu [r2 + r3], m3 -%endif + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] -%if x < %2/4 - lea r2, [r2 + 2 * r3] -%endif -%endrep + dec r4d + jnz .loop RET %endmacro - -%if ARCH_X86_64 - FILTER_V4_W8_H8_H16_H32_sse2 pp, 8 - FILTER_V4_W8_H8_H16_H32_sse2 pp, 16 - FILTER_V4_W8_H8_H16_H32_sse2 pp, 32 - - FILTER_V4_W8_H8_H16_H32_sse2 pp, 12 - FILTER_V4_W8_H8_H16_H32_sse2 pp, 64 - - FILTER_V4_W8_H8_H16_H32_sse2 ps, 8 - FILTER_V4_W8_H8_H16_H32_sse2 ps, 16 - FILTER_V4_W8_H8_H16_H32_sse2 ps, 32 - - FILTER_V4_W8_H8_H16_H32_sse2 ps, 12 - FILTER_V4_W8_H8_H16_H32_sse2 ps, 64 -%endif + P2S_H_8xN 8 + P2S_H_8xN 4 + P2S_H_8xN 16 + P2S_H_8xN 32 + P2S_H_8xN 12 + P2S_H_8xN 64 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W12_H2_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_12x%2, 4, 6, 11 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 +INIT_XMM ssse3 +cglobal filterPixelToShort_8x6, 3, 7, 5 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r1 * 
5] + lea r6, [r3 * 3] -%ifidn %1,pp - mova m6, [pw_32] -%elifidn %1,ps - mova m6, [pw_2000] - add r3d, r3d -%endif + ; load constant + mova m3, [pb_128] + mova m4, [tab_c_64_n64] -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m1, [r5 + r4] - mova m0, [r5 + r4 + 16] -%else - mova m1, [tab_ChromaCoeffV + r4] - mova m0, [tab_ChromaCoeffV + r4 + 16] -%endif + movh m0, [r0] + punpcklbw m0, m3 + pmaddubsw m0, m4 -%assign x 1 -%rep %2/2 - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m10, m5, m7 - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - - paddw m4, m10 - - punpckhbw m10, m5, m7 - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - - paddw m2, m10 + movh m1, [r0 + r1] + punpcklbw m1, m3 + pmaddubsw m1, m4 -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m2, m6 - psraw m2, 6 - - packuswb m4, m2 - movh [r2], m4 - psrldq m4, 8 - movd [r2 + 8], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m2, m6 - movu [r2], m4 - movh [r2 + 16], m2 -%endif + movh m2, [r0 + r1 * 2] + punpcklbw m2, m3 + pmaddubsw m2, m4 - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m4 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m1 - pmaddwd m8, m1 - packssdw m3, m8 - - movu m5, [r0 + 2 * r1] - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m0 - pmaddwd m8, m0 - packssdw m2, m8 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - 
packssdw m7, m8 - - paddw m4, m2 - paddw m3, m7 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m3, m6 - psraw m3, 6 - - packuswb m4, m3 - movh [r2 + r3], m4 - psrldq m4, 8 - movd [r2 + r3 + 8], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m3, m6 - movu [r2 + r3], m4 - movh [r2 + r3 + 16], m3 -%endif + movh m0, [r0 + r4] + punpcklbw m0, m3 + pmaddubsw m0, m4 -%if x < %2/2 - lea r2, [r2 + 2 * r3] -%endif -%assign x x+1 -%endrep - RET + movh m1, [r0 + r1 * 4] + punpcklbw m1, m3 + pmaddubsw m1, m4 -%endmacro + movh m2, [r0 + r5] + punpcklbw m2, m3 + pmaddubsw m2, m4 -%if ARCH_X86_64 - FILTER_V4_W12_H2_sse2 pp, 16 - FILTER_V4_W12_H2_sse2 pp, 32 - FILTER_V4_W12_H2_sse2 ps, 16 - FILTER_V4_W12_H2_sse2 ps, 32 -%endif + movu [r2 + r6 ], m0 + movu [r2 + r3 * 4], m1 + lea r2, [r2 + r3 * 4] + movu [r2 + r3], m2 + + RET ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W16_H2_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_16x%2, 4, 6, 11 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifidn %1,pp - mova m6, [pw_32] -%elifidn %1,ps - mova m6, [pw_2000] - add r3d, r3d -%endif +%macro P2S_H_16xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_16x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m1, [r5 + r4] - mova m0, [r5 + r4 + 16] -%else - mova m1, [tab_ChromaCoeffV + r4] - mova m0, [tab_ChromaCoeffV + r4 + 16] -%endif + ; load height + mov r6d, %1/4 -%assign x 1 -%rep %2/2 - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 
- punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m10, [r0 + r1] - - punpckhbw m7, m5, m10 - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - paddw m2, m7 - - punpcklbw m7, m5, m10 - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - paddw m4, m7 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m2, m6 - psraw m2, 6 - - packuswb m4, m2 - movu [r2], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m2, m6 - movu [r2], m4 - movu [r2 + 16], m2 -%endif - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m1 - pmaddwd m8, m1 - packssdw m3, m8 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m10, m5 - punpckhbw m10, m5 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m0 - pmaddwd m8, m0 - packssdw m2, m8 - - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - - paddw m4, m2 - paddw m3, m10 - -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m3, m6 - psraw m3, 6 - - packuswb m4, m3 - movu [r2 + r3], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m3, m6 - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 -%endif - -%if x < %2/2 - lea r2, [r2 + 2 * r3] -%endif -%assign x x+1 -%endrep - RET - -%endmacro - -%if ARCH_X86_64 - FILTER_V4_W16_H2_sse2 pp, 4 - FILTER_V4_W16_H2_sse2 pp, 8 - FILTER_V4_W16_H2_sse2 pp, 12 - FILTER_V4_W16_H2_sse2 pp, 16 - FILTER_V4_W16_H2_sse2 pp, 32 - - FILTER_V4_W16_H2_sse2 pp, 24 - FILTER_V4_W16_H2_sse2 pp, 64 - - FILTER_V4_W16_H2_sse2 ps, 4 - FILTER_V4_W16_H2_sse2 ps, 8 - 
FILTER_V4_W16_H2_sse2 ps, 12 - FILTER_V4_W16_H2_sse2 ps, 16 - FILTER_V4_W16_H2_sse2 ps, 32 - - FILTER_V4_W16_H2_sse2 ps, 24 - FILTER_V4_W16_H2_sse2 ps, 64 -%endif - -;----------------------------------------------------------------------------- -;void interp_4tap_vert_%1_24%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W24_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_24x%2, 4, 6, 11 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 +.loop: + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 -%ifidn %1,pp - mova m6, [pw_32] -%elifidn %1,ps - mova m6, [pw_2000] - add r3d, r3d -%endif + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m1, [r5 + r4] - mova m0, [r5 + r4 + 16] -%else - mova m1, [tab_ChromaCoeffV + r4] - mova m0, [tab_ChromaCoeffV + r4 + 16] -%endif + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 -%assign x 1 -%rep %2/2 - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m10, [r5 + r1] - punpcklbw m7, m5, m10 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - paddw m4, m7 - - punpckhbw m7, m5, m10 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - - paddw m2, m7 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m2, m6 - psraw m2, 6 - - packuswb m4, m2 - movu [r2], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m2, m6 - movu [r2], m4 - movu [r2 + 16], m2 -%endif + movu [r2 + r3 * 
0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m1 - pmaddwd m8, m1 - packssdw m3, m8 - - movu m2, [r5 + 2 * r1] - - punpcklbw m5, m10, m2 - punpckhbw m10, m2 - - movhlps m8, m5 - punpcklbw m5, m9 - punpcklbw m8, m9 - pmaddwd m5, m0 - pmaddwd m8, m0 - packssdw m5, m8 - - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - - paddw m4, m5 - paddw m3, m10 + lea r0, [r0 + 8] -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m3, m6 - psraw m3, 6 - - packuswb m4, m3 - movu [r2 + r3], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m3, m6 - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 -%endif + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - movq m2, [r0 + 16] - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - punpcklbw m2, m3 - punpcklbw m4, m5 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m0 - pmaddwd m8, m0 - packssdw m4, m8 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 1 + 16], m1 + movu [r2 + r3 * 2 + 16], m2 + movu [r2 + r4 + 16], m3 - paddw m2, m4 + lea r0, [r0 + r1 * 4 - 8] + lea r2, [r2 + r3 * 4] -%ifidn %1,pp - paddw m2, m6 - psraw m2, 6 -%elifidn %1,ps - psubw m2, m6 - movu [r2 + 32], m2 -%endif + dec r6d + jnz .loop + RET +%endmacro + P2S_H_16xN 16 + P2S_H_16xN 4 + P2S_H_16xN 8 + P2S_H_16xN 12 + P2S_H_16xN 32 + P2S_H_16xN 64 + P2S_H_16xN 24 - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - movq m7, 
[r5 + 2 * r1 + 16] +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x4, 3, 4, 2 + mov r3d, r3m + add r3d, r3d - punpcklbw m3, m4 - punpcklbw m5, m7 + ; load constant + vbroadcasti128 m1, [pw_2000] - movhlps m8, m5 - punpcklbw m5, m9 - punpcklbw m8, m9 - pmaddwd m5, m0 - pmaddwd m8, m0 - packssdw m5, m8 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m1 - pmaddwd m8, m1 - packssdw m3, m8 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - paddw m3, m5 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%ifidn %1,pp - paddw m3, m6 - psraw m3, 6 - - packuswb m2, m3 - movh [r2 + 16], m2 - movhps [r2 + r3 + 16], m2 -%elifidn %1,ps - psubw m3, m6 - movu [r2 + r3 + 32], m3 -%endif + lea r1, [r1 * 3] + lea r3, [r3 * 3] -%if x < %2/2 - mov r0, r5 - lea r2, [r2 + 2 * r3] -%endif -%assign x x+1 -%endrep + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 RET -%endmacro - -%if ARCH_X86_64 - FILTER_V4_W24_sse2 pp, 32 - FILTER_V4_W24_sse2 pp, 64 - FILTER_V4_W24_sse2 ps, 32 - FILTER_V4_W24_sse2 ps, 64 -%endif - ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W32_sse2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_32x%2, 4, 6, 10 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifidn %1,pp - mova m6, [pw_32] -%elifidn %1,ps - mova m6, 
[pw_2000] - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m1, [r5 + r4] - mova m0, [r5 + r4 + 16] -%else - mova m1, [tab_ChromaCoeffV + r4] - mova m0, [tab_ChromaCoeffV + r4 + 16] -%endif - - mov r4d, %2 +INIT_YMM avx2 +cglobal filterPixelToShort_16x8, 3, 6, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - lea r5, [r0 + 2 * r1] - movu m3, [r5] - movu m5, [r5 + r1] - - punpcklbw m7, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m0 - pmaddwd m8, m0 - packssdw m3, m8 - - paddw m4, m7 - paddw m2, m3 + ; load constant + vbroadcasti128 m1, [pw_2000] -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m2, m6 - psraw m2, 6 - - packuswb m4, m2 - movu [r2], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m2, m6 - movu [r2], m4 - movu [r2 + 16], m2 -%endif + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - movu m2, [r0 + 16] - movu m3, [r0 + r1 + 16] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - movu m3, [r5 + 16] - movu m5, [r5 + r1 + 16] - - punpcklbw m7, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m7 - punpcklbw m7, m9 - punpcklbw m8, m9 - pmaddwd m7, m0 - pmaddwd m8, m0 - packssdw m7, m8 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m0 - pmaddwd m8, m0 - packssdw m3, m8 - - paddw m4, m7 - paddw m2, m3 + pmovzxbw m0, 
[r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%ifidn %1,pp - paddw m4, m6 - psraw m4, 6 - paddw m2, m6 - psraw m2, 6 - - packuswb m4, m2 - movu [r2 + 16], m4 -%elifidn %1,ps - psubw m4, m6 - psubw m2, m6 - movu [r2 + 32], m4 - movu [r2 + 48], m2 -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - lea r0, [r0 + r1] - lea r2, [r2 + r3] - dec r4 - jnz .loop - RET + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 -%endmacro + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] -%if ARCH_X86_64 - FILTER_V4_W32_sse2 pp, 8 - FILTER_V4_W32_sse2 pp, 16 - FILTER_V4_W32_sse2 pp, 24 - FILTER_V4_W32_sse2 pp, 32 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - FILTER_V4_W32_sse2 pp, 48 - FILTER_V4_W32_sse2 pp, 64 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - FILTER_V4_W32_sse2 ps, 8 - FILTER_V4_W32_sse2 ps, 16 - FILTER_V4_W32_sse2 ps, 24 - FILTER_V4_W32_sse2 ps, 32 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - FILTER_V4_W32_sse2 ps, 48 - FILTER_V4_W32_sse2 ps, 64 -%endif + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + RET ;----------------------------------------------------------------------------- -; void interp_4tap_vert_%1_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W16n_H2_sse2 3 -INIT_XMM sse2 -cglobal interp_4tap_vert_%1_%2x%3, 4, 7, 11 - mov r4d, r4m - sub r0, r1 - shl r4d, 5 - pxor m9, m9 - -%ifidn %1,pp - mova m7, [pw_32] -%elifidn %1,ps - mova m7, [pw_2000] - add r3d, r3d -%endif +INIT_YMM avx2 +cglobal filterPixelToShort_16x12, 3, 6, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - mova m1, 
[r5 + r4] - mova m0, [r5 + r4 + 16] -%else - mova m1, [tab_ChromaCoeffV + r4] - mova m0, [tab_ChromaCoeffV + r4 + 16] -%endif + ; load constant + vbroadcasti128 m1, [pw_2000] - mov r4d, %3/2 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -.loop: + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - mov r6d, %2/16 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -.loopW: + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m1 - pmaddwd m8, m1 - packssdw m2, m8 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m6, [r5 + r1] - - punpckhbw m10, m5, m6 - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - paddw m2, m10 - - punpcklbw m10, m5, m6 - movhlps m8, m10 - punpcklbw m10, m9 - punpcklbw m8, m9 - pmaddwd m10, m0 - pmaddwd m8, m0 - packssdw m10, m8 - paddw m4, m10 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] -%ifidn %1,pp - paddw m4, m7 - psraw m4, 6 - paddw m2, m7 - psraw m2, 6 - - packuswb m4, m2 - movu [r2], m4 -%elifidn %1,ps - psubw m4, m7 - psubw m2, m7 - movu [r2], m4 - movu [r2 + 16], m2 -%endif + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - movhlps m8, m4 - punpcklbw m4, m9 - punpcklbw m8, m9 - pmaddwd m4, m1 - pmaddwd m8, m1 - packssdw m4, m8 - - movhlps m8, m3 - punpcklbw m3, m9 - punpcklbw m8, m9 - pmaddwd m3, m1 - pmaddwd m8, m1 - packssdw m3, m8 - - movu m5, [r5 + 2 * r1] - - punpcklbw m2, m6, m5 - punpckhbw m6, m5 - - movhlps m8, m2 - punpcklbw m2, m9 - punpcklbw m8, m9 - pmaddwd m2, m0 - pmaddwd m8, m0 - packssdw m2, m8 - - movhlps m8, m6 - punpcklbw m6, m9 - punpcklbw m8, m9 
- pmaddwd m6, m0 - pmaddwd m8, m0 - packssdw m6, m8 - - paddw m4, m2 - paddw m3, m6 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%ifidn %1,pp - paddw m4, m7 - psraw m4, 6 - paddw m3, m7 - psraw m3, 6 - - packuswb m4, m3 - movu [r2 + r3], m4 - add r2, 16 -%elifidn %1,ps - psubw m4, m7 - psubw m3, m7 - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 - add r2, 32 -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - add r0, 16 - dec r6d - jnz .loopW + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - lea r0, [r0 + r1 * 2 - %2] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] -%ifidn %1,pp - lea r2, [r2 + r3 * 2 - %2] -%elifidn %1,ps - lea r2, [r2 + r3 * 2 - (%2 * 2)] -%endif + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - dec r4d - jnz .loop - RET + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%endmacro + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%if ARCH_X86_64 - FILTER_V4_W16n_H2_sse2 pp, 64, 64 - FILTER_V4_W16n_H2_sse2 pp, 64, 32 - FILTER_V4_W16n_H2_sse2 pp, 64, 48 - FILTER_V4_W16n_H2_sse2 pp, 48, 64 - FILTER_V4_W16n_H2_sse2 pp, 64, 16 - FILTER_V4_W16n_H2_sse2 ps, 64, 64 - FILTER_V4_W16n_H2_sse2 ps, 64, 32 - FILTER_V4_W16n_H2_sse2 ps, 64, 48 - FILTER_V4_W16n_H2_sse2 ps, 48, 64 - FILTER_V4_W16n_H2_sse2 ps, 64, 16 -%endif + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + RET -%macro FILTER_P2S_2_4_sse2 1 - movd m2, [r0 + %1] - movd m3, [r0 + r1 + %1] - punpcklwd m2, m3 - movd m3, [r0 + r1 * 2 + %1] - movd m4, [r0 + r4 + %1] - punpcklwd m3, m4 - punpckldq m2, m3 - punpcklbw m2, m0 - psllw m2, 6 - psubw m2, m1 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM 
avx2 +cglobal filterPixelToShort_16x16, 3, 6, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] - movd [r2 + r3 * 0 + %1 * 2], m2 - psrldq m2, 4 - movd [r2 + r3 * 1 + %1 * 2], m2 - psrldq m2, 4 - movd [r2 + r3 * 2 + %1 * 2], m2 - psrldq m2, 4 - movd [r2 + r5 + %1 * 2], m2 -%endmacro + ; load constant + vbroadcasti128 m1, [pw_2000] -%macro FILTER_P2S_4_4_sse2 1 - movd m2, [r0 + %1] - movd m3, [r0 + r1 + %1] - movd m4, [r0 + r1 * 2 + %1] - movd m5, [r0 + r4 + %1] - punpckldq m2, m3 - punpcklbw m2, m0 - punpckldq m4, m5 - punpcklbw m4, m0 - psllw m2, 6 - psllw m4, 6 - psubw m2, m1 - psubw m4, m1 - movh [r2 + r3 * 0 + %1 * 2], m2 - movh [r2 + r3 * 2 + %1 * 2], m4 - movhps [r2 + r3 * 1 + %1 * 2], m2 - movhps [r2 + r5 + %1 * 2], m4 -%endmacro + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%macro FILTER_P2S_4_2_sse2 0 - movd m2, [r0] - movd m3, [r0 + r1] - punpckldq m2, m3 - punpcklbw m2, m0 - psllw m2, 6 - psubw m2, [pw_8192] - movh [r2], m2 - movhps [r2 + r3 * 2], m2 -%endmacro + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%macro FILTER_P2S_8_4_sse2 1 - movh m2, [r0 + %1] - movh m3, [r0 + r1 + %1] - movh m4, [r0 + r1 * 2 + %1] - movh m5, [r0 + r4 + %1] - punpcklbw m2, m0 - punpcklbw m3, m0 - punpcklbw m5, m0 - punpcklbw m4, m0 - psllw m2, 6 - psllw m3, 6 - psllw m5, 6 - psllw m4, 6 - psubw m2, m1 - psubw m3, m1 - psubw m4, m1 - psubw m5, m1 - movu [r2 + r3 * 0 + %1 * 2], m2 - movu [r2 + r3 * 1 + %1 * 2], m3 - movu [r2 + r3 * 2 + %1 * 2], m4 - movu [r2 + r5 + %1 * 2], m5 -%endmacro + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%macro FILTER_P2S_8_2_sse2 1 - movh m2, [r0 + %1] - movh m3, [r0 + r1 + %1] - punpcklbw m2, m0 - punpcklbw m3, m0 - psllw m2, 6 - psllw m3, 6 - psubw m2, m1 - psubw m3, m1 - movu [r2 + r3 * 0 + %1 * 2], m2 - movu [r2 + r3 * 1 + %1 * 2], m3 -%endmacro + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 
-;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) -;----------------------------------------------------------------------------- -%macro FILTER_PIX_TO_SHORT_sse2 2 -INIT_XMM sse2 -cglobal filterPixelToShort_%1x%2, 4, 6, 6 - pxor m0, m0 -%if %2 == 2 -%if %1 == 4 - FILTER_P2S_4_2_sse2 -%elif %1 == 8 - add r3d, r3d - mova m1, [pw_8192] - FILTER_P2S_8_2_sse2 0 -%endif -%else - add r3d, r3d - mova m1, [pw_8192] - lea r4, [r1 * 3] - lea r5, [r3 * 3] -%assign y 1 -%rep %2/4 -%assign x 0 -%rep %1/8 - FILTER_P2S_8_4_sse2 x -%if %2 == 6 - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] - FILTER_P2S_8_2_sse2 x -%endif -%assign x x+8 -%endrep -%rep (%1 % 8)/4 - FILTER_P2S_4_4_sse2 x -%assign x x+4 -%endrep -%rep (%1 % 4)/2 - FILTER_P2S_2_4_sse2 x -%endrep -%if y < %2/4 - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] -%assign y y+1 -%endif -%endrep -%endif -RET -%endmacro + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - FILTER_PIX_TO_SHORT_sse2 2, 4 - FILTER_PIX_TO_SHORT_sse2 2, 8 - FILTER_PIX_TO_SHORT_sse2 2, 16 - FILTER_PIX_TO_SHORT_sse2 4, 2 - FILTER_PIX_TO_SHORT_sse2 4, 4 - FILTER_PIX_TO_SHORT_sse2 4, 8 - FILTER_PIX_TO_SHORT_sse2 4, 16 - FILTER_PIX_TO_SHORT_sse2 4, 32 - FILTER_PIX_TO_SHORT_sse2 6, 8 - FILTER_PIX_TO_SHORT_sse2 6, 16 - FILTER_PIX_TO_SHORT_sse2 8, 2 - FILTER_PIX_TO_SHORT_sse2 8, 4 - FILTER_PIX_TO_SHORT_sse2 8, 6 - FILTER_PIX_TO_SHORT_sse2 8, 8 - FILTER_PIX_TO_SHORT_sse2 8, 12 - FILTER_PIX_TO_SHORT_sse2 8, 16 - FILTER_PIX_TO_SHORT_sse2 8, 32 - FILTER_PIX_TO_SHORT_sse2 8, 64 - FILTER_PIX_TO_SHORT_sse2 12, 16 - FILTER_PIX_TO_SHORT_sse2 12, 32 - FILTER_PIX_TO_SHORT_sse2 16, 4 - FILTER_PIX_TO_SHORT_sse2 16, 8 - FILTER_PIX_TO_SHORT_sse2 16, 12 - FILTER_PIX_TO_SHORT_sse2 16, 16 - FILTER_PIX_TO_SHORT_sse2 16, 24 - FILTER_PIX_TO_SHORT_sse2 16, 32 - FILTER_PIX_TO_SHORT_sse2 16, 64 - FILTER_PIX_TO_SHORT_sse2 24, 32 - FILTER_PIX_TO_SHORT_sse2 24, 64 - 
FILTER_PIX_TO_SHORT_sse2 32, 8 - FILTER_PIX_TO_SHORT_sse2 32, 16 - FILTER_PIX_TO_SHORT_sse2 32, 24 - FILTER_PIX_TO_SHORT_sse2 32, 32 - FILTER_PIX_TO_SHORT_sse2 32, 48 - FILTER_PIX_TO_SHORT_sse2 32, 64 - FILTER_PIX_TO_SHORT_sse2 48, 64 - FILTER_PIX_TO_SHORT_sse2 64, 16 - FILTER_PIX_TO_SHORT_sse2 64, 32 - FILTER_PIX_TO_SHORT_sse2 64, 48 - FILTER_PIX_TO_SHORT_sse2 64, 64 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst - movu %1, %7 - pshufb %2, %1, [tab_Lm + 0] - pmaddubsw %2, %5 - pshufb %3, %1, [tab_Lm + 16] - pmaddubsw %3, %5 - phaddw %2, %3 - pshufb %4, %1, [tab_Lm + 32] - pmaddubsw %4, %5 - pshufb %1, %1, [tab_Lm + 48] - pmaddubsw %1, %5 - phaddw %4, %1 - phaddw %2, %4 - %if %0 == 8 - pmulhrsw %2, %6 - packuswb %2, %2 - movh %8, %2 - %endif -%endmacro + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -;----------------------------------------------------------------------------- -; Interpolate HV -;----------------------------------------------------------------------------- -%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] - mova %5, [r0 + (%6 + 0) * 16] - mova %1, [r0 + (%6 + 1) * 16] - mova %2, [r0 + (%6 + 2) * 16] - punpcklwd %3, %5, %1 - punpckhwd %5, %1 - pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 - pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] - punpcklwd %4, %1, %2 - punpckhwd %1, %2 - pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 - pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] -%endmacro ; FILTER_HV8_START + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] - mova %8, [r0 + (%9 + 0) * 16] - mova %1, [r0 + (%9 + 1) * 16] - punpcklwd %7, %2, %8 - punpckhwd %2, %8 - pmaddwd %7, [r5 + %10 * 16] - pmaddwd %2, [r5 + %10 * 16] - paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 
0 - paddd %5, %2 ; R0 = H[0+1+2+3] - punpcklwd %7, %8, %1 - punpckhwd %8, %1 - pmaddwd %7, [r5 + %10 * 16] - pmaddwd %8, [r5 + %10 * 16] - paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 - paddd %6, %8 ; R1 = H[1+2+3+4] -%endmacro ; FILTER_HV8_MID + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 -; Round and Saturate -%macro FILTER_HV8_END 4 ; output in [1, 3] - paddd %1, [pd_526336] - paddd %2, [pd_526336] - paddd %3, [pd_526336] - paddd %4, [pd_526336] - psrad %1, 12 - psrad %2, 12 - psrad %3, 12 - psrad %4, 12 - packssdw %1, %2 - packssdw %3, %4 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - ; TODO: is merge better? I think this way is short dependency link - packuswb %1, %3 -%endmacro ; FILTER_HV8_END + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -;----------------------------------------------------------------------------- -; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 -%define coef m7 -%define stk_buf rsp + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - mov r4d, r4m - mov r5d, r5m + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%ifdef PIC - lea r6, [tab_LumaCoeff] - movh coef, [r6 + r4 * 8] -%else - movh coef, [tab_LumaCoeff + r4 * 8] -%endif - punpcklqdq coef, coef + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - ; move to row -3 - lea r6, [r1 + r1 * 2] - sub r0, r6 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - xor r6, r6 - mov r4, rsp + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -.loopH: - FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3] - psubw m1, [pw_2000] - mova [r4], m1 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - add r0, r1 - add r4, 16 - inc r6 - cmp 
r6, 8+7 - jnz .loopH + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - ; ready to phase V - ; Here all of mN is free + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 + RET - ; load coeff table - shl r5, 6 - lea r6, [tab_LumaCoeffV] - lea r5, [r5 + r6] +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_16x24, 3, 7, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, 3 - ; load intermedia buffer - mov r0, stk_buf + ; load constant + vbroadcasti128 m1, [pw_2000] +.loop: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - ; register mapping - ; r0 - src - ; r5 - coeff - ; r6 - loop_i + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - ; let's go - xor r6, r6 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache -.loopV: + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 - FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1 - FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 - FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 - FILTER_HV8_END m3, m0, m4, m1 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - movh [r2], m3 - movhps [r2 + r3], m3 + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - lea r0, [r0 + 16 * 2] - lea r2, [r2 + r3 * 2] + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - inc r6 - cmp r6, 8/2 - jnz .loopV + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 + + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu 
[r2 + r5], m0 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loop RET ;----------------------------------------------------------------------------- -; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -INIT_XMM sse3 -cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 - mov r4d, r4m - mov r5d, r5m - add r4d, r4d - pxor m6, m6 - -%ifdef PIC - lea r6, [tabw_LumaCoeff] - mova m3, [r6 + r4 * 8] -%else - mova m3, [tabw_LumaCoeff + r4 * 8] -%endif +%macro P2S_H_16xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_16x%1, 3, 7, 2 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, %1/16 - ; move to row -3 - lea r6, [r1 + r1 * 2] - sub r0, r6 + ; load constant + vbroadcasti128 m1, [pw_2000] +.loop: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - mov r4, rsp + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%assign x 0 ;needed for FILTER_H8_W8_sse2 macro -%assign y 1 -%rep 15 - FILTER_H8_W8_sse2 - psubw m1, [pw_2000] - mova [r4], m1 + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 -%if y < 15 - add r0, r1 - add r4, 16 -%endif -%assign y y+1 -%endrep + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - ; ready to phase V - ; Here all of mN is free + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - ; load coeff table - shl r5, 6 - lea r6, [tab_LumaCoeffV] - lea r5, [r5 + r6] + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - ; load intermedia buffer - mov r0, rsp + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - ; register mapping - ; r0 - src - ; r5 - coeff - - ; let's go -%assign y 1 -%rep 4 - FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0 - FILTER_HV8_MID m6, 
m2, m3, m4, m0, m1, m7, m5, 3, 1 - FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2 - FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3 - FILTER_HV8_END m3, m0, m4, m1 - - movh [r2], m3 - movhps [r2 + r3], m3 - -%if y < 4 - lea r0, [r0 + 16 * 2] - lea r2, [r2 + r3 * 2] -%endif -%assign y y+1 -%endrep - RET - -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - lea r4, [r1 * 3] - lea r5, [r0 + 4 * r1] - pshufb m0, [tab_Cm] - mova m1, [pw_512] - - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - movd m6, [r5] - - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 - - pmaddubsw m3, m0 - - phaddw m2, m3 - - pmulhrsw m2, m1 - - movd m7, [r5 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 - - pmaddubsw m4, m0 - - movd m3, [r5 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 - - pmaddubsw m5, m0 - - phaddw m4, m5 - - pmulhrsw m4, m1 - packuswb m2, m4 - - pextrw [r2], m2, 0 - pextrw [r2 + r3], m2, 2 - lea r2, [r2 + 2 * r3] - pextrw [r2], m2, 4 - pextrw [r2 + r3], m2, 6 - - RET - -%macro FILTER_VER_CHROMA_AVX2_2x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 - mov r4d, r4m - shl r4d, 5 - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff_V] - add r5, r4 -%else - lea r5, [tab_ChromaCoeff_V + r4] -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - lea r4, [r1 * 3] + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + 
r5], m0 - pinsrw xm1, [r0], 0 - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - - pshufb xm0, xm1, [interp_vert_shuf] - pshufb xm1, [interp_vert_shuf + 32] - vinserti128 m0, m0, xm1, 1 - pmaddubsw m0, [r5] - vextracti128 xm1, m0, 1 - paddw xm0, xm1 -%ifidn %1,pp - pmulhrsw xm0, [pw_512] - packuswb xm0, xm0 - lea r4, [r3 * 3] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 -%else - add r3d, r3d - lea r4, [r3 * 3] - psubw xm0, [pw_2000] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 -%endif - RET -%endmacro + lea r2, [r2 + r3 * 4] - FILTER_VER_CHROMA_AVX2_2x4 pp - FILTER_VER_CHROMA_AVX2_2x4 ps + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%macro FILTER_VER_CHROMA_AVX2_2x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - lea r4, [r1 * 3] + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - pinsrw xm1, [r0], 0 - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - movhlps xm0, xm1 lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - vinserti128 m1, m1, xm0, 1 - - pshufb m0, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m0, m1 -%ifidn %1,pp - pmulhrsw m0, [pw_512] - 
vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - lea r4, [r3 * 3] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + r3 * 2], xm0, 6 - pextrw [r2 + r4], xm0, 7 -%else - add r3d, r3d - lea r4, [r3 * 3] - psubw m0, [pw_2000] - vextracti128 xm1, m0, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 -%endif - RET -%endmacro - FILTER_VER_CHROMA_AVX2_2x8 pp - FILTER_VER_CHROMA_AVX2_2x8 ps + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 -%macro FILTER_VER_CHROMA_AVX2_2x16 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - lea r4, [r1 * 3] + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - movd xm1, [r0] - pinsrw xm1, [r0 + r1], 1 - pinsrw xm1, [r0 + r1 * 2], 2 - pinsrw xm1, [r0 + r4], 3 - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - pinsrw xm0, [r0 + r4], 7 - punpckhqdq xm0, xm1, xm0 - vinserti128 m1, m1, xm0, 1 - - pshufb m2, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m2, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 - - lea r0, [r0 + r1 * 4] - pinsrw xm1, [r0], 4 - pinsrw xm1, [r0 + r1], 5 - pinsrw xm1, [r0 + r1 * 2], 6 - pinsrw xm1, [r0 + r4], 7 - punpckhqdq xm1, xm0, xm1 - 
lea r0, [r0 + r1 * 4] - pinsrw xm0, [r0], 4 - pinsrw xm0, [r0 + r1], 5 - pinsrw xm0, [r0 + r1 * 2], 6 - punpckhqdq xm0, xm1, xm0 - vinserti128 m1, m1, xm0, 1 - - pshufb m0, m1, [interp_vert_shuf] - pshufb m1, [interp_vert_shuf + 32] - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m0, m1 -%ifidn %1,pp - mova m1, [pw_512] - pmulhrsw m2, m1 - pmulhrsw m0, m1 - packuswb m2, m0 - lea r4, [r3 * 3] - pextrw [r2], xm2, 0 - pextrw [r2 + r3], xm2, 1 - pextrw [r2 + r3 * 2], xm2, 2 - pextrw [r2 + r4], xm2, 3 - vextracti128 xm0, m2, 1 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm2, 4 - pextrw [r2 + r3], xm2, 5 - pextrw [r2 + r3 * 2], xm2, 6 - pextrw [r2 + r4], xm2, 7 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + r3 * 2], xm0, 6 - pextrw [r2 + r4], xm0, 7 -%else - add r3d, r3d - lea r4, [r3 * 3] - vbroadcasti128 m1, [pw_2000] - psubw m2, m1 - psubw m0, m1 - vextracti128 xm1, m2, 1 - movd [r2], xm2 - pextrd [r2 + r3], xm2, 1 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r4], xm2, 3 + lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 - vextracti128 xm1, m0, 1 - lea r2, [r2 + r3 * 4] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r4], xm0, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r4], xm1, 3 -%endif + + dec r6d + jnz .loop RET %endmacro - - FILTER_VER_CHROMA_AVX2_2x16 pp - FILTER_VER_CHROMA_AVX2_2x16 ps +P2S_H_16xN_avx2 32 +P2S_H_16xN_avx2 64 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, 
int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W2_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m0, [tab_Cm] +%macro P2S_H_32xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_32x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] - mova m1, [pw_512] + ; load height + mov r6d, %1/4 - mov r4d, %2 - lea r5, [3 * r1] + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] .loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - pmaddubsw m2, m0 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmaddubsw m3, m0 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 - phaddw m2, m3 + lea r0, [r0 + 8] - pmulhrsw m2, m1 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - movd m7, [r0 + r1] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmaddubsw m4, m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - movd m3, [r0 + 2 * r1] + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 1 + 16], m1 + movu [r2 + r3 * 2 + 16], m2 + movu [r2 + r4 + 16], m3 - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 + lea r0, [r0 + 8] - pmaddubsw m5, m0 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - phaddw m4, m5 + movh m1, [r0 
+ r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmulhrsw m4, m1 - packuswb m2, m4 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pextrw [r2], m2, 0 - pextrw [r2 + r3], m2, 2 - lea r2, [r2 + 2 * r3] - pextrw [r2], m2, 4 - pextrw [r2 + r3], m2, 6 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - lea r2, [r2 + 2 * r3] + movu [r2 + r3 * 0 + 32], m0 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 2 + 32], m2 + movu [r2 + r4 + 32], m3 - sub r4, 4 - jnz .loop - RET -%endmacro + lea r0, [r0 + 8] - FILTER_V4_W2_H4 2, 8 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - FILTER_V4_W2_H4 2, 16 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - mov r4d, r4m - sub r0, r1 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + movu [r2 + r3 * 0 + 48], m0 + movu [r2 + r3 * 1 + 48], m1 + movu [r2 + r3 * 2 + 48], m2 + movu [r2 + r4 + 48], m3 + + lea r0, [r0 + r1 * 4 - 24] + lea r2, [r2 + r3 * 4] - pshufb m0, [tab_Cm] - lea r5, [r0 + 2 * r1] + dec r6d + jnz .loop + RET +%endmacro + P2S_H_32xN 32 + P2S_H_32xN 8 + P2S_H_32xN 16 + P2S_H_32xN 24 + P2S_H_32xN 64 + P2S_H_32xN 48 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r5] - movd m5, [r5 + r1] +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_32xN_avx2 1 +INIT_YMM avx2 
+cglobal filterPixelToShort_32x%1, 3, 7, 3 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] - punpcklbw m2, m3 - punpcklbw m1, m4, m5 - punpcklbw m2, m1 + ; load height + mov r4d, %1/4 - pmaddubsw m2, m0 + ; load constant + vpbroadcastd m2, [pw_2000] - movd m1, [r0 + 4 * r1] +.loop: + pmovzxbw m0, [r0 + 0 * mmsize/2] + pmovzxbw m1, [r0 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + 0 * mmsize], m0 + movu [r2 + 1 * mmsize], m1 - punpcklbw m3, m4 - punpcklbw m5, m1 - punpcklbw m3, m5 + pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r3 + 0 * mmsize], m0 + movu [r2 + r3 + 1 * mmsize], m1 - pmaddubsw m3, m0 + pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r3 * 2 + 0 * mmsize], m0 + movu [r2 + r3 * 2 + 1 * mmsize], m1 - phaddw m2, m3 + pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psubw m0, m2 + psubw m1, m2 + movu [r2 + r6 + 0 * mmsize], m0 + movu [r2 + r6 + 1 * mmsize], m1 - pmulhrsw m2, [pw_512] - packuswb m2, m2 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r4d + jnz .loop RET +%endmacro + P2S_H_32xN_avx2 32 + P2S_H_32xN_avx2 8 + P2S_H_32xN_avx2 16 + P2S_H_32xN_avx2 24 + P2S_H_32xN_avx2 64 + P2S_H_32xN_avx2 48 -%macro FILTER_VER_CHROMA_AVX2_4x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 - mov r4d, r4m - shl r4d, 5 - sub r0, r1 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_64xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_64x%1, 3, 7, 6 + mov r3d, r3m + add 
r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] -%ifdef PIC - lea r5, [tab_ChromaCoeff_V] - add r5, r4 -%else - lea r5, [tab_ChromaCoeff_V + r4] -%endif + ; load height + mov r6d, %1/4 - lea r4, [r1 * 3] + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - movd xm1, [r0] - movd xm2, [r0 + r1] - punpcklbw xm1, xm2 - movd xm3, [r0 + r1 * 2] - punpcklbw xm2, xm3 - movlhps xm1, xm2 - movd xm0, [r0 + r4] - punpcklbw xm3, xm0 - movd xm2, [r0 + r1 * 4] - punpcklbw xm0, xm2 - movlhps xm3, xm0 - vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] +.loop: + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - pmaddubsw m1, [r5] - vextracti128 xm3, m1, 1 - paddw xm1, xm3 -%ifidn %1,pp - pmulhrsw xm1, [pw_512] - packuswb xm1, xm1 - movd [r2], xm1 - pextrd [r2 + r3], xm1, 1 -%else - add r3d, r3d - psubw xm1, [pw_2000] - movq [r2], xm1 - movhps [r2 + r3], xm1 -%endif - RET -%endmacro + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - FILTER_VER_CHROMA_AVX2_4x2 pp - FILTER_VER_CHROMA_AVX2_4x2 ps + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - mov r4d, r4m - sub r0, r1 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + lea r0, [r0 + 8] - pshufb m0, [tab_Cm] - mova m1, [pw_512] - lea r5, [r0 + 4 * r1] - lea r4, [r1 * 3] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] + movh m1, [r0 + r1] + punpcklbw m1, m4 + 
pmaddubsw m1, m5 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmaddubsw m2, m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - movd m6, [r5] + movu [r2 + r3 * 0 + 16], m0 + movu [r2 + r3 * 1 + 16], m1 + movu [r2 + r3 * 2 + 16], m2 + movu [r2 + r4 + 16], m3 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + lea r0, [r0 + 8] - pmaddubsw m3, m0 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - phaddw m2, m3 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmulhrsw m2, m1 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movd m7, [r5 + r1] + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 + movu [r2 + r3 * 0 + 32], m0 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 2 + 32], m2 + movu [r2 + r4 + 32], m3 - pmaddubsw m4, m0 + lea r0, [r0 + 8] - movd m3, [r5 + 2 * r1] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmaddubsw m5, m0 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - phaddw m4, m5 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmulhrsw m4, m1 + movu [r2 + r3 * 0 + 48], m0 + movu [r2 + r3 * 1 + 48], m1 + movu [r2 + r3 * 2 + 48], m2 + movu [r2 + r4 + 48], m3 - packuswb m2, m4 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m2, 2 - pextrd [r2 + r3], m2, 3 - RET -%macro FILTER_VER_CHROMA_AVX2_4x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x4, 4, 6, 3 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + lea r0, [r0 + 8] -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - lea r4, [r1 * 3] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - movd xm1, [r0] - 
pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] - mova m2, [interp4_vpp_shuf1] - vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] - mova m2, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] - - mova m2, [interp4_vpp_shuf] - pshufb m0, m0, m2 - pshufb m1, m1, m2 - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + mmsize] - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] -%ifidn %1,pp - pmulhrsw m0, [pw_512] - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - lea r5, [r3 * 3] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + r3 * 2], xm0, 2 - pextrd [r2 + r5], xm0, 3 -%else - add r3d, r3d - psubw m0, [pw_2000] - vextracti128 xm1, m0, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 -%endif - RET -%endmacro - FILTER_VER_CHROMA_AVX2_4x4 pp - FILTER_VER_CHROMA_AVX2_4x4 ps + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 -%macro FILTER_VER_CHROMA_AVX2_4x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x8, 4, 6, 5 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + movu [r2 + r3 * 0 + 64], m0 + movu [r2 + r3 * 1 + 64], m1 + movu [r2 + r3 * 2 + 64], m2 + movu [r2 + r4 + 64], m3 - lea r4, [r1 * 3] + lea r0, [r0 + 8] - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 
- pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] - mova m3, [interp4_vpp_shuf1] - vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] - mova m3, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - - mova m3, [interp4_vpp_shuf] - pshufb m0, m0, m3 - pshufb m1, m1, m3 - pshufb m2, m2, m3 - pshufb m4, m4, m3 - pmaddubsw m0, [r5] - pmaddubsw m4, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m2, [r5 + mmsize] - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] - paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] -%ifidn %1,pp - pmulhrsw m0, [pw_512] - pmulhrsw m4, [pw_512] - packuswb m0, m4 - vextracti128 xm1, m0, 1 - lea r5, [r3 * 3] - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r5], xm1, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r5], xm1, 3 -%else - add r3d, r3d - psubw m0, [pw_2000] - psubw m4, [pw_2000] - vextracti128 xm1, m0, 1 - vextracti128 xm2, m4, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_4x8 pp - FILTER_VER_CHROMA_AVX2_4x8 ps - -%macro FILTER_VER_CHROMA_AVX2_4xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12 - mov r4d, r4m - shl r4d, 6 - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - mova m10, [r5] - mova m11, [r5 + mmsize] -%ifidn %1,pp - mova m9, [pw_512] -%else - add r3d, r3d - mova m9, [pw_2000] -%endif - lea r5, [r3 * 3] -%rep %2 / 16 - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd 
xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 - pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] - lea r0, [r0 + r1 * 4] - movd xm4, [r0] - pinsrd xm4, [r0 + r1], 1 - pinsrd xm4, [r0 + r1 * 2], 2 - pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] - vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] - lea r0, [r0 + r1 * 4] - movd xm5, [r0] - pinsrd xm5, [r0 + r1], 1 - pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] - vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] - mova m5, [interp4_vpp_shuf1] - vpermd m0, m5, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] - vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] - vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] - mova m5, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] - vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] - vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] - - mova m5, [interp4_vpp_shuf] - pshufb m0, m0, m5 - pshufb m1, m1, m5 - pshufb m2, m2, m5 - pshufb m4, m4, m5 - pshufb m3, m3, m5 - pshufb m6, m6, m5 - pshufb m7, m7, m5 - pshufb m8, m8, m5 - pmaddubsw m0, m10 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - pmaddubsw m8, m10 - pmaddubsw m1, m11 - pmaddubsw m2, m11 - pmaddubsw m3, m11 - pmaddubsw m4, m11 - paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] - paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] - paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] - paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] -%ifidn %1,pp - pmulhrsw m0, m9 - pmulhrsw m6, m9 - pmulhrsw m7, m9 - pmulhrsw m8, m9 - packuswb m0, m6 - packuswb m7, m8 - vextracti128 xm1, m0, 1 - 
vextracti128 xm2, m7, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r5], xm1, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r5], xm1, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm7 - pextrd [r2 + r3], xm7, 1 - movd [r2 + r3 * 2], xm2 - pextrd [r2 + r5], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm7, 2 - pextrd [r2 + r3], xm7, 3 - pextrd [r2 + r3 * 2], xm2, 2 - pextrd [r2 + r5], xm2, 3 -%else - psubw m0, m9 - psubw m6, m9 - psubw m7, m9 - psubw m8, m9 - vextracti128 xm1, m0, 1 - vextracti128 xm2, m6, 1 - vextracti128 xm3, m7, 1 - vextracti128 xm4, m8, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r5], xm1 - lea r2, [r2 + r3 * 4] - movq [r2], xm6 - movhps [r2 + r3], xm6 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm7 - movhps [r2 + r3], xm7 - movq [r2 + r3 * 2], xm3 - movhps [r2 + r5], xm3 - lea r2, [r2 + r3 * 4] - movq [r2], xm8 - movhps [r2 + r3], xm8 - movq [r2 + r3 * 2], xm4 - movhps [r2 + r5], xm4 -%endif - lea r2, [r2 + r3 * 4] -%endrep - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_4xN pp, 16 - FILTER_VER_CHROMA_AVX2_4xN ps, 16 - FILTER_VER_CHROMA_AVX2_4xN pp, 32 - FILTER_VER_CHROMA_AVX2_4xN ps, 32 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W4_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m0, [tab_Cm] - - mova m1, [pw_512] - - mov r4d, %2 - - lea r5, [3 * r1] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 -.loop: - movd m2, [r0] - movd 
m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - pmaddubsw m2, m0 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + movu [r2 + r3 * 0 + 80], m0 + movu [r2 + r3 * 1 + 80], m1 + movu [r2 + r3 * 2 + 80], m2 + movu [r2 + r4 + 80], m3 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + lea r0, [r0 + 8] - pmaddubsw m3, m0 + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - phaddw m2, m3 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmulhrsw m2, m1 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movd m7, [r0 + r1] + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m6, m7 - punpcklbw m4, m3 + movu [r2 + r3 * 0 + 96], m0 + movu [r2 + r3 * 1 + 96], m1 + movu [r2 + r3 * 2 + 96], m2 + movu [r2 + r4 + 96], m3 - pmaddubsw m4, m0 + lea r0, [r0 + 8] - movd m3, [r0 + 2 * r1] + movh m0, [r0] + punpcklbw m0, m4 + pmaddubsw m0, m5 - punpcklbw m5, m6 - punpcklbw m7, m3 - punpcklbw m5, m7 + movh m1, [r0 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmaddubsw m5, m0 + movh m2, [r0 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - phaddw m4, m5 + movh m3, [r0 + r5] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmulhrsw m4, m1 - packuswb m2, m4 - movd [r2], m2 - pextrd [r2 + r3], m2, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m2, 2 - pextrd [r2 + r3], m2, 3 + movu [r2 + r3 * 0 + 112], m0 + movu [r2 + r3 * 1 + 112], m1 + movu [r2 + r3 * 2 + 112], m2 + movu [r2 + r4 + 112], m3 - lea r2, [r2 + 2 * r3] + lea r0, [r0 + r1 * 4 - 56] + lea r2, [r2 + r3 * 4] - sub r4, 4 - jnz .loop + dec r6d + jnz .loop RET %endmacro + P2S_H_64xN 64 + P2S_H_64xN 16 + P2S_H_64xN 32 + P2S_H_64xN 48 - FILTER_V4_W4_H4 4, 8 - FILTER_V4_W4_H4 4, 16 - - FILTER_V4_W4_H4 4, 32 
+;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_64xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_64x%1, 3, 7, 5 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] -%macro FILTER_V4_W8_H2 0 - punpcklbw m1, m2 - punpcklbw m7, m3, m0 + ; load height + mov r4d, %1/4 - pmaddubsw m1, m6 - pmaddubsw m7, m5 + ; load constant + vpbroadcastd m4, [pw_2000] - paddw m1, m7 +.loop: + pmovzxbw m0, [r0 + 0 * mmsize/2] + pmovzxbw m1, [r0 + 1 * mmsize/2] + pmovzxbw m2, [r0 + 2 * mmsize/2] + pmovzxbw m3, [r0 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - pmulhrsw m1, m4 - packuswb m1, m1 -%endmacro + movu [r2 + 0 * mmsize], m0 + movu [r2 + 1 * mmsize], m1 + movu [r2 + 2 * mmsize], m2 + movu [r2 + 3 * mmsize], m3 -%macro FILTER_V4_W8_H3 0 - punpcklbw m2, m3 - punpcklbw m7, m0, m1 + pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r1 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - pmaddubsw m2, m6 - pmaddubsw m7, m5 + movu [r2 + r3 + 0 * mmsize], m0 + movu [r2 + r3 + 1 * mmsize], m1 + movu [r2 + r3 + 2 * mmsize], m2 + movu [r2 + r3 + 3 * mmsize], m3 - paddw m2, m7 + pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r1 * 2 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 - pmulhrsw m2, m4 - packuswb m2, m2 -%endmacro - -%macro FILTER_V4_W8_H4 0 - punpcklbw m3, m0 - punpcklbw m7, m1, m2 - - pmaddubsw m3, m6 - 
pmaddubsw m7, m5 - - paddw m3, m7 - - pmulhrsw m3, m4 - packuswb m3, m3 -%endmacro - -%macro FILTER_V4_W8_H5 0 - punpcklbw m0, m1 - punpcklbw m7, m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m5 - - paddw m0, m7 + movu [r2 + r3 * 2 + 0 * mmsize], m0 + movu [r2 + r3 * 2 + 1 * mmsize], m1 + movu [r2 + r3 * 2 + 2 * mmsize], m2 + movu [r2 + r3 * 2 + 3 * mmsize], m3 - pmulhrsw m0, m4 - packuswb m0, m0 -%endmacro + pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] + pmovzxbw m3, [r0 + r5 + 3 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psllw m3, 6 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 -%macro FILTER_V4_W8_8x2 2 - FILTER_V4_W8 %1, %2 - movq m0, [r0 + 4 * r1] + movu [r2 + r6 + 0 * mmsize], m0 + movu [r2 + r6 + 1 * mmsize], m1 + movu [r2 + r6 + 2 * mmsize], m2 + movu [r2 + r6 + 3 * mmsize], m3 - FILTER_V4_W8_H2 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - movh [r2 + r3], m1 + dec r4d + jnz .loop + RET %endmacro + P2S_H_64xN_avx2 64 + P2S_H_64xN_avx2 16 + P2S_H_64xN_avx2 32 + P2S_H_64xN_avx2 48 -%macro FILTER_V4_W8_8x4 2 - FILTER_V4_W8_8x2 %1, %2 -;8x3 - lea r6, [r0 + 4 * r1] - movq m1, [r6 + r1] +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride) +;----------------------------------------------------------------------------- +%macro P2S_H_12xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_12x%1, 3, 7, 6 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r6, [r3 * 3] + mov r5d, %1/4 - FILTER_V4_W8_H3 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - movh [r2 + 2 * r3], m2 +.loop: + movu m0, [r0] + punpcklbw m1, m0, m4 + punpckhbw m0, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 -;8x4 - movq m2, [r6 + 2 * r1] + movu m2, [r0 + r1] + punpcklbw m3, m2, m4 + punpckhbw m2, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 - FILTER_V4_W8_H4 + movu [r2 + r3 
* 0], m1 + movu [r2 + r3 * 1], m3 - lea r5, [r2 + 2 * r3] - movh [r5 + r3], m3 -%endmacro + movh [r2 + r3 * 0 + 16], m0 + movh [r2 + r3 * 1 + 16], m2 -%macro FILTER_V4_W8_8x6 2 - FILTER_V4_W8_8x4 %1, %2 -;8x5 - lea r6, [r6 + 2 * r1] - movq m3, [r6 + r1] + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m4 + punpckhbw m0, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 - FILTER_V4_W8_H5 + movu m2, [r0 + r4] + punpcklbw m3, m2, m4 + punpckhbw m2, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 - movh [r2 + 4 * r3], m0 + movu [r2 + r3 * 2], m1 + movu [r2 + r6], m3 -;8x6 - movq m0, [r0 + 8 * r1] + movh [r2 + r3 * 2 + 16], m0 + movh [r2 + r6 + 16], m2 - FILTER_V4_W8_H2 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - lea r5, [r2 + 4 * r3] - movh [r5 + r3], m1 + dec r5d + jnz .loop + RET %endmacro + P2S_H_12xN 16 + P2S_H_12xN 32 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W8 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 +%macro P2S_H_24xN 1 +INIT_XMM ssse3 +cglobal filterPixelToShort_24x%1, 3, 7, 5 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, %1/4 - mov r4d, r4m + ; load constant + mova m3, [pb_128] + mova m4, [tab_c_64_n64] - sub r0, r1 - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - lea r5, [r0 + 2 * r1] - movq m3, [r5 + r1] +.loop: + movu m0, [r0] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - punpcklbw m0, m1 - punpcklbw m4, m2, m3 + movu m2, [r0 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd m5, [r6 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif + movu [r2 + r3 * 0], m1 + movu [r2 + r3 * 0 + 16], 
m0 + movu [r2 + r3 * 0 + 32], m2 + + movu m0, [r0 + r1] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - pshufb m6, m5, [tab_Vm] - pmaddubsw m0, m6 + movu m2, [r0 + r1 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 - pshufb m5, [tab_Vm + 16] - pmaddubsw m4, m5 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 1 + 16], m0 + movu [r2 + r3 * 1 + 32], m2 - paddw m0, m4 + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - mova m4, [pw_512] + movu m2, [r0 + r1 * 2 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 - pmulhrsw m0, m4 - packuswb m0, m0 - movh [r2], m0 -%endmacro + movu [r2 + r3 * 2], m1 + movu [r2 + r3 * 2 + 16], m0 + movu [r2 + r3 * 2 + 32], m2 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - FILTER_V4_W8_8x2 8, 2 + movu m0, [r0 + r4] + punpcklbw m1, m0, m3 + punpckhbw m0, m3 + pmaddubsw m0, m4 + pmaddubsw m1, m4 - RET + movu m2, [r0 + r4 + 16] + punpcklbw m2, m3 + pmaddubsw m2, m4 + movu [r2 + r5], m1 + movu [r2 + r5 + 16], m0 + movu [r2 + r5 + 32], m2 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - FILTER_V4_W8_8x4 8, 4 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loop RET +%endmacro + P2S_H_24xN 32 + P2S_H_24xN 64 ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) 
;----------------------------------------------------------------------------- - FILTER_V4_W8_8x6 8, 6 - - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 - - mov r4d, r4m - sub r0, r1 +%macro P2S_H_24xN_avx2 1 +INIT_YMM avx2 +cglobal filterPixelToShort_24x%1, 3, 7, 4 + mov r3d, r3m add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, %1/4 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + ; load constant + vpbroadcastd m1, [pw_2000] + vpbroadcastd m2, [pb_128] + vpbroadcastd m3, [tab_c_64_n64] - pshufb m0, [tab_Cm] +.loop: + pmovzxbw m0, [r0] + psllw m0, 6 + psubw m0, m1 + movu [r2], m0 - movd m2, [r0] - movd m3, [r0 + r1] - lea r5, [r0 + 2 * r1] - movd m4, [r5] - movd m5, [r5 + r1] + movu m0, [r0 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 0 + mmsize], xm0 - punpcklbw m2, m3 - punpcklbw m1, m4, m5 - punpcklbw m2, m1 + pmovzxbw m0, [r0 + r1] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3], m0 - pmaddubsw m2, m0 + movu m0, [r0 + r1 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 1 + mmsize], xm0 - movd m1, [r0 + 4 * r1] + pmovzxbw m0, [r0 + r1 * 2] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r3 * 2], m0 - punpcklbw m3, m4 - punpcklbw m5, m1 - punpcklbw m3, m5 + movu m0, [r0 + r1 * 2 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r3 * 2 + mmsize], xm0 - pmaddubsw m3, m0 + pmovzxbw m0, [r0 + r4] + psllw m0, 6 + psubw m0, m1 + movu [r2 + r5], m0 - phaddw m2, m3 + movu m0, [r0 + r4 + mmsize/2] + punpcklbw m0, m2 + pmaddubsw m0, m3 + movu [r2 + r5 + mmsize], xm0 - psubw m2, [pw_2000] - movh 
[r2], m2 - movhps [r2 + r3], m2 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loop RET +%endmacro + P2S_H_24xN_avx2 32 + P2S_H_24xN_avx2 64 -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal filterPixelToShort_48x64, 3, 7, 4 + mov r3d, r3m + add r3d, r3d + lea r4, [r1 * 3] + lea r5, [r3 * 3] + mov r6d, 16 - mov r4d, r4m - sub r0, r1 - add r3d, r3d + ; load constant + mova m2, [pb_128] + mova m3, [tab_c_64_n64] -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif +.loop: + movu m0, [r0] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - pshufb m0, [tab_Cm] + movu [r2 + r3 * 0], m1 + movu [r2 + r3 * 0 + 16], m0 - lea r4, [r1 * 3] - lea r5, [r0 + 4 * r1] - - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r4] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - movd m6, [r5] - - punpcklbw m3, m4 - punpcklbw m1, m5, m6 - punpcklbw m3, m1 - - pmaddubsw m3, m0 - - phaddw m2, m3 - - mova m1, [pw_2000] - - psubw m2, m1 - movh [r2], m2 - movhps [r2 + r3], m2 - - movd m2, [r5 + r1] + movu m0, [r0 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 + movu [r2 + r3 * 0 + 32], m1 + movu [r2 + r3 * 0 + 
48], m0 - pmaddubsw m4, m0 + movu m0, [r0 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - movd m3, [r5 + 2 * r1] + movu [r2 + r3 * 0 + 64], m1 + movu [r2 + r3 * 0 + 80], m0 - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 + movu m0, [r0 + r1] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - pmaddubsw m5, m0 + movu [r2 + r3 * 1], m1 + movu [r2 + r3 * 1 + 16], m0 - phaddw m4, m5 + movu m0, [r0 + r1 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - psubw m4, m1 - lea r2, [r2 + 2 * r3] - movh [r2], m4 - movhps [r2 + r3], m4 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 1 + 48], m0 - RET + movu m0, [r0 + r1 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W4_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + movu [r2 + r3 * 1 + 64], m1 + movu [r2 + r3 * 1 + 80], m0 - mov r4d, r4m - sub r0, r1 - add r3d, r3d + movu m0, [r0 + r1 * 2] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif + movu [r2 + r3 * 2], m1 + movu [r2 + r3 * 2 + 16], m0 - pshufb m0, [tab_Cm] + movu m0, [r0 + r1 * 2 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - mova m1, [pw_2000] + movu [r2 + r3 * 2 + 32], m1 + movu [r2 + r3 * 2 + 48], m0 - mov r4d, %2/4 - lea r5, [3 * r1] + movu m0, [r0 + r1 * 2 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 -.loop: - movd 
m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + movu [r2 + r3 * 2 + 64], m1 + movu [r2 + r3 * 2 + 80], m0 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + movu m0, [r0 + r4] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - pmaddubsw m2, m0 + movu [r2 + r5], m1 + movu [r2 + r5 + 16], m0 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + movu m0, [r0 + r4 + 16] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 + movu [r2 + r5 + 32], m1 + movu [r2 + r5 + 48], m0 - pmaddubsw m3, m0 + movu m0, [r0 + r4 + 32] + punpcklbw m1, m0, m2 + punpckhbw m0, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 - phaddw m2, m3 + movu [r2 + r5 + 64], m1 + movu [r2 + r5 + 80], m0 - psubw m2, m1 - movh [r2], m2 - movhps [r2 + r3], m2 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - movd m2, [r0 + r1] + dec r6d + jnz .loop + RET - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_48x64, 3,7,4 + mov r3d, r3m + add r3d, r3d + lea r5, [r1 * 3] + lea r6, [r3 * 3] - pmaddubsw m4, m0 + ; load height + mov r4d, 64/4 - movd m3, [r0 + 2 * r1] + ; load constant + vpbroadcastd m3, [pw_2000] - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 + ; just unroll(1) because it is best choice for 48x64 +.loop: + pmovzxbw m0, [r0 + 0 * mmsize/2] + pmovzxbw m1, [r0 + 1 * mmsize/2] + pmovzxbw m2, [r0 + 2 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + 0 * mmsize], m0 + movu [r2 + 1 * mmsize], m1 + movu [r2 + 2 * mmsize], m2 - pmaddubsw m5, m0 + pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] + pmovzxbw m1, 
[r0 + r1 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r3 + 0 * mmsize], m0 + movu [r2 + r3 + 1 * mmsize], m1 + movu [r2 + r3 + 2 * mmsize], m2 - phaddw m4, m5 + pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r3 * 2 + 0 * mmsize], m0 + movu [r2 + r3 * 2 + 1 * mmsize], m1 + movu [r2 + r3 * 2 + 2 * mmsize], m2 - psubw m4, m1 - lea r2, [r2 + 2 * r3] - movh [r2], m4 - movhps [r2 + r3], m4 + pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] + pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] + pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] + psllw m0, 6 + psllw m1, 6 + psllw m2, 6 + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r6 + 0 * mmsize], m0 + movu [r2 + r6 + 1 * mmsize], m1 + movu [r2 + r6 + 2 * mmsize], m2 - lea r2, [r2 + 2 * r3] + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] - dec r4d + dec r4d jnz .loop RET -%endmacro - FILTER_V_PS_W4_H4 4, 8 - FILTER_V_PS_W4_H4 4, 16 - FILTER_V_PS_W4_H4 4, 32 +%macro PROCESS_LUMA_W4_4R 0 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[0 1] -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W8_H8_H16_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[1 2] + punpcklqdq m2, m1 ; m2=[0 1 1 2] + pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] - mov r4d, r4m - sub r0, r1 - add r3d, r3d + movd m1, [r0 + r1] + punpcklbw m5, m0, m1 ; m2=[2 3] + lea r0, [r0 + 2 * r1] + movd 
m0, [r0] + punpcklbw m1, m0 ; m1=[3 4] + punpcklqdq m5, m1 ; m5=[2 3 3 4] + pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] + paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 + pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[4 5] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[5 6] + punpcklqdq m2, m1 ; m2=[4 5 5 6] + pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] + paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 + pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] + paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_2000] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[6 7] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[7 8] + punpcklqdq m2, m1 ; m2=[6 7 7 8] + pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] + paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end + pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] + paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 - mov r4d, %2/2 - lea r5, [3 * r1] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[8 9] + movd m0, [r0 + 2 * r1] + punpcklbw m1, m0 ; m1=[9 10] + punpcklqdq m2, m1 ; m2=[8 9 9 10] + pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] + paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end +%endmacro -.loopH: +%macro PROCESS_LUMA_W8_4R 0 movq m0, [r0] movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m2, m5 - - paddw m0, m2 - - psubw m0, m4 - movu [r2], m0 - - movq m0, [r0 + 4 * r1] - - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m3, m5 + pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 - paddw m1, m3 - psubw m1, m4 + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 - movu [r2 + r3], m1 
+ movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 + pmaddubsw m0, [r6 + 1 * 16] + paddw m7, m0 ;m7=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 + pmaddubsw m1, [r6 + 1 * 16] + paddw m6, m1 ;m6 = [1+2+3+4] Row2 - dec r4d - jnz .loopH + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 1 * 16] + pmaddubsw m0, [r6 + 2 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 + paddw m5, m2 ;m5=[2+3+4+5] Row3 - RET -%endmacro - - FILTER_V_PS_W8_H8_H16_H2 8, 2 - FILTER_V_PS_W8_H8_H16_H2 8, 4 - FILTER_V_PS_W8_H8_H16_H2 8, 6 - - FILTER_V_PS_W8_H8_H16_H2 8, 12 - FILTER_V_PS_W8_H8_H16_H2 8, 64 - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W8_H8_H16_H32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_2000] - - mov r4d, %2/4 - lea r5, [3 * r1] - -.loop: + lea r0, [r0 + 2 * r1] movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 1 * 16] + pmaddubsw m1, [r6 + 2 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 + paddw m4, m2 ;m4=[3+4+5+6] Row4 + movq m1, [r0 + r1] punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 - - paddw m0, m7 - - psubw m0, m4 - movu [r2], m0 + pmaddubsw m2, m0, [r6 + 2 * 16] + pmaddubsw m0, [r6 + 3 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end + 
paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 - lea r0, [r0 + 4 * r1] + lea r0, [r0 + 2 * r1] movq m0, [r0] - - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 - - paddw m1, m7 - - psubw m1, m4 - movu [r2 + r3], m1 + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 2 * 16] + pmaddubsw m1, [r6 + 3 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end + paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 movq m1, [r0 + r1] - punpcklbw m0, m1 + pmaddubsw m0, [r6 + 3 * 16] + paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end - pmaddubsw m2, m6 - pmaddubsw m0, m5 - - paddw m2, m0 - - psubw m2, m4 - lea r2, [r2 + 2 * r3] - movu [r2], m2 - - movq m2, [r0 + 2 * r1] - - punpcklbw m1, m2 - - pmaddubsw m3, m6 - pmaddubsw m1, m5 - - paddw m3, m1 - psubw m3, m4 - - movu [r2 + r3], m3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - RET + movq m0, [r0 + 2 * r1] + punpcklbw m1, m0 + pmaddubsw m1, [r6 + 3 * 16] + paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end %endmacro - FILTER_V_PS_W8_H8_H16_H32 8, 8 - FILTER_V_PS_W8_H8_H16_H32 8, 16 - FILTER_V_PS_W8_H8_H16_H32 8, 32 - -;------------------------------------------------------------------------------------------------------------ -;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -%macro FILTER_V_PS_W6 2 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_4xN 3 INIT_XMM sse4 -cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif 
%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] %else - movd m5, [tab_ChromaCoeff + r4 * 4] + lea r6, [tab_LumaCoeffVer + r4] %endif - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_2000] - lea r5, [3 * r1] - mov r4d, %2/4 - -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 - - paddw m0, m7 - psubw m0, m4 - - movh [r2], m0 - pshufd m0, m0, 2 - movd [r2 + 8], m0 - - lea r0, [r0 + 4 * r1] - movq m0, [r0] - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 +%ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif - paddw m1, m7 - psubw m1, m4 + mov r4d, %2/4 + lea r5, [4 * r1] - movh [r2 + r3], m1 - pshufd m1, m1, 2 - movd [r2 + r3 + 8], m1 +.loopH: + PROCESS_LUMA_W4_4R - movq m1, [r0 + r1] - punpcklbw m0, m1 +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 - pmaddubsw m2, m6 - pmaddubsw m0, m5 + packuswb m4, m5 - paddw m2, m0 - psubw m2, m4 + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m4, 2 + pextrd [r2 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 - lea r2,[r2 + 2 * r3] - movh [r2], m2 - pshufd m2, m2, 2 - movd [r2 + 8], m2 + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%endif - movq m2,[r0 + 2 * r1] - punpcklbw m1, m2 + sub r0, r5 + lea r2, [r2 + 2 * r3] - pmaddubsw m3, m6 - pmaddubsw m1, m5 + dec r4d + jnz .loopH - paddw m3, m1 - psubw m3, m4 + RET +%endmacro - movh [r2 + r3], m3 - pshufd m3, m3, 2 - movd [r2 + r3 + 8], m3 - lea r2, [r2 + 2 * r3] +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_4x4, 4,6,8 + mov r4d, r4m + lea r5, [r1 * 3] + sub r0, r5 - dec r4d - jnz .loop - RET -%endmacro + ; TODO: VPGATHERDD + movd xm1, [r0] ; m1 = row0 + movd xm2, [r0 + r1] ; m2 = row1 + punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 
00] - FILTER_V_PS_W6 6, 8 - FILTER_V_PS_W6 6, 16 + movd xm3, [r0 + r1 * 2] ; m3 = row2 + punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] + movd xm4, [r0 + r5] + punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] + punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W12 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 + lea r0, [r0 + r1 * 4] + movd xm5, [r0] ; m5 = row4 + punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] + punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] + movd xm2, [r0 + r1] ; m2 = row5 + punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] + punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm6, [r0 + r1 * 2] ; m6 = row6 + punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] + punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] + vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm4, [r0 + r5] ; m4 = row7 + punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] + punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - mov r4d, r4m - sub r0, r1 - add r3d, r3d + lea r0, [r0 + r1 * 4] + movd xm7, [r0] ; m7 = row8 + punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] + punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] + vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] 
- [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + movd xm2, [r0 + r1] ; m2 = row9 + punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] + punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movd xm7, [r0 + r1 * 2] ; m7 = rowA + punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] + punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] + vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + ; load filter coeff %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8 + 0] + vpbroadcastd m2, [r5 + r4 * 8 + 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] + vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movh [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m2, [r0 + 2 * r1] - - punpcklbw m5, m7, m2 - punpckhbw m7, m2 - - pmaddubsw m5, m0 - pmaddubsw m7, m0 - - paddw m4, m5 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movh [r2 + r3 + 16], m3 - - lea r2, [r2 + 2 * r3] + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddubsw m5, m2 + pmaddubsw m6, m2 + vbroadcasti128 m0, [pw_1] + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m5, m0 + pmaddwd m6, m0 + paddd m1, m5 ; m1 = DQWORD ROW[1 0] + paddd m3, m6 ; m3 = DQWORD ROW[3 2] + packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] - dec r4d - jnz .loop + ; TODO: does 
it overflow? + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 2 + pextrd [r2 + r3 * 2], xm1, 1 + lea r4, [r3 * 3] + pextrd [r2 + r4], xm1, 3 RET -%endmacro - - FILTER_V_PS_W12 12, 16 - FILTER_V_PS_W12 12, 32 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W16 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - mov r4d, r4m - sub r0, r1 - add r3d, r3d +INIT_YMM avx2 +cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - pmaddubsw m2, m0 - pmaddubsw m7, m0 + lea r4, [r1 * 3] + sub r0, r4 - paddw m4, m2 - paddw m3, m7 + add r3d, r3d - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] 
+ lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - lea r2, [r2 + 2 * r3] + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m4, m4, m3 + pshufb m2, m2, m3 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m4, [r5 + 2 * mmsize] + pmaddubsw m2, [r5 + 3 * mmsize] + paddw m0, m1 + paddw m0, m4 + paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] - dec r4d - jnz .loop + psubw m0, [pw_2000] + vextracti128 xm2, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 RET -%endmacro - - FILTER_V_PS_W16 16, 4 - FILTER_V_PS_W16 16, 8 - FILTER_V_PS_W16 16, 12 - FILTER_V_PS_W16 16, 16 - FILTER_V_PS_W16 16, 32 - - FILTER_V_PS_W16 16, 24 - FILTER_V_PS_W16 16, 64 - -;-------------------------------------------------------------------------------------------------------------- -;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_V4_PS_W24 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 - mov r4d, r4m - sub r0, r1 - add r3d, r3d +%macro FILTER_VER_LUMA_AVX2_4xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 + mov r4d, r4m + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeff] - 
movd m0, [r5 + r4 * 4] + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - - movu m5, [r5] - movu m7, [r5 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m2, [r5 + 2 * r1] - - punpcklbw m5, m7, m2 - punpckhbw m7, m2 - - pmaddubsw m5, m0 - pmaddubsw m7, m0 - - paddw m4, m5 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 - - movq m2, [r0 + 16] - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - - punpcklbw m2, m3 - punpcklbw m7, m4, m5 - - pmaddubsw m2, m1 - pmaddubsw m7, m0 - - paddw m2, m7 - psubw m2, m6 - - movu [r2 + 32], m2 - - movq m2, [r5 + 2 * r1 + 16] - - punpcklbw m3, m4 - punpcklbw m5, m2 - - pmaddubsw m3, m1 - pmaddubsw m5, m0 - - paddw m3, m5 - psubw m3, m6 - - movu [r2 + r3 + 32], m3 - - mov r0, r5 - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V4_PS_W24 24, 32 - - FILTER_V4_PS_W24 24, 64 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, 
[tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m6, [pw_512] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + add r3d, r3d + vbroadcasti128 m6, [pw_2000] %endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mova m7, [pw_2000] - - mov r4d, %2 - + lea r8, [r3 * 3] + mova m5, [interp4_vpp_shuf] + mova m0, [interp4_vpp_shuf1] + mova m7, [interp4_vpp_shuf1 + mmsize] + mov r7d, %2 / 8 .loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m3, [r5] - movu m5, [r5 + r1] - - punpcklbw m6, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m6, m0 - pmaddubsw m3, m0 - - paddw m4, m6 - paddw m2, m3 - - psubw m4, m7 - psubw m2, m7 - - movu [r2], m4 - movu [r2 + 16], m2 - - movu m2, [r0 + 16] - movu m3, [r0 + r1 + 16] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - movu m3, [r5 + 16] - movu m5, [r5 + r1 + 16] - - punpcklbw m6, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m6, m0 - pmaddubsw m3, m0 - - paddw m4, m6 - paddw m2, m3 - - psubw m4, m7 - psubw m2, m7 - - movu [r2 + 32], m4 - movu [r2 + 48], m2 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] + vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] + vpermd m4, m0, m2 ; m4 
= row[8 7 7 6 6 5 5 4] + vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] + vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] - lea r0, [r0 + r1] - lea r2, [r2 + r3] + pshufb m8, m8, m5 + pshufb m1, m1, m5 + pshufb m4, m4, m5 + pshufb m9, m9, m5 + pshufb m2, m2, m5 + pshufb m3, m3, m5 + pmaddubsw m8, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m9, [r5 + 2 * mmsize] + pmaddubsw m3, [r5 + 3 * mmsize] + paddw m8, m1 + paddw m9, m3 + pmaddubsw m1, m4, [r5 + 2 * mmsize] + pmaddubsw m3, m2, [r5 + 3 * mmsize] + pmaddubsw m4, [r5] + pmaddubsw m2, [r5 + mmsize] + paddw m3, m1 + paddw m2, m4 + paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] + paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] - dec r4d - jnz .loop +%ifidn %3,pp + pmulhrsw m8, m6 + pmulhrsw m9, m6 + packuswb m8, m9 + vextracti128 xm1, m8, 1 + movd [r2], xm8 + pextrd [r2 + r3], xm8, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r8], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm8, 2 + pextrd [r2 + r3], xm8, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r8], xm1, 3 +%else + psubw m8, m6 + psubw m9, m6 + vextracti128 xm1, m8, 1 + vextracti128 xm2, m9, 1 + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r8], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm9 + movhps [r2 + r3], xm9 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r8], xm2 +%endif + lea r2, [r2 + r3 * 4] + sub r0, r6 + dec r7d + jnz .loop RET +%endif %endmacro - FILTER_V_PS_W32 32, 8 - FILTER_V_PS_W32 32, 16 - FILTER_V_PS_W32 32, 24 - FILTER_V_PS_W32 32, 32 - - FILTER_V_PS_W32 32, 48 - FILTER_V_PS_W32 32, 64 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + 
FILTER_VER_LUMA_4xN 4, 4, pp -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W8_H8_H16_H32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 8, pp + FILTER_VER_LUMA_AVX2_4xN 4, 8, pp - mov r4d, r4m - sub r0, r1 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 16, pp + FILTER_VER_LUMA_AVX2_4xN 4, 16, pp -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 4, ps - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_512] - lea r5, [r1 * 3] +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t 
dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 8, ps + FILTER_VER_LUMA_AVX2_4xN 4, 8, ps - mov r4d, %2 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_4xN 4, 16, ps + FILTER_VER_LUMA_AVX2_4xN 4, 16, ps -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 - - paddw m0, m7 - - pmulhrsw m0, m4 - packuswb m0, m0 - movh [r2], m0 - - lea r0, [r0 + 4 * r1] - movq m0, [r0] - - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 - - paddw m1, m7 - - pmulhrsw m1, m4 - packuswb m1, m1 - movh [r2 + r3], m1 - - movq m1, [r0 + r1] - - punpcklbw m0, m1 - - pmaddubsw m2, m6 - pmaddubsw m0, m5 - - paddw m2, m0 - - pmulhrsw m2, m4 - - movq m7, [r0 + 2 * r1] - punpcklbw m1, m7 - - pmaddubsw m3, m6 - pmaddubsw m1, m5 - - paddw m3, m1 - - pmulhrsw m3, m4 - packuswb m2, m3 - - lea r2, [r2 + 2 * r3] - movh [r2], m2 - movhps [r2 + r3], m2 - - lea r2, [r2 + 2 * r3] - - sub r4, 4 - jnz .loop - RET -%endmacro - - FILTER_V4_W8_H8_H16_H32 8, 8 - FILTER_V4_W8_H8_H16_H32 8, 16 - FILTER_V4_W8_H8_H16_H32 8, 32 - - FILTER_V4_W8_H8_H16_H32 8, 12 - FILTER_V4_W8_H8_H16_H32 8, 64 - -%macro PROCESS_CHROMA_AVX2_W8_8R 0 +%macro PROCESS_LUMA_AVX2_W8_8R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] @@ -5154,6 +3183,8 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 
52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] @@ -5163,6 +3194,10 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] @@ -5171,90 +3206,33 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 pmaddubsw m0, [r5 + 1 * mmsize] paddw m4, m0 -%endmacro - -%macro FILTER_VER_CHROMA_AVX2_8x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r1 - PROCESS_CHROMA_AVX2_W8_8R -%ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], 
xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r4], xm4 -%else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r4], xm4 -%endif - RET + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 12 + punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] + vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m6, [r5 + 2 * mmsize] + paddw m4, m6 + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + movq xm6, [r0 + r1 * 2] ; m6 = row 14 + punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] + vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + pmaddubsw m0, [r5 + 3 * mmsize] + paddw m4, m0 %endmacro - FILTER_VER_CHROMA_AVX2_8x8 pp - FILTER_VER_CHROMA_AVX2_8x8 ps - -%macro FILTER_VER_CHROMA_AVX2_8x6 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - +%macro PROCESS_LUMA_AVX2_W8_4R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 
05 14 04 13 03 12 02 11 01 10 00] @@ -5276,105 +3254,199 @@ cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 - pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m4, [r5 + 1 * mmsize] - paddw m1, m4 -%ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_8xN 3 +INIT_XMM sse4 
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + + %ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + lea r5, [4 * r1] + +.loopH: + PROCESS_LUMA_W8_4R + +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r2, [r2 + 2 * r3] + movu [r2], m5 + movu [r2 + r3], m4 +%endif + + sub r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +%macro FILTER_VER_LUMA_AVX2_8xN 3 +INIT_YMM avx2 +cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + mov word [rsp], %2 / 8 + +.loop: + PROCESS_LUMA_AVX2_W8_8R +%ifidn %3,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 packuswb m5, m2 - packuswb m1, m1 + packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] + lea r2, [r2 + r3 * 2] + movhps [r2], xm5 + movhps [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] movq [r2], xm1 movq [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm1 + movhps [r2 + r3], xm4 %else - add r3d, r3d - mova m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; 
m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - vextracti128 xm4, m5, 1 + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movu [r2], xm5 - movu [r2 + r3], xm4 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] + movu [r2 + r3], xm6 + lea r2, [r2 + r3 * 2] + movu [r2], xm2 + movu [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] movu [r2], xm1 movu [r2 + r3], xm0 + lea r2, [r2 + r3 * 2] + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r3], xm4 %endif + lea r2, [r2 + r3 * 2] + sub r0, r6 + dec word [rsp] + jnz .loop RET %endmacro - FILTER_VER_CHROMA_AVX2_8x6 pp - FILTER_VER_CHROMA_AVX2_8x6 ps +%macro FILTER_VER_LUMA_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 -%macro PROCESS_CHROMA_AVX2_W8_16R 1 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r0 + r1 * 2] ; m6 = row 10 
- punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m0, [r5] +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_8R %ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + mova m3, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] +%endif + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 @@ -5382,886 +3454,377 @@ cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 + movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 - movhps [r2 + r6], xm4 + movhps [r2 + r4], xm4 %else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 + movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] - vextracti128 xm5, m1, 1 - vextracti128 xm3, m4, 1 movu [r2], xm1 - movu [r2 + r3], xm5 + movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm3 -%endif - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r0, [r0 + r1 * 4] - movq xm5, 
[r0] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r0 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] - movq xm3, [r0 + r4] ; m3 = row 15 - punpcklbw xm2, xm3 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 16 - punpcklbw xm3, xm1 - vinserti128 m2, m2, xm3, 1 - pmaddubsw m3, m2, [r5 + 1 * mmsize] - paddw m5, m3 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 17 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 18 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 - lea r2, [r2 + r3 * 4] -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - pmulhrsw m5, m7 ; m5 = word: row 12, row 13 - pmulhrsw m2, m7 ; m2 = word: row 14, row 15 - packuswb m0, m6 - packuswb m5, m2 - vextracti128 xm6, m0, 1 - vextracti128 xm2, m5, 1 - movq [r2], xm0 - movq [r2 + r3], xm6 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm6 - lea r2, [r2 + r3 * 4] - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 -%else - psubw m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - psubw m5, m7 ; m5 = word: row 12, row 13 - psubw m2, m7 ; m2 = word: row 14, row 15 - vextracti128 xm1, m0, 1 - vextracti128 xm3, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - vextracti128 xm1, m5, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm5 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_AVX2_8x16 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 
-%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - mova m7, [pw_2000] + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 %endif - lea r6, [r3 * 3] - PROCESS_CHROMA_AVX2_W8_16R %1 RET %endmacro - FILTER_VER_CHROMA_AVX2_8x16 pp - FILTER_VER_CHROMA_AVX2_8x16 ps - -%macro FILTER_VER_CHROMA_AVX2_8x12 1 +%macro FILTER_VER_LUMA_AVX2_8x4 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 +cglobal interp_8tap_vert_%1_8x4, 4, 6, 7 mov r4d, r4m - shl r4d, 6 + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1, pp - mova m7, [pw_512] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_4R +%ifidn %1,pp + mova m3, [pw_512] %else add r3d, r3d - mova m7, [pw_2000] + vbroadcasti128 m3, [pw_2000] %endif - lea r6, [r3 * 3] - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, 
xm3, 1 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m0, [r5] -%ifidn %1, pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 packuswb m5, m2 - packuswb m1, m4 vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r6], xm4 + movhps [r2 + r4], xm2 %else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - vextracti128 xm5, m1, 1 - vextracti128 xm3, m4, 1 - movu [r2], xm1 + vextracti128 xm5, m5, 1 movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm3 -%endif - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r0, [r0 + r1 * 4] - movq xm5, [r0] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r0 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] - paddw m6, m3 - lea r2, [r2 + r3 * 4] -%ifidn %1, pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - packuswb m0, m6 - vextracti128 xm6, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm6 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm6 -%else - psubw 
m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - vextracti128 xm1, m0, 1 - vextracti128 xm3, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm3 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movu [r2 + r4], xm2 %endif RET %endmacro - FILTER_VER_CHROMA_AVX2_8x12 pp - FILTER_VER_CHROMA_AVX2_8x12 ps +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 4, pp + FILTER_VER_LUMA_AVX2_8x4 pp -%macro FILTER_VER_CHROMA_AVX2_8xN 2 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 8, pp + FILTER_VER_LUMA_AVX2_8x8 pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 16, pp + FILTER_VER_LUMA_AVX2_8xN 8, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 32, pp + FILTER_VER_LUMA_AVX2_8xN 8, 32, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 4, ps + FILTER_VER_LUMA_AVX2_8x4 ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 8, ps + FILTER_VER_LUMA_AVX2_8x8 ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 16, ps + FILTER_VER_LUMA_AVX2_8xN 8, 16, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_8xN 8, 32, ps + FILTER_VER_LUMA_AVX2_8xN 8, 32, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, 
int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_12xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r6, [tab_LumaCoeffVer + r4] %endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] + %ifidn %3,pp + mova m3, [pw_512] %else - add r3d, r3d - mova m7, [pw_2000] + mova m3, [pw_2000] %endif - lea r6, [r3 * 3] -%rep %2 / 16 - PROCESS_CHROMA_AVX2_W8_16R %1 - lea r2, [r2 + r3 * 4] -%endrep - RET -%endmacro - FILTER_VER_CHROMA_AVX2_8xN pp, 32 - FILTER_VER_CHROMA_AVX2_8xN ps, 32 - FILTER_VER_CHROMA_AVX2_8xN pp, 64 - FILTER_VER_CHROMA_AVX2_8xN ps, 64 + mov r4d, %2/4 -%macro PROCESS_CHROMA_AVX2_W8_4R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m0, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 
6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m1, [r5 + 1 * mmsize] - paddw m2, m1 -%endmacro +.loopH: + PROCESS_LUMA_W8_4R -%macro FILTER_VER_CHROMA_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 - mov r4d, r4m - shl r4d, 6 +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 %endif - lea r4, [r1 * 3] - sub r0, r1 - PROCESS_CHROMA_AVX2_W8_4R -%ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m0, m3 ; m0 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - packuswb m0, m2 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 %else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m0, m3 ; m0 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - vextracti128 xm1, m0, 1 - vextracti128 xm4, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm4 + add r2, 16 %endif - RET -%endmacro - FILTER_VER_CHROMA_AVX2_8x4 pp - FILTER_VER_CHROMA_AVX2_8x4 ps + PROCESS_LUMA_W4_4R -%macro FILTER_VER_CHROMA_AVX2_8x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 - mov r4d, r4m - shl r4d, 6 +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 + packuswb m4, m5 + + 
movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m4, 2 + pextrd [r5 + r3], m4, 3 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + psubw m4, m3 + psubw m5, m3 - lea r4, [r1 * 3] - sub r0, r1 + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%endif - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m1, [r5] - movq xm2, [r0 + r4] ; m2 = row 3 - punpcklbw xm3, xm2 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - movq xm0, [r0 + r1 * 4] ; m0 = row 4 - punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 -%ifidn %1,pp - pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 - packuswb m1, m1 - vextracti128 xm0, m1, 1 - movq [r2], xm1 - movq [r2 + r3], xm0 + lea r5, [4 * r1 + 8] + sub r0, r5 +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - 8] %else - add r3d, r3d - psubw m1, [pw_2000] ; m1 = word: row 0, row 1 - vextracti128 xm0, m1, 1 - movu [r2], xm1 - movu [r2 + r3], xm0 + lea r2, [r2 + 4 * r3 - 16] %endif + + dec r4d + jnz .loopH + RET %endmacro - FILTER_VER_CHROMA_AVX2_8x2 pp - FILTER_VER_CHROMA_AVX2_8x2 ps +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_12xN 12, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_12xN 12, 16, ps -%macro FILTER_VER_CHROMA_AVX2_6x8 1 +%macro FILTER_VER_LUMA_AVX2_12x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - PROCESS_CHROMA_AVX2_W8_8R -%ifidn %1,pp - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movd [r2], xm5 - pextrw [r2 + 4], xm5, 2 - movd [r2 + r3], xm2 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm5, 2 - pextrw [r2 + r3 * 2 + 4], xm5, 6 - pextrd [r2 + r4], xm2, 2 - pextrw [r2 + r4 + 4], xm2, 6 - lea r2, [r2 + r3 * 4] - movd [r2], xm1 - pextrw [r2 + 4], xm1, 2 - movd [r2 + r3], xm4 - pextrw [r2 + r3 + 4], xm4, 2 - pextrd [r2 + r3 * 2], xm1, 2 - pextrw [r2 + r3 * 2 + 4], xm1, 6 - pextrd [r2 + r4], xm4, 2 - pextrw [r2 + r4 + 4], xm4, 6 -%else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] - lea r4, [r3 * 3] - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - 
vextracti128 xm0, m1, 1 - movq [r2], xm5 - pextrd [r2 + 8], xm5, 2 - movq [r2 + r3], xm6 - pextrd [r2 + r3 + 8], xm6, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 - movq [r2 + r4], xm3 - pextrd [r2 + r4 + 8], xm3, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - pextrd [r2 + 8], xm1, 2 - movq [r2 + r3], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movq [r2 + r3 * 2], xm4 - pextrd [r2 + r3 * 2 + 8], xm4, 2 - vextracti128 xm4, m4, 1 - movq [r2 + r4], xm4 - pextrd [r2 + r4 + 8], xm4, 2 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_6x8 pp - FILTER_VER_CHROMA_AVX2_6x8 ps - -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m5, [r5 + r4 * 4] -%else - movd m5, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m6, m5, [tab_Vm] - pshufb m5, [tab_Vm + 16] - mova m4, [pw_512] - - mov r4d, %2 - lea r5, [3 * r1] - -.loop: - movq m0, [r0] - movq m1, [r0 + r1] - movq m2, [r0 + 2 * r1] - movq m3, [r0 + r5] - - punpcklbw m0, m1 - punpcklbw m1, m2 - punpcklbw m2, m3 - - pmaddubsw m0, m6 - pmaddubsw m7, m2, m5 - - paddw m0, m7 - - pmulhrsw m0, m4 - packuswb m0, m0 - movd [r2], m0 - pextrw [r2 + 4], m0, 2 - - lea r0, [r0 + 4 * r1] - - movq m0, [r0] - punpcklbw m3, m0 - - pmaddubsw m1, m6 - pmaddubsw m7, m3, m5 - - paddw m1, m7 - - pmulhrsw m1, m4 - packuswb m1, m1 - movd [r2 + r3], m1 - pextrw [r2 + r3 + 4], m1, 2 - - movq m1, [r0 + r1] - punpcklbw m7, m0, m1 - - pmaddubsw m2, m6 - pmaddubsw m7, m5 - - paddw m2, m7 - - pmulhrsw m2, m4 - packuswb m2, m2 - lea r2, [r2 + 2 * r3] - movd [r2], m2 - pextrw [r2 + 4], m2, 2 - - movq m2, [r0 + 2 * r1] - punpcklbw m1, m2 - - pmaddubsw m3, m6 - pmaddubsw m1, m5 - 
- paddw m3, m1 - - pmulhrsw m3, m4 - packuswb m3, m3 - - movd [r2 + r3], m3 - pextrw [r2 + r3 + 4], m3, 2 - - lea r2, [r2 + 2 * r3] - - sub r4, 4 - jnz .loop - RET -%endmacro - - FILTER_V4_W6_H4 6, 8 - - FILTER_V4_W6_H4 6, 16 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W12_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m7, [r0 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_512] - - pmulhrsw m4, m6 - pmulhrsw m2, m6 - - packuswb m4, m2 - - movh [r2], m4 - pextrd [r2 + 8], m4, 2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - pmaddubsw m2, m0 - pmaddubsw m7, m0 - - paddw m4, m2 - paddw m3, m7 - - pmulhrsw m4, m6 - pmulhrsw m3, m6 - - packuswb m4, m3 - - movh [r2 + r3], m4 - pextrd [r2 + r3 + 8], m4, 2 - - lea r2, [r2 + 2 * r3] - - sub r4, 2 - jnz .loop - RET -%endmacro - - FILTER_V4_W12_H2 12, 16 - - FILTER_V4_W12_H2 12, 32 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro 
FILTER_V4_W16_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r0, [r0 + 2 * r1] - movu m5, [r0] - movu m6, [r0 + r1] - - punpckhbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m2, m7 - - punpcklbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m4, m7 - - mova m7, [pw_512] - - pmulhrsw m4, m7 - pmulhrsw m2, m7 - - packuswb m4, m2 - - movu [r2], m4 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r0 + 2 * r1] - - punpcklbw m2, m6, m5 - punpckhbw m6, m5 - - pmaddubsw m2, m0 - pmaddubsw m6, m0 - - paddw m4, m2 - paddw m3, m6 - - pmulhrsw m4, m7 - pmulhrsw m3, m7 - - packuswb m4, m3 - - movu [r2 + r3], m4 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V4_W16_H2 16, 4 - FILTER_V4_W16_H2 16, 8 - FILTER_V4_W16_H2 16, 12 - FILTER_V4_W16_H2 16, 16 - FILTER_V4_W16_H2 16, 32 - - FILTER_V4_W16_H2 16, 24 - FILTER_V4_W16_H2 16, 64 - -%macro FILTER_VER_CHROMA_AVX2_16x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 - mov r4d, r4m - shl r4d, 6 + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - mova m12, [r5] - mova m13, [r5 + mmsize] lea r4, [r1 * 3] - sub r0, r1 + sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif - lea r5, [r3 * 3] + lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, m12 + pmaddubsw m0, [r5] movu 
xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, m12 + pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m13 + pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 - pmaddubsw m2, m12 + pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m13 + pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 - pmaddubsw m3, m12 + pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, m13 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 - pmaddubsw m4, m12 + pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, m13 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 - pmaddubsw m5, m12 + pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, m13 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 - pmaddubsw m6, m12 + pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, m13 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 - pmaddubsw m7, m12 + pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, m13 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + 
pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 - pmaddubsw m8, m12 + pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, m13 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 - pmaddubsw m9, m12 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 @@ -6270,29 +3833,28 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 - packuswb m6, m7 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm3 + movq [r2], xm0 + pextrd [r2 + 8], xm0, 2 + movq [r2 + r3], xm1 + pextrd [r2 + r3 + 8], xm1, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r6], xm3 + pextrd [r2 + r6 + 8], xm3, 2 
lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r5], xm7 + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2 + r3 + 8], xm5, 2 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 @@ -6300,153 +3862,246 @@ cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r5], m3 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 - movu [r2 + r3 * 2], m6 - movu [r2 + r5], m7 + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 %endif - lea r2, [r2 + r3 * 4] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm6, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm6, 1 - pmaddubsw m6, m10, m13 - paddw m8, m6 - pmaddubsw m10, m12 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 12 - punpckhbw xm7, xm11, xm6 - punpcklbw xm11, xm6 - vinserti128 m11, m11, xm7, 1 - pmaddubsw m7, m11, m13 - paddw m9, m7 - pmaddubsw m11, m12 - - movu xm7, [r0 + r1] ; m7 = row 13 - punpckhbw xm0, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm0, 1 - pmaddubsw m0, m6, m13 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 - pmaddubsw m6, m12 + 
pmaddubsw m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm7, xm0 - punpcklbw xm7, xm0 - vinserti128 m7, m7, xm1, 1 - pmaddubsw m1, m7, m13 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 - pmaddubsw m7, m12 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movq [r2 + r3 * 2], xm6 + pextrd [r2 + r3 * 2 + 8], xm6, 2 + movq [r2 + r6], xm7 + pextrd [r2 + r6 + 8], xm7, 2 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm7 + vextracti128 xm7, m7, 1 + movq [r2 + r6 + 16], xm7 +%endif + lea r2, [r2 + r3 * 4] + movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, m13 - paddw m6, m2 - pmaddubsw m0, m12 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, m13 - paddw m7, m3 - pmaddubsw m1, m12 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m2, m13 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] paddw m0, m2 movu xm4, [r0 + r1 * 2] ; 
m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m3, m13 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m6, m14 ; m6 = word: row 12 - pmulhrsw m7, m14 ; m7 = word: row 13 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 - packuswb m6, m7 + packuswb m12, m13 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b - vpermq m6, m6, 11011000b + vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 - vextracti128 xm7, m6, 1 + vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r5], xm11 + movq [r2], xm8 + pextrd [r2 + 8], xm8, 2 + movq [r2 + r3], xm9 + pextrd [r2 + r3 + 8], xm9, 2 + movq [r2 + r3 * 2], xm10 + pextrd 
[r2 + r3 * 2 + 8], xm10, 2 + movq [r2 + r6], xm11 + pextrd [r2 + r6 + 8], xm11, 2 lea r2, [r2 + r3 * 4] - movu [r2], xm6 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm0 - movu [r2 + r5], xm1 + movq [r2], xm12 + pextrd [r2 + 8], xm12, 2 + movq [r2 + r3], xm13 + pextrd [r2 + r3 + 8], xm13, 2 + movq [r2 + r3 * 2], xm0 + pextrd [r2 + r3 * 2 + 8], xm0, 2 + movq [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm1, 2 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 - psubw m6, m14 ; m6 = word: row 12 - psubw m7, m14 ; m7 = word: row 13 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r5], m11 + movu [r2], xm8 + vextracti128 xm8, m8, 1 + movq [r2 + 16], xm8 + movu [r2 + r3], xm9 + vextracti128 xm9, m9, 1 + movq [r2 + r3 + 16], xm9 + movu [r2 + r3 * 2], xm10 + vextracti128 xm10, m10, 1 + movq [r2 + r3 * 2 + 16], xm10 + movu [r2 + r6], xm11 + vextracti128 xm11, m11, 1 + movq [r2 + r6 + 16], xm11 lea r2, [r2 + r3 * 4] - movu [r2], m6 - movu [r2 + r3], m7 - movu [r2 + r3 * 2], m0 - movu [r2 + r5], m1 + movu [r2], xm12 + vextracti128 xm12, m12, 1 + movq [r2 + 16], xm12 + movu [r2 + r3], xm13 + vextracti128 xm13, m13, 1 + movq [r2 + r3 + 16], xm13 + movu [r2 + r3 * 2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 * 2 + 16], xm0 + movu [r2 + r6], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r6 + 16], xm1 %endif RET %endif %endmacro - FILTER_VER_CHROMA_AVX2_16x16 pp - FILTER_VER_CHROMA_AVX2_16x16 ps -%macro FILTER_VER_CHROMA_AVX2_16x8 1 + FILTER_VER_LUMA_AVX2_12x16 pp + FILTER_VER_LUMA_AVX2_12x16 ps + +%macro FILTER_VER_LUMA_AVX2_16x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 mov r4d, r4m - shl r4d, 6 + shl r4d, 7 %ifdef PIC - lea r5, 
[tab_ChromaCoeffVer_32] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] - sub r0, r1 + sub r0, r4 %ifidn %1,pp - mova m6, [pw_512] + mova m14, [pw_512] %else add r3d, r3d - mova m6, [pw_2000] + vbroadcasti128 m14, [pw_2000] %endif lea r6, [r3 * 3] @@ -6465,7 +4120,7 @@ cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + mmsize] + pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] @@ -6473,735 +4128,440 @@ cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + mmsize] + pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] -%ifidn %1,pp - pmulhrsw m0, m6 ; m0 = word: row 0 - pmulhrsw m1, m6 ; m1 = word: row 1 - packuswb m0, m1 - vpermq m0, m0, 11011000b - vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 -%else - psubw m0, m6 ; m0 = word: row 0 - psubw m1, m6 ; m1 = word: row 1 - movu [r2], m0 - movu [r2 + r3], m1 -%endif - - movu xm0, [r0 + r1] ; m0 = row 5 - punpckhbw xm1, xm4, xm0 - punpcklbw xm4, xm0 - vinserti128 m4, m4, xm1, 1 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m2, m1 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 pmaddubsw m4, [r5] - movu xm1, [r0 + r1 * 2] ; m1 = row 6 - punpckhbw xm5, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm5, 1 - pmaddubsw m5, m0, [r5 + mmsize] - paddw m3, m5 - pmaddubsw m0, [r5] -%ifidn %1,pp - pmulhrsw m2, m6 ; m2 = word: row 2 - pmulhrsw m3, m6 ; m3 = word: row 3 - packuswb m2, m3 - vpermq m2, m2, 11011000b - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%else - psubw m2, m6 ; m2 = word: row 2 
- psubw m3, m6 ; m3 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 -%endif - - movu xm2, [r0 + r4] ; m2 = row 7 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m1, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] - movu xm3, [r0] ; m3 = row 8 - punpckhbw xm5, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm5, 1 - pmaddubsw m5, m2, [r5 + mmsize] - paddw m0, m5 - pmaddubsw m2, [r5] - lea r2, [r2 + r3 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw 
xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + %ifidn %1,pp - pmulhrsw m4, m6 ; m4 = word: row 4 - pmulhrsw m0, m6 ; m0 = word: row 5 - packuswb m4, m0 + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b - vextracti128 xm0, m4, 1 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] movu [r2], xm4 - movu [r2 + r3], xm0 + movu [r2 + r3], xm5 %else - psubw m4, m6 ; m4 = word: row 4 - psubw m0, m6 ; m0 = word: row 5 + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] movu [r2], m4 - movu [r2 + r3], m0 + movu [r2 + r3], m5 +%endif + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + 
pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 %endif + lea r2, [r2 + r3 * 4] - movu xm5, [r0 + r1] ; m5 = row 9 - punpckhbw xm4, xm3, xm5 - punpcklbw xm3, xm5 - vinserti128 m3, m3, xm4, 1 - pmaddubsw m3, [r5 + mmsize] + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw 
m3, [r5 + 1 * mmsize] paddw m1, m3 - movu xm4, [r0 + r1 * 2] ; m4 = row 10 - punpckhbw xm0, xm5, xm4 - punpcklbw xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddubsw m5, [r5 + mmsize] - paddw m2, m5 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + %ifidn %1,pp - pmulhrsw m1, m6 ; m1 = word: row 6 - pmulhrsw m2, m6 ; m2 = word: row 7 - packuswb m1, m2 - vpermq m1, m1, 11011000b - vextracti128 xm2, m1, 1 - movu [r2 + r3 * 2], xm1 - movu [r2 + r6], xm2 + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 
%else - psubw m1, m6 ; m1 = word: row 6 - psubw m2, m6 ; m2 = word: row 7 - movu [r2 + r3 * 2], m1 - movu [r2 + r6], m2 + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m12 + movu [r2 + r3], m13 + movu [r2 + r3 * 2], m0 + movu [r2 + r6], m1 %endif RET +%endif %endmacro - FILTER_VER_CHROMA_AVX2_16x8 pp - FILTER_VER_CHROMA_AVX2_16x8 ps + FILTER_VER_LUMA_AVX2_16x16 pp + FILTER_VER_LUMA_AVX2_16x16 ps -%macro FILTER_VER_CHROMA_AVX2_16x12 1 +%macro FILTER_VER_LUMA_AVX2_16x12 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 +cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - mova m8, [r5] - mova m9, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - lea r5, [r3 * 3] - - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, m8 - pmaddubsw m3, m2, m9 - paddw m4, m3 - pmaddubsw m2, m8 - - vextracti128 xm0, m0, 1 - lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 - - punpcklbw m5, m1, m0 - punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, m8 - pmaddubsw m3, m5, m9 - paddw m6, m3 - pmaddubsw m5, m8 -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - 
vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 -%else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 -%endif - - movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 - - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m1, m0, m9 - paddw m5, m1 - pmaddubsw m0, m8 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m6, m8 - -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 - vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm5 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r5], m5 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m1, m1, [r0], 1 - vinserti128 m4, m4, xm1, 1 - - punpcklbw m2, m4, m1 - punpckhbw m5, m4, m1 - vperm2i128 m3, m2, m5, 0x20 - vperm2i128 m2, m2, m5, 0x31 - pmaddubsw m5, m3, m9 - paddw m6, m5 - pmaddubsw m3, m8 - pmaddubsw m5, m2, m9 - paddw m0, m5 - pmaddubsw m2, m8 - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 4 - pmulhrsw m0, m7 ; m0 = word: row 5 - packuswb m6, m0 - vpermq m6, m6, 11011000b - vextracti128 xm0, m6, 1 - movu [r2], xm6 - movu [r2 + r3], xm0 -%else - psubw m6, m7 ; m6 = word: row 4 - psubw m0, m7 ; m0 = word: row 5 - movu [r2], m6 - movu [r2 + r3], m0 -%endif - - movu xm6, [r0 + r1 * 2] - vinserti128 m6, m6, [r0 + r1], 1 - vextracti128 xm0, m6, 1 - vinserti128 m1, m1, xm0, 0 - - punpcklbw m4, m1, m6 - punpckhbw m5, m1, m6 - vperm2i128 m0, m4, m5, 0x20 - vperm2i128 m5, m4, m5, 0x31 - pmaddubsw m4, m0, m9 - paddw m2, m4 - pmaddubsw m0, m8 - pmaddubsw m4, m5, m9 - paddw m3, m4 - pmaddubsw m5, m8 - -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 6 - pmulhrsw m2, m7 ; m2 = word: row 7 - packuswb m3, m2 - vpermq m3, m3, 
11011000b - vextracti128 xm2, m3, 1 - movu [r2 + r3 * 2], xm3 - movu [r2 + r5], xm2 -%else - psubw m3, m7 ; m3 = word: row 6 - psubw m2, m7 ; m2 = word: row 7 - movu [r2 + r3 * 2], m3 - movu [r2 + r5], m2 -%endif - lea r2, [r2 + r3 * 4] - - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 - vinserti128 m6, m6, xm3, 1 - - punpcklbw m2, m6, m3 - punpckhbw m1, m6, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, m9 - paddw m5, m1 - pmaddubsw m4, m8 - pmaddubsw m1, m2, m9 - paddw m0, m1 - pmaddubsw m2, m8 - -%ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 8 - pmulhrsw m0, m7 ; m0 = word: row 9 - packuswb m5, m0 - vpermq m5, m5, 11011000b - vextracti128 xm0, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm0 -%else - psubw m5, m7 ; m5 = word: row 8 - psubw m0, m7 ; m0 = word: row 9 - movu [r2], m5 - movu [r2 + r3], m0 -%endif - - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m1, m0, m9 - paddw m4, m1 - -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 10 - pmulhrsw m2, m7 ; m2 = word: row 11 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r5], xm2 -%else - psubw m4, m7 ; m4 = word: row 10 - psubw m2, m7 ; m2 = word: row 11 - movu [r2 + r3 * 2], m4 - movu [r2 + r5], m2 -%endif - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_16x12 pp - FILTER_VER_CHROMA_AVX2_16x12 ps - -%macro FILTER_VER_CHROMA_AVX2_16xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - mova m7, [pw_2000] 
-%endif - lea r6, [r3 * 3] - mov r7d, %2 / 16 -.loopH: - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, [r5] - pmaddubsw m3, m2, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m2, [r5] - - vextracti128 xm0, m0, 1 - lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 - - punpcklbw m5, m1, m0 - punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, [r5] - pmaddubsw m3, m5, [r5 + mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 -%else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 -%endif - - movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 - - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m1, m0, [r5 + mmsize] - paddw m5, m1 - pmaddubsw m0, [r5] - pmaddubsw m1, m6, [r5 + mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 - vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm5 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m5 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m1, m1, [r0], 1 - vinserti128 m4, m4, xm1, 1 - - punpcklbw m2, m4, m1 - punpckhbw m5, m4, m1 - vperm2i128 m3, m2, m5, 0x20 - vperm2i128 m2, m2, m5, 0x31 - pmaddubsw m5, m3, [r5 + mmsize] - paddw m6, m5 - pmaddubsw m3, [r5] - pmaddubsw m5, m2, 
[r5 + mmsize] - paddw m0, m5 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 4 - pmulhrsw m0, m7 ; m0 = word: row 5 - packuswb m6, m0 - vpermq m6, m6, 11011000b - vextracti128 xm0, m6, 1 - movu [r2], xm6 - movu [r2 + r3], xm0 -%else - psubw m6, m7 ; m6 = word: row 4 - psubw m0, m7 ; m0 = word: row 5 - movu [r2], m6 - movu [r2 + r3], m0 -%endif - - movu xm6, [r0 + r1 * 2] - vinserti128 m6, m6, [r0 + r1], 1 - vextracti128 xm0, m6, 1 - vinserti128 m1, m1, xm0, 0 - - punpcklbw m4, m1, m6 - punpckhbw m5, m1, m6 - vperm2i128 m0, m4, m5, 0x20 - vperm2i128 m5, m4, m5, 0x31 - pmaddubsw m4, m0, [r5 + mmsize] - paddw m2, m4 - pmaddubsw m0, [r5] - pmaddubsw m4, m5, [r5 + mmsize] - paddw m3, m4 - pmaddubsw m5, [r5] - -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 6 - pmulhrsw m2, m7 ; m2 = word: row 7 - packuswb m3, m2 - vpermq m3, m3, 11011000b - vextracti128 xm2, m3, 1 - movu [r2 + r3 * 2], xm3 - movu [r2 + r6], xm2 -%else - psubw m3, m7 ; m3 = word: row 6 - psubw m2, m7 ; m2 = word: row 7 - movu [r2 + r3 * 2], m3 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 - vinserti128 m6, m6, xm3, 1 - - punpcklbw m2, m6, m3 - punpckhbw m1, m6, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m5, m1 - pmaddubsw m4, [r5] - pmaddubsw m1, m2, [r5 + mmsize] - paddw m0, m1 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 8 - pmulhrsw m0, m7 ; m0 = word: row 9 - packuswb m5, m0 - vpermq m5, m5, 11011000b - vextracti128 xm0, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm0 -%else - psubw m5, m7 ; m5 = word: row 8 - psubw m0, m7 ; m0 = word: row 9 - movu [r2], m5 - movu [r2 + r3], m0 -%endif - - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 
- pmaddubsw m1, m6, [r5 + mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - pmaddubsw m1, m0, [r5 + mmsize] - paddw m4, m1 - pmaddubsw m0, [r5] - -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 10 - pmulhrsw m2, m7 ; m2 = word: row 11 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm2 -%else - psubw m4, m7 ; m4 = word: row 10 - psubw m2, m7 ; m2 = word: row 11 - movu [r2 + r3 * 2], m4 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - - movu xm3, [r0 + r4] - lea r0, [r0 + r1 * 4] - vinserti128 m3, m3, [r0], 1 - vinserti128 m5, m5, xm3, 1 - - punpcklbw m2, m5, m3 - punpckhbw m1, m5, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, [r5 + mmsize] - paddw m0, m1 - pmaddubsw m4, [r5] - pmaddubsw m1, m2, [r5 + mmsize] - paddw m6, m1 - pmaddubsw m2, [r5] - -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 12 - pmulhrsw m6, m7 ; m6 = word: row 13 - packuswb m0, m6 - vpermq m0, m0, 11011000b - vextracti128 xm6, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm6 -%else - psubw m0, m7 ; m0 = word: row 12 - psubw m6, m7 ; m6 = word: row 13 - movu [r2], m0 - movu [r2 + r3], m6 -%endif - - movu xm5, [r0 + r1 * 2] - vinserti128 m5, m5, [r0 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m6, [r5 + mmsize] - paddw m2, m6 - pmaddubsw m0, [r5 + mmsize] - paddw m4, m0 - -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 14 - pmulhrsw m2, m7 ; m2 = word: row 15 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r2 + r3 * 2], xm4 - movu [r2 + r6], xm2 -%else - psubw m4, m7 ; m4 = word: row 14 - psubw m2, m7 ; m2 = word: row 15 - movu [r2 + r3 * 2], m4 - movu [r2 + r6], m2 -%endif - lea r2, [r2 + r3 * 4] - dec r7d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_16xN pp, 32 - FILTER_VER_CHROMA_AVX2_16xN ps, 32 - 
FILTER_VER_CHROMA_AVX2_16xN pp, 64 - FILTER_VER_CHROMA_AVX2_16xN ps, 64 - -%macro FILTER_VER_CHROMA_AVX2_16x24 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 - mov r4d, r4m - shl r4d, 6 + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - mova m12, [r5] - mova m13, [r5 + mmsize] lea r4, [r1 * 3] - sub r0, r1 + sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif - lea r5, [r3 * 3] + lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, m12 + pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, m12 + pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m13 + pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 - pmaddubsw m2, m12 + pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m13 + pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 - pmaddubsw m3, m12 + pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, m13 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 - pmaddubsw m4, m12 + pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, m13 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 - pmaddubsw m5, m12 + pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, 
xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, m13 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 - pmaddubsw m6, m12 + pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, m13 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 - pmaddubsw m7, m12 + pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, m13 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 - pmaddubsw m8, m12 + pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, m13 - paddw m7, m11 - pmaddubsw m9, m12 - + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + %ifidn %1,pp pmulhrsw m0, m14 ; m0 = 
word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 @@ -7209,29 +4569,22 @@ cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 - packuswb m6, m7 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - vpermq m6, m6, q3120 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - movu [r2 + r5], xm3 + movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r5], xm7 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 @@ -7239,959 +4592,461 @@ cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 - movu [r2 + r5], m3 + movu [r2 + r6], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 - movu [r2 + r3 * 2], m6 - movu [r2 + r5], m7 %endif - lea r2, [r2 + r3 * 4] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm6, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm6, 1 - pmaddubsw m6, m10, m13 - paddw m8, m6 - pmaddubsw m10, m12 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 12 - punpckhbw xm7, xm11, xm6 - punpcklbw xm11, xm6 - vinserti128 m11, m11, xm7, 1 - pmaddubsw m7, m11, m13 - paddw m9, m7 - pmaddubsw m11, m12 - - movu xm7, [r0 + r1] ; m7 = row 13 - punpckhbw xm0, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm0, 1 - pmaddubsw m0, m6, m13 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + 
vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 - pmaddubsw m6, m12 movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm7, xm0 - punpcklbw xm7, xm0 - vinserti128 m7, m7, xm1, 1 - pmaddubsw m1, m7, m13 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 - pmaddubsw m7, m12 + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 +%endif + lea r2, [r2 + r3 * 4] + movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, m13 - paddw m6, m2 - pmaddubsw m0, m12 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, m13 - paddw m7, m3 - pmaddubsw m1, m12 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m13 - paddw m0, m4 - pmaddubsw m2, m12 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m13 - paddw m1, m5 - pmaddubsw m3, m12 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 %ifidn %1,pp pmulhrsw m8, 
m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m6, m14 ; m6 = word: row 12 - pmulhrsw m7, m14 ; m7 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 - packuswb m6, m7 - packuswb m0, m1 - vpermq m8, m8, q3120 - vpermq m10, m10, q3120 - vpermq m6, m6, q3120 - vpermq m0, m0, q3120 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 - vextracti128 xm7, m6, 1 - vextracti128 xm1, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 - movu [r2 + r5], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm6 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm0 - movu [r2 + r5], xm1 + movu [r2 + r6], xm11 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 - psubw m6, m14 ; m6 = word: row 12 - psubw m7, m14 ; m7 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 - movu [r2 + r5], m11 - lea r2, [r2 + r3 * 4] - movu [r2], m6 - movu [r2 + r3], m7 - movu [r2 + r3 * 2], m0 - movu [r2 + r5], m1 + movu [r2 + r6], m11 %endif - lea r2, [r2 + r3 * 4] + RET +%endif +%endmacro - movu xm5, [r0 + r4] ; m5 = row 19 + FILTER_VER_LUMA_AVX2_16x12 pp + FILTER_VER_LUMA_AVX2_16x12 ps + +%macro FILTER_VER_LUMA_AVX2_16x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, 
m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, m13 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 - pmaddubsw m4, m12 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, m13 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 - pmaddubsw m5, m12 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm0, xm6, xm7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 - vinserti128 m6, m6, xm0, 1 - pmaddubsw m0, m6, m13 - paddw m4, m0 - pmaddubsw m6, m12 - movu xm0, [r0 + r1 * 2] ; m0 = row 22 - punpckhbw xm1, xm7, xm0 - punpcklbw xm7, xm0 - vinserti128 m7, m7, xm1, 1 - pmaddubsw m1, m7, m13 - paddw m5, m1 - pmaddubsw m7, m12 - movu xm1, [r0 + r4] ; m1 = row 23 - punpckhbw xm8, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm8, 1 - pmaddubsw m8, m0, m13 - paddw m6, m8 - pmaddubsw m0, m12 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 
24 - punpckhbw xm9, xm1, xm8 - punpcklbw xm1, xm8 - vinserti128 m1, m1, xm9, 1 - pmaddubsw m9, m1, m13 - paddw m7, m9 - pmaddubsw m1, m12 - movu xm9, [r0 + r1] ; m9 = row 25 + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 - pmaddubsw m8, m13 - paddw m0, m8 - movu xm10, [r0 + r1 * 2] ; m10 = row 26 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 - pmaddubsw m9, m13 - paddw m1, m9 - + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + lea r4, [r3 * 3] %ifidn %1,pp - pmulhrsw m2, m14 ; m2 = word: row 16 - pmulhrsw m3, m14 ; m3 = word: row 17 - pmulhrsw m4, m14 ; m4 = word: row 18 - pmulhrsw m5, m14 ; m5 = word: row 19 - pmulhrsw m6, m14 ; m6 = word: row 20 - pmulhrsw m7, m14 ; m7 = word: row 21 - pmulhrsw m0, m14 ; m0 = word: row 22 - pmulhrsw m1, m14 ; m1 = word: row 23 + pmulhrsw m0, m14 ; m0 = word: row 0 + 
pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 - packuswb m6, m7 - packuswb m0, m1 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - vpermq m6, m6, q3120 - vpermq m0, m0, q3120 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm2 - movu [r2 + r3], xm3 - movu [r2 + r3 * 2], xm4 - movu [r2 + r5], xm5 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] - movu [r2], xm6 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm0 - movu [r2 + r5], xm1 + movu [r2], xm4 + movu [r2 + r3], xm5 %else - psubw m2, m14 ; m2 = word: row 16 - psubw m3, m14 ; m3 = word: row 17 - psubw m4, m14 ; m4 = word: row 18 - psubw m5, m14 ; m5 = word: row 19 - psubw m6, m14 ; m6 = word: row 20 - psubw m7, m14 ; m7 = word: row 21 - psubw m0, m14 ; m0 = word: row 22 - psubw m1, m14 ; m1 = word: row 23 - movu [r2], m2 - movu [r2 + r3], m3 - movu [r2 + r3 * 2], m4 - movu [r2 + r5], m5 + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 lea r2, [r2 + r3 * 4] - movu [r2], m6 - movu [r2 + r3], m7 - movu [r2 + r3 * 2], m0 - movu [r2 + r5], m1 + movu [r2], m4 + movu [r2 + r3], m5 +%endif + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, 
m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r4], m7 %endif RET %endif %endmacro - FILTER_VER_CHROMA_AVX2_16x24 pp - FILTER_VER_CHROMA_AVX2_16x24 ps + FILTER_VER_LUMA_AVX2_16x8 pp + FILTER_VER_LUMA_AVX2_16x8 ps -%macro FILTER_VER_CHROMA_AVX2_24x32 1 +%macro FILTER_VER_LUMA_AVX2_16x4 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 +cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 mov r4d, r4m - shl r4d, 6 - + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [tab_ChromaCoeffVer_32 + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - - mova m8, [r5] - mova m9, [r5 + mmsize] lea r4, [r1 * 3] - sub r0, r1 + sub r0, r4 %ifidn %1,pp - mova m7, [pw_512] + mova m12, [pw_512] %else add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - lea r6, [r3 * 3] - mov r5d, 2 -.loopH: - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, m8 - pmaddubsw m3, m2, m9 - paddw m4, m3 - pmaddubsw m2, m8 - - vextracti128 xm0, m0, 1 - lea r7, [r0 + r1 * 4] - vinserti128 m0, m0, [r7], 1 - - punpcklbw m5, m1, m0 - punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, m8 - pmaddubsw m3, m5, m9 - paddw m6, m3 - pmaddubsw m5, m8 -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 -%else - psubw m4, m7 ; m4 
= word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 + vbroadcasti128 m12, [pw_2000] %endif - - movu xm4, [r7 + r1 * 2] - vinserti128 m4, m4, [r7 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 - - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m1, m0, m9 - paddw m5, m1 - pmaddubsw m0, m8 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m6, m8 - + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + 
pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 %ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm5 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 %else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 + psubw m0, m12 ; m0 = word: row 0 + psubw m1, m12 ; m1 = word: row 1 + psubw m2, m12 ; m2 = word: row 2 + psubw m3, m12 ; m3 = word: row 3 + movu [r2], m0 + movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 - movu [r2 + r6], m5 -%endif - lea r8, [r2 + r3 * 4] - - movu xm1, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m1, m1, [r7], 1 - vinserti128 m4, m4, xm1, 1 - - punpcklbw m2, m4, m1 - punpckhbw m5, m4, m1 - vperm2i128 m3, m2, m5, 0x20 - vperm2i128 m2, m2, m5, 0x31 - pmaddubsw m5, m3, m9 - paddw m6, m5 - pmaddubsw m3, m8 - pmaddubsw m5, m2, m9 - paddw m0, m5 - pmaddubsw m2, m8 - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 4 - pmulhrsw m0, m7 ; m0 = word: row 5 - packuswb m6, m0 - vpermq m6, m6, 11011000b - vextracti128 xm0, m6, 1 - movu [r8], xm6 - movu [r8 + r3], xm0 -%else - psubw m6, m7 ; m6 = word: row 4 - psubw m0, m7 ; m0 = word: row 5 - movu [r8], m6 - movu [r8 + r3], m0 -%endif - - movu xm6, [r7 + r1 * 2] - vinserti128 m6, m6, [r7 + r1], 1 - vextracti128 
xm0, m6, 1 - vinserti128 m1, m1, xm0, 0 - - punpcklbw m4, m1, m6 - punpckhbw m5, m1, m6 - vperm2i128 m0, m4, m5, 0x20 - vperm2i128 m5, m4, m5, 0x31 - pmaddubsw m4, m0, m9 - paddw m2, m4 - pmaddubsw m0, m8 - pmaddubsw m4, m5, m9 - paddw m3, m4 - pmaddubsw m5, m8 - -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 6 - pmulhrsw m2, m7 ; m2 = word: row 7 - packuswb m3, m2 - vpermq m3, m3, 11011000b - vextracti128 xm2, m3, 1 - movu [r8 + r3 * 2], xm3 - movu [r8 + r6], xm2 -%else - psubw m3, m7 ; m3 = word: row 6 - psubw m2, m7 ; m2 = word: row 7 - movu [r8 + r3 * 2], m3 - movu [r8 + r6], m2 + lea r4, [r3 * 3] + movu [r2 + r4], m3 %endif - lea r8, [r8 + r3 * 4] - - movu xm3, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m3, m3, [r7], 1 - vinserti128 m6, m6, xm3, 1 - - punpcklbw m2, m6, m3 - punpckhbw m1, m6, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, m9 - paddw m5, m1 - pmaddubsw m4, m8 - pmaddubsw m1, m2, m9 - paddw m0, m1 - pmaddubsw m2, m8 - -%ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 8 - pmulhrsw m0, m7 ; m0 = word: row 9 - packuswb m5, m0 - vpermq m5, m5, 11011000b - vextracti128 xm0, m5, 1 - movu [r8], xm5 - movu [r8 + r3], xm0 -%else - psubw m5, m7 ; m5 = word: row 8 - psubw m0, m7 ; m0 = word: row 9 - movu [r8], m5 - movu [r8 + r3], m0 + RET %endif +%endmacro - movu xm5, [r7 + r1 * 2] - vinserti128 m5, m5, [r7 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m1, m6, m9 - paddw m2, m1 - pmaddubsw m6, m8 - pmaddubsw m1, m0, m9 - paddw m4, m1 - pmaddubsw m0, m8 + FILTER_VER_LUMA_AVX2_16x4 pp + FILTER_VER_LUMA_AVX2_16x4 ps +%macro FILTER_VER_LUMA_AVX2_16xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 10 - pmulhrsw m2, m7 ; m2 = word: row 11 - packuswb m4, m2 - vpermq m4, m4, 
11011000b - vextracti128 xm2, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm2 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - psubw m4, m7 ; m4 = word: row 10 - psubw m2, m7 ; m2 = word: row 11 - movu [r8 + r3 * 2], m4 - movu [r8 + r6], m2 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r8, [r8 + r3 * 4] - - movu xm3, [r7 + r4] - lea r7, [r7 + r1 * 4] - vinserti128 m3, m3, [r7], 1 - vinserti128 m5, m5, xm3, 1 - - punpcklbw m2, m5, m3 - punpckhbw m1, m5, m3 - vperm2i128 m4, m2, m1, 0x20 - vperm2i128 m2, m2, m1, 0x31 - pmaddubsw m1, m4, m9 - paddw m0, m1 - pmaddubsw m4, m8 - pmaddubsw m1, m2, m9 - paddw m6, m1 - pmaddubsw m2, m8 -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 12 - pmulhrsw m6, m7 ; m6 = word: row 13 - packuswb m0, m6 - vpermq m0, m0, 11011000b - vextracti128 xm6, m0, 1 - movu [r8], xm0 - movu [r8 + r3], xm6 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %3,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] %else - psubw m0, m7 ; m0 = word: row 12 - psubw m6, m7 ; m6 = word: row 13 - movu [r8], m0 - movu [r8 + r3], m6 + mova m14, [pw_512] %endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 - movu xm5, [r7 + r1 * 2] - vinserti128 m5, m5, [r7 + r1], 1 - vextracti128 xm0, m5, 1 - vinserti128 m3, m3, xm0, 0 - - punpcklbw m1, m3, m5 - punpckhbw m0, m3, m5 - vperm2i128 m6, m1, m0, 0x20 - vperm2i128 m0, m1, m0, 0x31 - pmaddubsw m6, m9 - paddw m2, m6 - pmaddubsw m0, m9 - paddw m4, m0 - -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 14 - pmulhrsw m2, m7 ; m2 = word: row 15 - packuswb m4, m2 - vpermq m4, m4, 11011000b - vextracti128 xm2, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm2 - add r2, 16 -%else - psubw m4, m7 ; m4 = word: row 14 - psubw m2, m7 ; m2 = word: row 15 - movu [r8 + r3 * 2], m4 - movu [r8 + r6], m2 - add r2, 32 -%endif - add r0, 16 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 
- pmaddubsw m5, m8 - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r7, [r0 + r1 * 4] - movq xm1, [r7] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, m9 - paddw m5, m0 - pmaddubsw m2, m8 - movq xm3, [r7 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m0, m1, m9 - paddw m2, m0 - pmaddubsw m1, m8 - movq xm3, [r7 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r7, [r7 + r1 * 4] - movq xm0, [r7] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, m9 - paddw m1, m3 - pmaddubsw m4, m8 - movq xm3, [r7 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r7 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, m9 - paddw m4, m3 - pmaddubsw m0, m8 - -%ifidn %1,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r8, [r2 + r3 * 4] - movq [r8], xm1 - movq [r8 + r3], xm4 - movhps [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm4 -%else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m1, 1 - lea r8, [r2 + r3 * 4] - movu [r8], xm1 - movu [r8 + r3], xm3 - vextracti128 xm3, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm3 -%endif - lea r8, [r8 + r3 * 4] - - movq xm3, [r7 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r7, [r7 + r1 * 4] - movq xm5, [r7] ; m5 = row 12 - punpcklbw xm3, xm5 - vinserti128 
m6, m6, xm3, 1 - pmaddubsw m3, m6, m9 - paddw m0, m3 - pmaddubsw m6, m8 - movq xm3, [r7 + r1] ; m3 = row 13 - punpcklbw xm5, xm3 - movq xm2, [r7 + r1 * 2] ; m2 = row 14 - punpcklbw xm3, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, m9 - paddw m6, m3 - pmaddubsw m5, m8 - movq xm3, [r7 + r4] ; m3 = row 15 - punpcklbw xm2, xm3 - lea r7, [r7 + r1 * 4] - movq xm1, [r7] ; m1 = row 16 - punpcklbw xm3, xm1 - vinserti128 m2, m2, xm3, 1 - pmaddubsw m3, m2, m9 - paddw m5, m3 - pmaddubsw m2, m8 - movq xm3, [r7 + r1] ; m3 = row 17 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 18 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, m9 - paddw m2, m3 -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 8, row 9 - pmulhrsw m6, m7 ; m6 = word: row 10, row 11 - pmulhrsw m5, m7 ; m5 = word: row 12, row 13 - pmulhrsw m2, m7 ; m2 = word: row 14, row 15 - packuswb m0, m6 - packuswb m5, m2 - vextracti128 xm6, m0, 1 - vextracti128 xm2, m5, 1 - movq [r8], xm0 - movq [r8 + r3], xm6 - movhps [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm6 - lea r8, [r8 + r3 * 4] - movq [r8], xm5 - movq [r8 + r3], xm2 - movhps [r8 + r3 * 2], xm5 - movhps [r8 + r6], xm2 - lea r2, [r8 + r3 * 4 - 16] -%else - psubw m0, m7 ; m0 = word: row 8, row 9 - psubw m6, m7 ; m6 = word: row 10, row 11 - psubw m5, m7 ; m5 = word: row 12, row 13 - psubw m2, m7 ; m2 = word: row 14, row 15 - vextracti128 xm3, m0, 1 - movu [r8], xm0 - movu [r8 + r3], xm3 - vextracti128 xm3, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm3 - vextracti128 xm3, m5, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm5 - movu [r8 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r8 + r3 * 2], xm2 - movu [r8 + r6], xm3 - lea r2, [r8 + r3 * 4 - 32] -%endif - lea r0, [r7 - 16] - dec r5d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_24x32 pp - FILTER_VER_CHROMA_AVX2_24x32 ps - -%macro FILTER_VER_CHROMA_AVX2_24x64 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 - mov r4d, r4m - shl 
r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - lea r5, [r3 * 3] - mov r6d, 16 -.loopH: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + 16], xm2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2], m0 - movu [r2 + mmsize], xm2 -%endif - lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + r3 + 16], xm4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], xm4 -%endif - - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + r3 * 2 + 16], xm6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - 
vperm2i128 m6, m6, m7, 0x31 - movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], xm6 -%endif - - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r5 + 16], xm2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], xm2 -%endif - lea r2, [r2 + r3 * 4] - dec r6d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_24x64 pp - FILTER_VER_CHROMA_AVX2_24x64 ps - -%macro FILTER_VER_CHROMA_AVX2_16x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - mova m7, [pw_2000] -%endif - - movu xm0, [r0] - vinserti128 m0, m0, [r0 + r1 * 2], 1 - movu xm1, [r0 + r1] - vinserti128 m1, m1, [r0 + r4], 1 - - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - vperm2i128 m4, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - pmaddubsw m4, [r5] - pmaddubsw m3, m2, [r5 + mmsize] - paddw m4, m3 - pmaddubsw m2, [r5] - - vextracti128 xm0, m0, 1 - lea r0, [r0 + r1 * 4] - vinserti128 m0, m0, [r0], 1 - - punpcklbw m5, m1, m0 - punpckhbw m3, m1, m0 - vperm2i128 m6, m5, m3, 0x20 - vperm2i128 m5, m5, m3, 0x31 - pmaddubsw m6, [r5] - pmaddubsw m3, m5, [r5 + mmsize] - paddw m6, m3 - pmaddubsw m5, [r5] -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 0 - pmulhrsw m6, m7 ; m6 = word: row 1 - packuswb m4, m6 - vpermq m4, m4, 11011000b - vextracti128 xm6, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm6 -%else - psubw m4, m7 ; m4 = word: row 0 - psubw m6, m7 ; m6 = word: row 1 - movu [r2], m4 - movu [r2 + r3], m6 -%endif - lea r2, [r2 + r3 * 2] - - 
movu xm4, [r0 + r1 * 2] - vinserti128 m4, m4, [r0 + r1], 1 - vextracti128 xm1, m4, 1 - vinserti128 m0, m0, xm1, 0 - - punpcklbw m6, m0, m4 - punpckhbw m1, m0, m4 - vperm2i128 m0, m6, m1, 0x20 - vperm2i128 m6, m6, m1, 0x31 - pmaddubsw m0, [r5 + mmsize] - paddw m5, m0 - pmaddubsw m6, [r5 + mmsize] - paddw m2, m6 - -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m5, m7 ; m5 = word: row 3 - packuswb m2, m5 - vpermq m2, m2, 11011000b - vextracti128 xm5, m2, 1 - movu [r2], xm2 - movu [r2 + r3], xm5 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m5, m7 ; m5 = word: row 3 - movu [r2], m2 - movu [r2 + r3], m5 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_16x4 pp - FILTER_VER_CHROMA_AVX2_16x4 ps - -%macro FILTER_VER_CHROMA_AVX2_12xN 2 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m7, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - lea r6, [r3 * 3] -%rep %2 / 16 +.loop: movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 @@ -8218,11332 +5073,1723 @@ cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] -%ifidn %1,pp - pmulhrsw m0, m7 ; m0 = word: row 0 - pmulhrsw m1, m7 ; m1 = word: row 1 - packuswb m0, m1 - vextracti128 xm1, m0, 1 - movq [r2], xm0 - movd [r2 + 8], xm1 - movhps [r2 + r3], xm0 - pextrd [r2 + r3 + 8], xm1, 2 -%else - psubw m0, m7 ; m0 = word: row 0 - psubw m1, m7 ; m1 = word: row 1 - movu [r2], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + 16], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 -%endif - movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] 
paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm0, xm5, xm6 + punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 - vinserti128 m5, m5, xm0, 1 - pmaddubsw m0, m5, [r5 + 1 * mmsize] - paddw m3, m0 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 pmaddubsw m5, [r5] -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 2 - pmulhrsw m3, m7 ; m3 = word: row 3 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + 
r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %3,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - movq [r2 + r3 * 2], xm2 - movd [r2 + r3 * 2 + 8], xm3 - movhps [r2 + r6], xm2 - pextrd [r2 + r6 + 8], xm3, 2 -%else - psubw m2, m7 ; m2 = word: row 2 - psubw m3, m7 ; m3 = word: row 3 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 movu [r2 + r6], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + r6 + 16], xm3 -%endif lea r2, [r2 + r3 * 4] - - movu xm0, [r0 + r4] ; m0 = row 7 - punpckhbw xm3, xm6, xm0 - punpcklbw xm6, xm0 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm3, [r0] ; m3 = row 8 - punpckhbw xm1, xm0, xm3 - punpcklbw xm0, xm3 - vinserti128 m0, m0, xm1, 1 - pmaddubsw m1, m0, [r5 + 1 * mmsize] - paddw m5, m1 - pmaddubsw m0, [r5] -%ifidn %1,pp - pmulhrsw m4, m7 ; m4 = word: row 4 - pmulhrsw m5, m7 ; m5 = word: row 5 - packuswb m4, m5 - vextracti128 xm5, m4, 1 - movq [r2], xm4 - movd [r2 + 8], xm5 - movhps [r2 + r3], xm4 - pextrd [r2 + r3 + 8], xm5, 2 -%else - psubw m4, m7 ; m4 = word: row 4 - psubw m5, m7 ; m5 = word: row 5 movu [r2], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + 16], xm4 movu [r2 + r3], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r3 + 16], 
xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 %endif - movu xm1, [r0 + r1] ; m1 = row 9 - punpckhbw xm2, xm3, xm1 - punpcklbw xm3, xm1 - vinserti128 m3, m3, xm2, 1 - pmaddubsw m2, m3, [r5 + 1 * mmsize] - paddw m6, m2 - pmaddubsw m3, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 10 - punpckhbw xm4, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm4, 1 - pmaddubsw m4, m1, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m1, [r5] + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 6 - pmulhrsw m0, m7 ; m0 = word: row 7 - packuswb m6, m0 - vextracti128 xm0, m6, 1 - movq [r2 + r3 * 2], xm6 - movd [r2 + r3 * 2 + 8], xm0 - movhps [r2 + r6], xm6 - pextrd [r2 + r6 + 8], xm0, 2 -%else - psubw m6, m7 ; m6 = word: row 6 - psubw m0, m7 ; m0 = word: row 7 +%ifidn %3,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + r3 * 2 + 16], xm6 - movu [r2 + r6], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r6 + 16], xm0 -%endif 
- lea r2, [r2 + r3 * 4] - - movu xm4, [r0 + r4] ; m4 = row 11 - punpckhbw xm6, xm2, xm4 - punpcklbw xm2, xm4 - vinserti128 m2, m2, xm6, 1 - pmaddubsw m6, m2, [r5 + 1 * mmsize] - paddw m3, m6 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 12 - punpckhbw xm0, xm4, xm6 - punpcklbw xm4, xm6 - vinserti128 m4, m4, xm0, 1 - pmaddubsw m0, m4, [r5 + 1 * mmsize] - paddw m1, m0 - pmaddubsw m4, [r5] -%ifidn %1,pp - pmulhrsw m3, m7 ; m3 = word: row 8 - pmulhrsw m1, m7 ; m1 = word: row 9 - packuswb m3, m1 - vextracti128 xm1, m3, 1 - movq [r2], xm3 - movd [r2 + 8], xm1 - movhps [r2 + r3], xm3 - pextrd [r2 + r3 + 8], xm1, 2 + movu [r2 + r6], xm7 %else - psubw m3, m7 ; m3 = word: row 8 - psubw m1, m7 ; m1 = word: row 9 - movu [r2], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + 16], xm3 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 %endif - movu xm0, [r0 + r1] ; m0 = row 13 - punpckhbw xm1, xm6, xm0 - punpcklbw xm6, xm0 - vinserti128 m6, m6, xm1, 1 - pmaddubsw m1, m6, [r5 + 1 * mmsize] - paddw m2, m1 - pmaddubsw m6, [r5] - movu xm1, [r0 + r1 * 2] ; m1 = row 14 - punpckhbw xm5, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm5, 1 - pmaddubsw m5, m0, [r5 + 1 * mmsize] - paddw m4, m5 - pmaddubsw m0, [r5] -%ifidn %1,pp - pmulhrsw m2, m7 ; m2 = word: row 10 - pmulhrsw m4, m7 ; m4 = word: row 11 - packuswb m2, m4 - vextracti128 xm4, m2, 1 - movq [r2 + r3 * 2], xm2 - movd [r2 + r3 * 2 + 8], xm4 - movhps [r2 + r6], xm2 - pextrd [r2 + r6 + 8], xm4, 2 -%else - psubw m2, m7 ; m2 = word: row 10 - psubw m4, m7 ; m4 = word: row 11 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 - movu [r2 + r6], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + r6 + 16], xm4 -%endif lea r2, [r2 + r3 * 4] - movu xm5, [r0 + r4] ; m5 = row 15 - punpckhbw xm2, xm1, xm5 - punpcklbw xm1, xm5 - vinserti128 m1, m1, xm2, 1 
- pmaddubsw m2, m1, [r5 + 1 * mmsize] - paddw m6, m2 - pmaddubsw m1, [r5] + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm5, xm2 - punpcklbw xm5, xm2 - vinserti128 m5, m5, xm3, 1 - pmaddubsw m3, m5, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m5, [r5] + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 pmaddubsw m2, [r5 + 1 * mmsize] - paddw m1, m2 + paddw m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm2, xm3, xm4 + punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 - vinserti128 m3, m3, xm2, 1 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 pmaddubsw m3, [r5 + 1 * mmsize] - paddw m5, m3 - -%ifidn %1,pp - pmulhrsw m6, m7 ; m6 = word: row 12 - pmulhrsw m0, m7 ; m0 = word: row 13 - pmulhrsw m1, m7 ; m1 = word: row 14 - pmulhrsw m5, m7 ; m5 = word: row 15 - packuswb m6, m0 - packuswb m1, m5 - vextracti128 xm0, m6, 1 - vextracti128 xm5, m1, 1 - movq [r2], xm6 - movd [r2 + 8], xm0 - movhps [r2 + r3], xm6 - pextrd [r2 + r3 + 8], xm0, 2 - movq [r2 + r3 * 2], xm1 - movd [r2 + r3 * 2 + 8], xm5 - movhps [r2 + r6], xm1 - pextrd [r2 + r6 + 8], xm5, 2 -%else - psubw m6, m7 ; m6 = word: row 12 - psubw m0, m7 ; m0 = word: row 13 - psubw m1, m7 ; m1 = word: row 14 - 
psubw m5, m7 ; m5 = word: row 15 - movu [r2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + 16], xm6 - movu [r2 + r3], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r3 + 16], xm0 - movu [r2 + r3 * 2], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 * 2 + 16], xm1 - movu [r2 + r6], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r6 + 16], xm5 -%endif - lea r2, [r2 + r3 * 4] -%endrep - RET -%endmacro - - FILTER_VER_CHROMA_AVX2_12xN pp, 16 - FILTER_VER_CHROMA_AVX2_12xN ps, 16 - FILTER_VER_CHROMA_AVX2_12xN pp, 32 - FILTER_VER_CHROMA_AVX2_12xN ps, 32 - -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W24 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m7, [r5 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_512] - - pmulhrsw m4, m6 - pmulhrsw m2, m6 - - packuswb m4, m2 - - movu [r2], m4 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m2, [r5 + 2 * r1] - - punpcklbw m5, m7, m2 - punpckhbw m7, m2 - - pmaddubsw m5, m0 - pmaddubsw m7, m0 - - paddw m4, m5 - paddw m3, m7 - - pmulhrsw m4, m6 - pmulhrsw m3, m6 - - packuswb m4, m3 - - movu [r2 + r3], m4 - - movq m2, [r0 + 16] - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - - punpcklbw m2, m3 - punpcklbw m4, m5 - - pmaddubsw m2, m1 - 
pmaddubsw m4, m0 - - paddw m2, m4 - - pmulhrsw m2, m6 - - movq m3, [r0 + r1 + 16] - movq m4, [r5 + 16] - movq m5, [r5 + r1 + 16] - movq m7, [r5 + 2 * r1 + 16] - - punpcklbw m3, m4 - punpcklbw m5, m7 - - pmaddubsw m3, m1 - pmaddubsw m5, m0 - - paddw m3, m5 - - pmulhrsw m3, m6 - packuswb m2, m3 - - movh [r2 + 16], m2 - movhps [r2 + r3 + 16], m2 - - mov r0, r5 - lea r2, [r2 + 2 * r3] - - sub r4, 2 - jnz .loop - RET -%endmacro - - FILTER_V4_W24 24, 32 - - FILTER_V4_W24 24, 64 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mova m7, [pw_512] - - mov r4d, %2 - -.loop: - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m3, [r5] - movu m5, [r5 + r1] - - punpcklbw m6, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m6, m0 - pmaddubsw m3, m0 - - paddw m4, m6 - paddw m2, m3 - - pmulhrsw m4, m7 - pmulhrsw m2, m7 - - packuswb m4, m2 - - movu [r2], m4 - - movu m2, [r0 + 16] - movu m3, [r0 + r1 + 16] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - movu m3, [r5 + 16] - movu m5, [r5 + r1 + 16] - - punpcklbw m6, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m6, m0 - pmaddubsw m3, m0 - - paddw m4, m6 - paddw m2, m3 - - pmulhrsw m4, m7 - pmulhrsw m2, m7 - - packuswb m4, m2 - - movu [r2 + 16], m4 - - lea r0, [r0 + r1] - lea r2, [r2 + r3] - - dec r4 - jnz .loop - RET -%endmacro - - FILTER_V4_W32 32, 8 - FILTER_V4_W32 32, 16 - FILTER_V4_W32 32, 24 - 
FILTER_V4_W32 32, 32 - - FILTER_V4_W32 32, 48 - FILTER_V4_W32 32, 64 - -%macro FILTER_VER_CHROMA_AVX2_32xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif - - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - lea r5, [r3 * 3] - mov r6d, %2 / 4 -.loopW: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2], m0 - movu [r2 + mmsize], m2 -%endif + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif + movu xm6, [r0] ; m6 = row 20 + punpckhbw 
xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 +%ifidn %3,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 %else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 m6, m6, m7, 0x31 + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + 
movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m12 + movu [r2 + r3], m13 movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 + movu [r2 + r6], m1 %endif - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], m2 -%endif lea r2, [r2 + r3 * 4] - dec r6d - jnz .loopW + sub r0, r7 + dec r8d + jnz .loop RET %endif %endmacro - FILTER_VER_CHROMA_AVX2_32xN pp, 64 - FILTER_VER_CHROMA_AVX2_32xN pp, 48 - FILTER_VER_CHROMA_AVX2_32xN pp, 32 - FILTER_VER_CHROMA_AVX2_32xN pp, 24 - FILTER_VER_CHROMA_AVX2_32xN pp, 16 - FILTER_VER_CHROMA_AVX2_32xN pp, 8 - FILTER_VER_CHROMA_AVX2_32xN ps, 64 - FILTER_VER_CHROMA_AVX2_32xN ps, 48 - FILTER_VER_CHROMA_AVX2_32xN ps, 32 - FILTER_VER_CHROMA_AVX2_32xN ps, 24 - FILTER_VER_CHROMA_AVX2_32xN ps, 16 - FILTER_VER_CHROMA_AVX2_32xN ps, 8 - -%macro FILTER_VER_CHROMA_AVX2_48x64 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_48x64, 4, 8, 13 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] -%endif + FILTER_VER_LUMA_AVX2_16xN 16, 32, pp + FILTER_VER_LUMA_AVX2_16xN 16, 64, pp + FILTER_VER_LUMA_AVX2_16xN 16, 32, ps + FILTER_VER_LUMA_AVX2_16xN 16, 64, ps - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - lea r5, [r3 * 3] - lea r7, [r1 * 4] - mov r6d, 16 -.loopH: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, 
m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2], m0 - movu [r2 + mmsize], m2 -%endif - lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif - - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 -%ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 m6, m6, m7, 0x31 - movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 -%endif - - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 - add r2, 32 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2 + r5], m0 - movu [r2 + r5 + mmsize], m2 - add r2, 64 -%endif - sub r0, r7 - - movu xm0, [r0 + 32] ; m0 = row 0 - movu xm1, [r0 + r1 + 32] ; m1 = row 1 +%macro PROCESS_LUMA_AVX2_W16_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = 
row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, m10 - movu xm2, [r0 + r1 * 2 + 32] ; m2 = row 2 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, m10 - movu xm3, [r0 + r4 + 32] ; m3 = row 3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, m11 + pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 - pmaddubsw m2, m10 - lea r0, [r0 + r1 * 4] - movu xm4, [r0 + 32] ; m4 = row 4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, m11 + pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 - pmaddubsw m3, m10 - movu xm5, [r0 + r1 + 32] ; m5 = row 5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m4, m11 - paddw m2, m4 - movu xm6, [r0 + r1 * 2 + 32] ; m6 = row 6 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m5, m11 - paddw m3, m5 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * 
mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + %ifidn %1,pp - pmulhrsw m0, m12 ; m0 = word: row 0 - pmulhrsw m1, m12 ; m1 = word: row 1 - pmulhrsw m2, m12 ; m2 = word: row 2 - pmulhrsw m3, m12 ; m3 = word: row 3 + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 + packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], 
xm2 - movu [r2 + r5], xm3 - lea r2, [r2 + r3 * 4 - 32] + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 %else - psubw m0, m12 ; m0 = word: row 0 - psubw m1, m12 ; m1 = word: row 1 - psubw m2, m12 ; m2 = word: row 2 - psubw m3, m12 ; m3 = word: row 3 + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 - movu [r2 + r5], m3 - lea r2, [r2 + r3 * 4 - 64] -%endif - dec r6d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_48x64 pp - FILTER_VER_CHROMA_AVX2_48x64 ps - -%macro FILTER_VER_CHROMA_AVX2_64xN 2 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_64x%2, 4, 8, 13 - mov r4d, r4m - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer_32 + r4] + movu [r2 + r6], m3 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 %endif - mova m10, [r5] - mova m11, [r5 + mmsize] - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - lea r5, [r3 * 3] - lea r7, [r1 * 4] - mov r6d, %2 / 4 -.loopH: -%assign x 0 -%rep 2 - movu m0, [r0 + x] ; m0 = row 0 - movu m1, [r0 + r1 + x] ; m1 = row 1 - punpcklbw m2, m0, m1 - punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2 + x] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4 + x] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 
0x31 - movu [r2], m0 - movu [r2 + mmsize], m2 -%endif - lea r0, [r0 + r1 * 4] - movu m0, [r0 + x] ; m0 = row 4 - punpcklbw m2, m1, m0 - punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 -%ifidn %1,pp - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 -%else - psubw m4, m12 - psubw m5, m12 - vperm2i128 m1, m4, m5, 0x20 - vperm2i128 m4, m4, m5, 0x31 - movu [r2 + r3], m1 - movu [r2 + r3 + mmsize], m4 -%endif + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] - movu m1, [r0 + r1 + x] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 %ifidn %1,pp - pmulhrsw m6, m12 - pmulhrsw m7, m12 + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 - movu [r2 + r3 * 2], m6 -%else - psubw m6, m12 - psubw m7, m12 - vperm2i128 m0, m6, m7, 0x20 - vperm2i128 m6, m6, m7, 0x31 - movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + mmsize], m6 -%endif - - movu m0, [r0 + r1 * 2 + x] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 - paddw m2, m6 - paddw m3, m7 -%ifidn %1,pp - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2 + r5], m2 - add r2, 32 -%else - psubw m2, m12 - psubw m3, m12 - vperm2i128 m0, m2, m3, 0x20 - vperm2i128 m2, m2, m3, 0x31 - movu [r2 + 
r5], m0 - movu [r2 + r5 + mmsize], m2 - add r2, 64 -%endif - sub r0, r7 -%assign x x+32 -%endrep -%ifidn %1,pp - lea r2, [r2 + r3 * 4 - 64] -%else - lea r2, [r2 + r3 * 4 - 128] -%endif - add r0, r7 - dec r6d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_AVX2_64xN pp, 64 - FILTER_VER_CHROMA_AVX2_64xN pp, 48 - FILTER_VER_CHROMA_AVX2_64xN pp, 32 - FILTER_VER_CHROMA_AVX2_64xN pp, 16 - FILTER_VER_CHROMA_AVX2_64xN ps, 64 - FILTER_VER_CHROMA_AVX2_64xN ps, 48 - FILTER_VER_CHROMA_AVX2_64xN ps, 32 - FILTER_VER_CHROMA_AVX2_64xN ps, 16 - -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W16n_H2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 - - mov r4d, r4m - sub r0, r1 - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %else - movd m0, [tab_ChromaCoeff + r4 * 4] + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - - mov r4d, %2/2 + lea r8, [r8 + r3 * 4] -.loop: - - mov r6d, %1/16 - -.loopW: - - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m6, [r5 + r1] - - punpckhbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m2, m7 - - punpcklbw m7, m5, m6 - pmaddubsw m7, m0 - paddw m4, m7 - - mova m7, [pw_512] - - pmulhrsw m4, m7 - pmulhrsw m2, m7 - - packuswb m4, m2 - - movu [r2], m4 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r5 + 2 * r1] - - punpcklbw m2, m6, m5 - punpckhbw m6, m5 - - pmaddubsw m2, m0 - pmaddubsw m6, m0 
- - paddw m4, m2 - paddw m3, m6 - - pmulhrsw m4, m7 - pmulhrsw m3, m7 - - packuswb m4, m3 - - movu [r2 + r3], m4 - - add r0, 16 - add r2, 16 - dec r6d - jnz .loopW - - lea r0, [r0 + r1 * 2 - %1] - lea r2, [r2 + r3 * 2 - %1] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V4_W16n_H2 64, 64 - FILTER_V4_W16n_H2 64, 32 - FILTER_V4_W16n_H2 64, 48 - FILTER_V4_W16n_H2 48, 64 - FILTER_V4_W16n_H2 64, 16 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_2xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_2x%1, 3, 4, 3 - mov r3d, r3m - add r3d, r3d - - ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] - -%rep %1/2 - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - punpcklbw m0, m1 - pmaddubsw m0, m2 - - movd [r2 + r3 * 0], m0 - pextrd [r2 + r3 * 1], m0, 2 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] -%endrep - RET -%endmacro - P2S_H_2xN 4 - P2S_H_2xN 8 - P2S_H_2xN 16 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_4xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_4x%1, 3, 6, 4 - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load constant - mova m2, [pb_128] - mova m3, [tab_c_64_n64] - -%assign x 0 -%rep %1/4 - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - punpcklbw m0, m2 - pmaddubsw m0, m3 - - movd m1, [r0 + r1 * 2] - pinsrd m1, [r0 + r5], 1 - punpcklbw m1, m2 - pmaddubsw m1, m3 - - movq [r2 + r3 * 0], m0 - movq [r2 + r3 * 2], m1 - movhps [r2 + r3 * 1], m0 - movhps [r2 + r4], m1 -%assign x x+1 -%if (x != %1/4) - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] -%endif -%endrep - RET -%endmacro - P2S_H_4xN 4 - 
P2S_H_4xN 8 - P2S_H_4xN 16 - P2S_H_4xN 32 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_6xN 1 -INIT_XMM sse4 -cglobal filterPixelToShort_6x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - - ; load height - mov r6d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r4] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movh [r2 + r3 * 0], m0 - pextrd [r2 + r3 * 0 + 8], m0, 2 - movh [r2 + r3 * 1], m1 - pextrd [r2 + r3 * 1 + 8], m1, 2 - movh [r2 + r3 * 2], m2 - pextrd [r2 + r3 * 2 + 8], m2, 2 - movh [r2 + r5], m3 - pextrd [r2 + r5 + 8], m3, 2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_6xN 8 - P2S_H_6xN 16 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_8xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_8x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6 ], m3 - - lea 
r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r4d - jnz .loop - RET -%endmacro - P2S_H_8xN 8 - P2S_H_8xN 4 - P2S_H_8xN 16 - P2S_H_8xN 32 - P2S_H_8xN 12 - P2S_H_8xN 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_8x6, 3, 7, 5 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r1 * 5] - lea r6, [r3 * 3] - - ; load constant - mova m3, [pb_128] - mova m4, [tab_c_64_n64] - - movh m0, [r0] - punpcklbw m0, m3 - pmaddubsw m0, m4 - - movh m1, [r0 + r1] - punpcklbw m1, m3 - pmaddubsw m1, m4 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - - movh m0, [r0 + r4] - punpcklbw m0, m3 - pmaddubsw m0, m4 - - movh m1, [r0 + r1 * 4] - punpcklbw m1, m3 - pmaddubsw m1, m4 - - movh m2, [r0 + r5] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r6 ], m0 - movu [r2 + r3 * 4], m1 - lea r2, [r2 + r3 * 4] - movu [r2 + r3], m2 - - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_16xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_16x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load height - mov r6d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 
+ r3 * 2], m2 - movu [r2 + r4], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 1 + 16], m1 - movu [r2 + r3 * 2 + 16], m2 - movu [r2 + r4 + 16], m3 - - lea r0, [r0 + r1 * 4 - 8] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_16xN 16 - P2S_H_16xN 4 - P2S_H_16xN 8 - P2S_H_16xN 12 - P2S_H_16xN 32 - P2S_H_16xN 64 - P2S_H_16xN 24 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x4, 3, 4, 2 - mov r3d, r3m - add r3d, r3d - - ; load constant - vbroadcasti128 m1, [pw_2000] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - lea r1, [r1 * 3] - lea r3, [r3 * 3] - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x8, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - - ; load constant - vbroadcasti128 m1, [pw_2000] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 
- movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x12, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - - ; load constant - vbroadcasti128 m1, [pw_2000] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - RET - 
-;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x16, 3, 6, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - - ; load constant - vbroadcasti128 m1, [pw_2000] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) 
-;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_16x24, 3, 7, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, 3 - - ; load constant - vbroadcasti128 m1, [pw_2000] -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_16xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_16x%1, 3, 7, 2 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/16 - - ; load constant - vbroadcasti128 m1, [pw_2000] -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - 
- pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro -P2S_H_16xN_avx2 32 -P2S_H_16xN_avx2 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_32xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_32x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load height - mov r6d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - 
movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 1 + 16], m1 - movu [r2 + r3 * 2 + 16], m2 - movu [r2 + r4 + 16], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 32], m0 - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 2 + 32], m2 - movu [r2 + r4 + 32], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 48], m0 - movu [r2 + r3 * 1 + 48], m1 - movu [r2 + r3 * 2 + 48], m2 - movu [r2 + r4 + 48], m3 - - lea r0, [r0 + r1 * 4 - 24] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_32xN 32 - P2S_H_32xN 8 - P2S_H_32xN 16 - P2S_H_32xN 24 - P2S_H_32xN 64 - P2S_H_32xN 48 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_32xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_32x%1, 3, 7, 3 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, %1/4 - - ; load constant - vpbroadcastd m2, [pw_2000] - -.loop: - pmovzxbw m0, [r0 + 0 * mmsize/2] - pmovzxbw m1, [r0 + 1 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psubw m0, m2 - psubw m1, m2 - movu [r2 + 0 * mmsize], m0 - movu [r2 + 1 * mmsize], m1 - - pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psubw 
m0, m2 - psubw m1, m2 - movu [r2 + r3 + 0 * mmsize], m0 - movu [r2 + r3 + 1 * mmsize], m1 - - pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psubw m0, m2 - psubw m1, m2 - movu [r2 + r3 * 2 + 0 * mmsize], m0 - movu [r2 + r3 * 2 + 1 * mmsize], m1 - - pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psubw m0, m2 - psubw m1, m2 - movu [r2 + r6 + 0 * mmsize], m0 - movu [r2 + r6 + 1 * mmsize], m1 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r4d - jnz .loop - RET -%endmacro - P2S_H_32xN_avx2 32 - P2S_H_32xN_avx2 8 - P2S_H_32xN_avx2 16 - P2S_H_32xN_avx2 24 - P2S_H_32xN_avx2 64 - P2S_H_32xN_avx2 48 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_64xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_64x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load height - mov r6d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 1 + 16], m1 - movu [r2 + r3 * 2 + 16], m2 - movu [r2 + r4 + 16], m3 - - lea r0, [r0 + 8] - - 
movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 32], m0 - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 2 + 32], m2 - movu [r2 + r4 + 32], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 48], m0 - movu [r2 + r3 * 1 + 48], m1 - movu [r2 + r3 * 2 + 48], m2 - movu [r2 + r4 + 48], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 64], m0 - movu [r2 + r3 * 1 + 64], m1 - movu [r2 + r3 * 2 + 64], m2 - movu [r2 + r4 + 64], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 80], m0 - movu [r2 + r3 * 1 + 80], m1 - movu [r2 + r3 * 2 + 80], m2 - movu [r2 + r4 + 80], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 96], m0 - movu [r2 + r3 * 1 + 96], m1 - movu [r2 + r3 * 2 + 96], m2 - movu [r2 + r4 + 96], m3 - - lea r0, [r0 + 8] - - movh m0, [r0] - punpcklbw m0, m4 - pmaddubsw m0, m5 - - movh m1, [r0 + r1] - punpcklbw 
m1, m4 - pmaddubsw m1, m5 - - movh m2, [r0 + r1 * 2] - punpcklbw m2, m4 - pmaddubsw m2, m5 - - movh m3, [r0 + r5] - punpcklbw m3, m4 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0 + 112], m0 - movu [r2 + r3 * 1 + 112], m1 - movu [r2 + r3 * 2 + 112], m2 - movu [r2 + r4 + 112], m3 - - lea r0, [r0 + r1 * 4 - 56] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_64xN 64 - P2S_H_64xN 16 - P2S_H_64xN 32 - P2S_H_64xN 48 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_64xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_64x%1, 3, 7, 5 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, %1/4 - - ; load constant - vpbroadcastd m4, [pw_2000] - -.loop: - pmovzxbw m0, [r0 + 0 * mmsize/2] - pmovzxbw m1, [r0 + 1 * mmsize/2] - pmovzxbw m2, [r0 + 2 * mmsize/2] - pmovzxbw m3, [r0 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 - - movu [r2 + 0 * mmsize], m0 - movu [r2 + 1 * mmsize], m1 - movu [r2 + 2 * mmsize], m2 - movu [r2 + 3 * mmsize], m3 - - pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] - pmovzxbw m3, [r0 + r1 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 - - movu [r2 + r3 + 0 * mmsize], m0 - movu [r2 + r3 + 1 * mmsize], m1 - movu [r2 + r3 + 2 * mmsize], m2 - movu [r2 + r3 + 3 * mmsize], m3 - - pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] - pmovzxbw m3, [r0 + r1 * 2 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, 
m4 - psubw m3, m4 - - movu [r2 + r3 * 2 + 0 * mmsize], m0 - movu [r2 + r3 * 2 + 1 * mmsize], m1 - movu [r2 + r3 * 2 + 2 * mmsize], m2 - movu [r2 + r3 * 2 + 3 * mmsize], m3 - - pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] - pmovzxbw m3, [r0 + r5 + 3 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psllw m3, 6 - psubw m0, m4 - psubw m1, m4 - psubw m2, m4 - psubw m3, m4 - - movu [r2 + r6 + 0 * mmsize], m0 - movu [r2 + r6 + 1 * mmsize], m1 - movu [r2 + r6 + 2 * mmsize], m2 - movu [r2 + r6 + 3 * mmsize], m3 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r4d - jnz .loop - RET -%endmacro - P2S_H_64xN_avx2 64 - P2S_H_64xN_avx2 16 - P2S_H_64xN_avx2 32 - P2S_H_64xN_avx2 48 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_12xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_12x%1, 3, 7, 6 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r6, [r3 * 3] - mov r5d, %1/4 - - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] - -.loop: - movu m0, [r0] - punpcklbw m1, m0, m4 - punpckhbw m0, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - - movu m2, [r0 + r1] - punpcklbw m3, m2, m4 - punpckhbw m2, m4 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 1], m3 - - movh [r2 + r3 * 0 + 16], m0 - movh [r2 + r3 * 1 + 16], m2 - - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m4 - punpckhbw m0, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - - movu m2, [r0 + r4] - punpcklbw m3, m2, m4 - punpckhbw m2, m4 - pmaddubsw m2, m5 - pmaddubsw m3, m5 - - movu [r2 + r3 * 2], m1 - movu [r2 + r6], m3 - - movh [r2 + r3 * 2 + 16], m0 - movh [r2 + r6 + 16], m2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r5d - jnz .loop - RET -%endmacro - P2S_H_12xN 16 - 
P2S_H_12xN 32 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_24xN 1 -INIT_XMM ssse3 -cglobal filterPixelToShort_24x%1, 3, 7, 5 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/4 - - ; load constant - mova m3, [pb_128] - mova m4, [tab_c_64_n64] - -.loop: - movu m0, [r0] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 0 + 16], m0 - movu [r2 + r3 * 0 + 32], m2 - - movu m0, [r0 + r1] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + r1 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 1 + 16], m0 - movu [r2 + r3 * 1 + 32], m2 - - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + r1 * 2 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - - movu [r2 + r3 * 2], m1 - movu [r2 + r3 * 2 + 16], m0 - movu [r2 + r3 * 2 + 32], m2 - - movu m0, [r0 + r4] - punpcklbw m1, m0, m3 - punpckhbw m0, m3 - pmaddubsw m0, m4 - pmaddubsw m1, m4 - - movu m2, [r0 + r4 + 16] - punpcklbw m2, m3 - pmaddubsw m2, m4 - movu [r2 + r5], m1 - movu [r2 + r5 + 16], m0 - movu [r2 + r5 + 32], m2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_24xN 32 - P2S_H_24xN 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -%macro P2S_H_24xN_avx2 1 -INIT_YMM avx2 -cglobal filterPixelToShort_24x%1, 3, 7, 4 - mov r3d, r3m - add 
r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, %1/4 - - ; load constant - vpbroadcastd m1, [pw_2000] - vpbroadcastd m2, [pb_128] - vpbroadcastd m3, [tab_c_64_n64] - -.loop: - pmovzxbw m0, [r0] - psllw m0, 6 - psubw m0, m1 - movu [r2], m0 - - movu m0, [r0 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 0 + mmsize], xm0 - - pmovzxbw m0, [r0 + r1] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3], m0 - - movu m0, [r0 + r1 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 1 + mmsize], xm0 - - pmovzxbw m0, [r0 + r1 * 2] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r3 * 2], m0 - - movu m0, [r0 + r1 * 2 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r3 * 2 + mmsize], xm0 - - pmovzxbw m0, [r0 + r4] - psllw m0, 6 - psubw m0, m1 - movu [r2 + r5], m0 - - movu m0, [r0 + r4 + mmsize/2] - punpcklbw m0, m2 - pmaddubsw m0, m3 - movu [r2 + r5 + mmsize], xm0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET -%endmacro - P2S_H_24xN_avx2 32 - P2S_H_24xN_avx2 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_48x64, 3, 7, 4 - mov r3d, r3m - add r3d, r3d - lea r4, [r1 * 3] - lea r5, [r3 * 3] - mov r6d, 16 - - ; load constant - mova m2, [pb_128] - mova m3, [tab_c_64_n64] - -.loop: - movu m0, [r0] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 0], m1 - movu [r2 + r3 * 0 + 16], m0 - - movu m0, [r0 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 0 + 32], m1 - movu [r2 + r3 * 0 + 48], m0 - - movu m0, [r0 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 0 + 64], m1 - movu [r2 + r3 * 0 + 80], m0 - 
- movu m0, [r0 + r1] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 1], m1 - movu [r2 + r3 * 1 + 16], m0 - - movu m0, [r0 + r1 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 1 + 48], m0 - - movu m0, [r0 + r1 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 1 + 64], m1 - movu [r2 + r3 * 1 + 80], m0 - - movu m0, [r0 + r1 * 2] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 2], m1 - movu [r2 + r3 * 2 + 16], m0 - - movu m0, [r0 + r1 * 2 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 2 + 32], m1 - movu [r2 + r3 * 2 + 48], m0 - - movu m0, [r0 + r1 * 2 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r3 * 2 + 64], m1 - movu [r2 + r3 * 2 + 80], m0 - - movu m0, [r0 + r4] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5], m1 - movu [r2 + r5 + 16], m0 - - movu m0, [r0 + r4 + 16] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5 + 32], m1 - movu [r2 + r5 + 48], m0 - - movu m0, [r0 + r4 + 32] - punpcklbw m1, m0, m2 - punpckhbw m0, m2 - pmaddubsw m0, m3 - pmaddubsw m1, m3 - - movu [r2 + r5 + 64], m1 - movu [r2 + r5 + 80], m0 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_48x64, 3,7,4 - mov r3d, r3m - add r3d, r3d - lea r5, [r1 * 3] - lea r6, [r3 * 3] - - ; load height - mov r4d, 64/4 - - ; load constant - 
vpbroadcastd m3, [pw_2000] - - ; just unroll(1) because it is best choice for 48x64 -.loop: - pmovzxbw m0, [r0 + 0 * mmsize/2] - pmovzxbw m1, [r0 + 1 * mmsize/2] - pmovzxbw m2, [r0 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + 0 * mmsize], m0 - movu [r2 + 1 * mmsize], m1 - movu [r2 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 + 0 * mmsize], m0 - movu [r2 + r3 + 1 * mmsize], m1 - movu [r2 + r3 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 * 2 + 0 * mmsize], m0 - movu [r2 + r3 * 2 + 1 * mmsize], m1 - movu [r2 + r3 * 2 + 2 * mmsize], m2 - - pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] - pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] - pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] - psllw m0, 6 - psllw m1, 6 - psllw m2, 6 - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r6 + 0 * mmsize], m0 - movu [r2 + r6 + 1 * mmsize], m1 - movu [r2 + r6 + 2 * mmsize], m2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r4d - jnz .loop - RET - - -%macro PROCESS_LUMA_W4_4R 0 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[0 1] - - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[1 2] - punpcklqdq m2, m1 ; m2=[0 1 1 2] - pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] - - movd m1, [r0 + r1] - punpcklbw m5, m0, m1 ; m2=[2 3] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[3 4] - punpcklqdq m5, m1 ; m5=[2 3 3 4] - pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] - paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 - pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 - - movd m1, [r0 + r1] 
- punpcklbw m2, m0, m1 ; m2=[4 5] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[5 6] - punpcklqdq m2, m1 ; m2=[4 5 5 6] - pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] - paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 - pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] - paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 - - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[6 7] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[7 8] - punpcklqdq m2, m1 ; m2=[6 7 7 8] - pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] - paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end - pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] - paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 - - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[8 9] - movd m0, [r0 + 2 * r1] - punpcklbw m1, m0 ; m1=[9 10] - punpcklqdq m2, m1 ; m2=[8 9 9 10] - pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] - paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end -%endmacro - -%macro PROCESS_LUMA_W8_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 - pmaddubsw m0, [r6 + 1 * 16] - paddw m7, m0 ;m7=[0+1+2+3] Row1 - - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 - pmaddubsw m1, [r6 + 1 * 16] - paddw m6, m1 ;m6 = [1+2+3+4] Row2 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 1 * 16] - pmaddubsw m0, [r6 + 2 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 - paddw m5, m2 ;m5=[2+3+4+5] Row3 - - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 1 * 16] - pmaddubsw m1, [r6 + 2 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 - paddw m4, m2 ;m4=[3+4+5+6] Row4 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 2 * 
16] - pmaddubsw m0, [r6 + 3 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end - paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 - - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 2 * 16] - pmaddubsw m1, [r6 + 3 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end - paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 - - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m0, [r6 + 3 * 16] - paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end - - movq m0, [r0 + 2 * r1] - punpcklbw m1, m0 - pmaddubsw m1, [r6 + 3 * 16] - paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_4xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - -%ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - - mov r4d, %2/4 - lea r5, [4 * r1] - -.loopH: - PROCESS_LUMA_W4_4R - -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 - - packuswb m4, m5 - - movd [r2], m4 - pextrd [r2 + r3], m4, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m4, 2 - pextrd [r2 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 - - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 -%endif - - sub r0, r5 - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - -INIT_YMM avx2 -cglobal interp_8tap_vert_pp_4x4, 4,6,8 - mov r4d, r4m - lea r5, [r1 * 3] - sub r0, r5 - - ; TODO: VPGATHERDD - movd xm1, [r0] ; m1 = row0 - movd xm2, [r0 + r1] ; m2 = row1 - punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 
01 10 00] - - movd xm3, [r0 + r1 * 2] ; m3 = row2 - punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] - movd xm4, [r0 + r5] - punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] - punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - - lea r0, [r0 + r1 * 4] - movd xm5, [r0] ; m5 = row4 - punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] - punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - movd xm2, [r0 + r1] ; m2 = row5 - punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] - punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm6, [r0 + r1 * 2] ; m6 = row6 - punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] - punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm4, [r0 + r5] ; m4 = row7 - punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] - punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - - lea r0, [r0 + r1 * 4] - movd xm7, [r0] ; m7 = row8 - punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] - punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - movd xm2, [r0 + r1] ; m2 = row9 - punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] - punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] - movd xm7, [r0 + r1 * 2] ; m7 = rowA - punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] - punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 
81 71 61 90 80 70 60] - - ; load filter coeff -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8 + 0] - vpbroadcastd m2, [r5 + r4 * 8 + 4] -%else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] - vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] -%endif - - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddubsw m5, m2 - pmaddubsw m6, m2 - vbroadcasti128 m0, [pw_1] - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m5, m0 - pmaddwd m6, m0 - paddd m1, m5 ; m1 = DQWORD ROW[1 0] - paddd m3, m6 ; m3 = DQWORD ROW[3 2] - packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] - - ; TODO: does it overflow? - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 2 - pextrd [r2 + r3 * 2], xm1, 1 - lea r4, [r3 * 3] - pextrd [r2 + r4], xm1, 3 - RET - -INIT_YMM avx2 -cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - - add r3d, r3d - - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] - mova m3, [interp4_vpp_shuf1] - vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] - mova m3, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] - - mova m3, [interp4_vpp_shuf] - pshufb m0, m0, m3 - pshufb m1, m1, m3 - pshufb m4, m4, m3 - pshufb m2, m2, m3 - pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + mmsize] - 
pmaddubsw m4, [r5 + 2 * mmsize] - pmaddubsw m2, [r5 + 3 * mmsize] - paddw m0, m1 - paddw m0, m4 - paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] - - psubw m0, [pw_2000] - vextracti128 xm2, m0, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 - RET - -%macro FILTER_VER_LUMA_AVX2_4xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r1 * 4] -%ifidn %3,pp - mova m6, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m6, [pw_2000] -%endif - lea r8, [r3 * 3] - mova m5, [interp4_vpp_shuf] - mova m0, [interp4_vpp_shuf1] - mova m7, [interp4_vpp_shuf1 + mmsize] - mov r7d, %2 / 8 -.loop: - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 - pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] - lea r0, [r0 + r1 * 4] - movd xm4, [r0] - pinsrd xm4, [r0 + r1], 1 - pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] - vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] - vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] - vpermd m4, m0, m2 ; m4 = row[8 7 7 6 6 5 5 4] - vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] - vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] - vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] - - pshufb m8, m8, m5 - pshufb m1, m1, m5 - pshufb m4, m4, m5 - pshufb m9, m9, m5 - pshufb m2, m2, m5 - pshufb m3, m3, m5 - 
pmaddubsw m8, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m9, [r5 + 2 * mmsize] - pmaddubsw m3, [r5 + 3 * mmsize] - paddw m8, m1 - paddw m9, m3 - pmaddubsw m1, m4, [r5 + 2 * mmsize] - pmaddubsw m3, m2, [r5 + 3 * mmsize] - pmaddubsw m4, [r5] - pmaddubsw m2, [r5 + mmsize] - paddw m3, m1 - paddw m2, m4 - paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] - paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] - -%ifidn %3,pp - pmulhrsw m8, m6 - pmulhrsw m9, m6 - packuswb m8, m9 - vextracti128 xm1, m8, 1 - movd [r2], xm8 - pextrd [r2 + r3], xm8, 1 - movd [r2 + r3 * 2], xm1 - pextrd [r2 + r8], xm1, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm8, 2 - pextrd [r2 + r3], xm8, 3 - pextrd [r2 + r3 * 2], xm1, 2 - pextrd [r2 + r8], xm1, 3 -%else - psubw m8, m6 - psubw m9, m6 - vextracti128 xm1, m8, 1 - vextracti128 xm2, m9, 1 - movq [r2], xm8 - movhps [r2 + r3], xm8 - movq [r2 + r3 * 2], xm1 - movhps [r2 + r8], xm1 - lea r2, [r2 + r3 * 4] - movq [r2], xm9 - movhps [r2 + r3], xm9 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r8], xm2 -%endif - lea r2, [r2 + r3 * 4] - sub r0, r6 - dec r7d - jnz .loop - RET -%endif -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 4, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 8, pp - FILTER_VER_LUMA_AVX2_4xN 4, 8, pp - -;------------------------------------------------------------------------------------------------------------- -; void 
interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 16, pp - FILTER_VER_LUMA_AVX2_4xN 4, 16, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 4, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 8, ps - FILTER_VER_LUMA_AVX2_4xN 4, 8, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_4xN 4, 16, ps - FILTER_VER_LUMA_AVX2_4xN 4, 16, ps - -%macro PROCESS_LUMA_AVX2_W8_8R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 
24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m0, [r5 + 1 * mmsize] - paddw m4, m0 - - movq xm3, [r0 + r4] ; m3 = row 11 - 
punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 12 - punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m6, [r5 + 2 * mmsize] - paddw m4, m6 - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - movq xm6, [r0 + r1 * 2] ; m6 = row 14 - punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - pmaddubsw m0, [r5 + 3 * mmsize] - paddw m4, m0 -%endmacro - -%macro PROCESS_LUMA_AVX2_W8_4R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 
53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_8xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 - -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - - %ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - - mov r4d, %2/4 - lea r5, [4 * r1] - -.loopH: - PROCESS_LUMA_W8_4R - -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw 
m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 - - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r2, [r2 + 2 * r3] - movu [r2], m5 - movu [r2 + r3], m4 -%endif - - sub r0, r5 - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - -%macro FILTER_VER_LUMA_AVX2_8xN 3 -INIT_YMM avx2 -cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r1 * 4] -%ifidn %3,pp - mova m7, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m7, [pw_2000] -%endif - mov word [rsp], %2 / 8 - -.loop: - PROCESS_LUMA_AVX2_W8_8R -%ifidn %3,pp - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movhps [r2], xm5 - movhps [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movq [r2], xm1 - movq [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm1 - movhps [r2 + r3], xm4 -%else - psubw m5, m7 ; m5 = word: row 0, row 1 - psubw m2, m7 ; m2 = word: row 2, row 3 - psubw m1, m7 ; m1 = word: row 4, row 5 - psubw m4, m7 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 - lea r2, [r2 + r3 * 2] - movu [r2], xm2 - movu [r2 + r3], xm3 - lea r2, [r2 + r3 * 2] - movu [r2], xm1 - movu [r2 + r3], xm0 - lea r2, [r2 + r3 * 2] - movu [r2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r3], xm4 -%endif - lea r2, [r2 + r3 * 2] - sub r0, r6 - dec word [rsp] - jnz .loop - RET 
-%endmacro - -%macro FILTER_VER_LUMA_AVX2_8x8 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x8, 4, 6, 7 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - PROCESS_LUMA_AVX2_W8_8R -%ifidn %1,pp - mova m3, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] -%endif - lea r4, [r3 * 3] -%ifidn %1,pp - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r4], xm4 -%else - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - psubw m1, m3 ; m1 = word: row 4, row 5 - psubw m4, m3 ; m4 = word: row 6, row 7 - vextracti128 xm6, m5, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm0, m1, 1 - movu [r2], xm5 - movu [r2 + r3], xm6 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm4 - vextracti128 xm4, m4, 1 - movu [r2 + r4], xm4 -%endif - RET -%endmacro - -%macro FILTER_VER_LUMA_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x4, 4, 6, 7 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - PROCESS_LUMA_AVX2_W8_4R -%ifidn %1,pp - mova m3, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m3, [pw_2000] -%endif - lea r4, [r3 * 3] -%ifidn %1,pp - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - packuswb m5, m2 - vextracti128 xm2, m5, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - 
movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 -%else - psubw m5, m3 ; m5 = word: row 0, row 1 - psubw m2, m3 ; m2 = word: row 2, row 3 - movu [r2], xm5 - vextracti128 xm5, m5, 1 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movu [r2 + r4], xm2 -%endif - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 4, pp - FILTER_VER_LUMA_AVX2_8x4 pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 8, pp - FILTER_VER_LUMA_AVX2_8x8 pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 16, pp - FILTER_VER_LUMA_AVX2_8xN 8, 16, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 32, pp - FILTER_VER_LUMA_AVX2_8xN 8, 32, pp - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 4, ps - FILTER_VER_LUMA_AVX2_8x4 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 8, ps - FILTER_VER_LUMA_AVX2_8x8 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 16, ps - FILTER_VER_LUMA_AVX2_8xN 8, 16, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_8xN 8, 32, ps - FILTER_VER_LUMA_AVX2_8xN 8, 32, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_12xN 3 -INIT_XMM sse4 -cglobal 
interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - - %ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - - mov r4d, %2/4 - -.loopH: - PROCESS_LUMA_W8_4R - -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 - - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 -%endif - - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 -%else - add r2, 16 -%endif - - PROCESS_LUMA_W4_4R - -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 - - packuswb m4, m5 - - movd [r2], m4 - pextrd [r2 + r3], m4, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m4, 2 - pextrd [r5 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 - - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%endif - - lea r5, [4 * r1 + 8] - sub r0, r5 -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - 8] -%else - lea r2, [r2 + 4 * r3 - 16] -%endif - - dec r4d - jnz .loopH - - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_12xN 12, 16, pp - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_12xN 12, 16, ps - -%macro FILTER_VER_LUMA_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r6, [r3 * 3] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - 
pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 
xm3, m2, 1 - vextracti128 xm5, m4, 1 - movq [r2], xm0 - pextrd [r2 + 8], xm0, 2 - movq [r2 + r3], xm1 - pextrd [r2 + r3 + 8], xm1, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 - movq [r2 + r6], xm3 - pextrd [r2 + r6 + 8], xm3, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - pextrd [r2 + 8], xm4, 2 - movq [r2 + r3], xm5 - pextrd [r2 + r3 + 8], xm5, 2 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + 16], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r3 + 16], xm1 - movu [r2 + r3 * 2], xm2 - vextracti128 xm2, m2, 1 - movq [r2 + r3 * 2 + 16], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m3, 1 - movq [r2 + r6 + 16], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - vextracti128 xm4, m4, 1 - movq [r2 + 16], xm4 - movu [r2 + r3], xm5 - vextracti128 xm5, m5, 1 - movq [r2 + r3 + 16], xm5 -%endif - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movq [r2 + r3 * 2], xm6 - pextrd [r2 + r3 * 2 + 8], xm6, 2 - movq [r2 + r6], xm7 - pextrd [r2 + r6 + 8], xm7, 2 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, 
m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], xm6 - vextracti128 xm6, m6, 1 - movq [r2 + r3 * 2 + 16], xm6 - movu [r2 + r6], xm7 - vextracti128 xm7, m7, 1 - movq [r2 + r6 + 16], xm7 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] 
; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movq [r2], xm8 - pextrd [r2 + 8], xm8, 2 - movq [r2 + r3], xm9 - pextrd [r2 + r3 + 8], xm9, 2 - movq [r2 + r3 * 2], xm10 - pextrd [r2 + r3 * 2 + 8], xm10, 2 - movq [r2 + r6], xm11 - pextrd [r2 + r6 + 8], xm11, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm12 - pextrd [r2 + 8], xm12, 2 - movq [r2 + r3], xm13 - pextrd [r2 + r3 + 8], xm13, 2 - movq [r2 + r3 * 2], xm0 - pextrd [r2 + r3 * 2 + 8], xm0, 2 - movq [r2 + r6], xm1 - pextrd [r2 + r6 + 8], xm1, 2 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r2], xm8 - vextracti128 xm8, m8, 1 - movq [r2 + 16], xm8 - movu [r2 + r3], xm9 - vextracti128 xm9, m9, 1 - movq [r2 + r3 + 16], xm9 - movu [r2 + r3 * 2], xm10 - vextracti128 xm10, m10, 1 - movq [r2 + r3 * 2 + 16], xm10 - movu [r2 + r6], xm11 - vextracti128 xm11, m11, 1 - movq [r2 + r6 + 16], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - vextracti128 xm12, m12, 1 - movq [r2 + 16], xm12 - movu [r2 + r3], xm13 - vextracti128 xm13, m13, 1 - movq [r2 + r3 + 16], xm13 
- movu [r2 + r3 * 2], xm0 - vextracti128 xm0, m0, 1 - movq [r2 + r3 * 2 + 16], xm0 - movu [r2 + r6], xm1 - vextracti128 xm1, m1, 1 - movq [r2 + r6 + 16], xm1 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_12x16 pp - FILTER_VER_LUMA_AVX2_12x16 ps - -%macro FILTER_VER_LUMA_AVX2_16x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r6, [r3 * 3] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 
- pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, 
m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 -%endif - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r6], m7 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - 
punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 
11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r6], m11 - lea r2, [r2 + r3 * 4] - movu [r2], m12 - movu [r2 + r3], m13 - movu [r2 + r3 * 2], m0 - movu [r2 + r6], m1 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16x16 pp - FILTER_VER_LUMA_AVX2_16x16 ps - -%macro FILTER_VER_LUMA_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - lea r6, [r3 * 3] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - 
vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * 
mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 -%endif - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - -%ifidn %1,pp - 
pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r6], m7 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - packuswb m8, m9 - packuswb m10, m11 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r6], m11 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16x12 pp - FILTER_VER_LUMA_AVX2_16x12 ps - -%macro FILTER_VER_LUMA_AVX2_16x8 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal 
interp_8tap_vert_%1_16x8, 4, 6, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m14, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%endif - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - 
pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - lea r4, [r3 * 3] -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], 
m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r4], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 -%endif - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r4], m7 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16x8 pp - FILTER_VER_LUMA_AVX2_16x8 ps - -%macro FILTER_VER_LUMA_AVX2_16x4 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - mova m12, [pw_512] -%else - add r3d, r3d - vbroadcasti128 m12, [pw_2000] -%endif - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 
5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 -%ifidn %1,pp - pmulhrsw m0, m12 ; m0 = word: row 0 - pmulhrsw m1, m12 ; m1 = word: row 1 - pmulhrsw m2, m12 ; m2 = word: row 2 - pmulhrsw m3, m12 ; m3 = word: row 3 - packuswb m0, m1 - packuswb m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - lea r4, [r3 * 3] - movu [r2 + r4], xm3 -%else - psubw m0, m12 ; m0 = word: row 0 - psubw m1, m12 ; m1 = word: row 1 - psubw m2, m12 ; m2 = word: row 2 - psubw m3, m12 ; m3 = word: row 3 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - lea r4, [r3 * 3] - movu [r2 + r4], m3 -%endif - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16x4 pp - FILTER_VER_LUMA_AVX2_16x4 ps -%macro FILTER_VER_LUMA_AVX2_16xN 3 
-INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %3,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r7, [r1 * 4] - mov r8d, %2 / 16 - -.loop: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, 
xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %3,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 
* 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r2, [r2 + r3 * 4] - movu [r2], m4 - movu [r2 + r3], m5 -%endif - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %3,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r2 + r3 * 2], m6 - movu [r2 + r6], m7 -%endif - - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, 
[r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %3,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu 
[r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r2], m8 - movu [r2 + r3], m9 - movu [r2 + r3 * 2], m10 - movu [r2 + r6], m11 - lea r2, [r2 + r3 * 4] - movu [r2], m12 - movu [r2 + r3], m13 - movu [r2 + r3 * 2], m0 - movu [r2 + r6], m1 -%endif - - lea r2, [r2 + r3 * 4] - sub r0, r7 - dec r8d - jnz .loop - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_16xN 16, 32, pp - FILTER_VER_LUMA_AVX2_16xN 16, 64, pp - FILTER_VER_LUMA_AVX2_16xN 16, 32, ps - FILTER_VER_LUMA_AVX2_16xN 16, 64, ps - -%macro PROCESS_LUMA_AVX2_W16_16R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, 
m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - -%ifidn %1,pp - pmulhrsw m0, m14 ; 
m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r8, [r2 + r3 * 4] - movu [r8], m4 - movu [r8 + r3], m5 -%endif - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r8 + r3 * 2], m6 - movu [r8 + r6], m7 -%endif - - lea r8, [r8 + r3 * 4] - - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, 
xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r7 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r7 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r7, [r7 + r1 * 4] - movu xm6, [r7] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r7 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r7 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 - -%ifidn %1,pp - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; 
m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - lea r8, [r8 + r3 * 4] - movu [r8], xm12 - movu [r8 + r3], xm13 - movu [r8 + r3 * 2], xm0 - movu [r8 + r6], xm1 -%else - psubw m8, m14 ; m8 = word: row 8 - psubw m9, m14 ; m9 = word: row 9 - psubw m10, m14 ; m10 = word: row 10 - psubw m11, m14 ; m11 = word: row 11 - psubw m12, m14 ; m12 = word: row 12 - psubw m13, m14 ; m13 = word: row 13 - psubw m0, m14 ; m0 = word: row 14 - psubw m1, m14 ; m1 = word: row 15 - movu [r8], m8 - movu [r8 + r3], m9 - movu [r8 + r3 * 2], m10 - movu [r8 + r6], m11 - lea r8, [r8 + r3 * 4] - movu [r8], m12 - movu [r8 + r3], m13 - movu [r8 + r3 * 2], m0 - movu [r8 + r6], m1 -%endif -%endmacro - -%macro PROCESS_LUMA_AVX2_W16_8R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - 
vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - 
pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - -%ifidn %1,pp - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 -%else - psubw m0, m14 ; m0 = word: row 0 - psubw m1, m14 ; m1 = word: row 1 - psubw m2, m14 ; m2 = word: row 2 - psubw m3, m14 ; m3 = word: row 3 - psubw m4, m14 ; m4 = word: row 4 - psubw m5, m14 ; m5 = word: row 5 - movu [r2], m0 - movu [r2 + r3], m1 - movu [r2 + r3 * 2], m2 - movu [r2 + r6], m3 - lea r8, [r2 + r3 * 4] - movu [r8], m4 - movu [r8 + r3], m5 -%endif - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - -%ifidn %1,pp - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%else - psubw m6, m14 ; m6 = word: row 6 - psubw m7, m14 ; m7 = word: row 7 - movu [r8 + r3 * 2], m6 - movu [r8 + r6], m7 -%endif -%endmacro - -%macro FILTER_VER_LUMA_AVX2_24x32 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea 
r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r10, [r1 * 4] - mov r9d, 2 -.loopH: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r7, [r0 + r1 * 4] - movq xm1, [r7] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r7 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r7 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r7, [r7 + r1 * 4] - movq xm0, [r7] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r7 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r7 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - paddw m4, m3 - pmaddubsw m0, [r5] - - movq xm3, [r7 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r7, [r7 + r1 * 4] - movq xm7, [r7] ; m7 = row 12 - punpcklbw xm3, xm7 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m3, m6, [r5 + 2 * mmsize] - paddw m4, m3 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - 
movq xm3, [r7 + r1] ; m3 = row 13 - punpcklbw xm7, xm3 - movq xm8, [r7 + r1 * 2] ; m8 = row 14 - punpcklbw xm3, xm8 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m3, m7, [r5 + 3 * mmsize] - paddw m4, m3 - pmaddubsw m3, m7, [r5 + 2 * mmsize] - paddw m0, m3 - pmaddubsw m3, m7, [r5 + 1 * mmsize] - paddw m6, m3 - pmaddubsw m7, [r5] - movq xm3, [r7 + r4] ; m3 = row 15 - punpcklbw xm8, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 16 - punpcklbw xm3, xm9 - vinserti128 m8, m8, xm3, 1 - pmaddubsw m3, m8, [r5 + 3 * mmsize] - paddw m0, m3 - pmaddubsw m3, m8, [r5 + 2 * mmsize] - paddw m6, m3 - pmaddubsw m3, m8, [r5 + 1 * mmsize] - paddw m7, m3 - pmaddubsw m8, [r5] - movq xm3, [r7 + r1] ; m3 = row 17 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 18 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m6, m3 - pmaddubsw m3, m9, [r5 + 2 * mmsize] - paddw m7, m3 - pmaddubsw m3, m9, [r5 + 1 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r4] ; m3 = row 19 - punpcklbw xm10, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 20 - punpcklbw xm3, xm9 - vinserti128 m10, m10, xm3, 1 - pmaddubsw m3, m10, [r5 + 3 * mmsize] - paddw m7, m3 - pmaddubsw m3, m10, [r5 + 2 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r1] ; m3 = row 21 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 22 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m8, m3 -%ifidn %1,pp - pmulhrsw m5, m14 ; m5 = word: row 0, row 1 - pmulhrsw m2, m14 ; m2 = word: row 2, row 3 - pmulhrsw m1, m14 ; m1 = word: row 4, row 5 - pmulhrsw m4, m14 ; m4 = word: row 6, row 7 - pmulhrsw m0, m14 ; m0 = word: row 8, row 9 - pmulhrsw m6, m14 ; m6 = word: row 10, row 11 - pmulhrsw m7, m14 ; m7 = word: row 12, row 13 - pmulhrsw m8, m14 ; m8 = word: row 14, row 15 - packuswb m5, m2 - packuswb m1, m4 - packuswb m0, m6 - packuswb m7, m8 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - vextracti128 xm6, m0, 1 - vextracti128 
xm8, m7, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm2 - lea r8, [r2 + r3 * 4] - movq [r8], xm1 - movq [r8 + r3], xm4 - movhps [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm4 - lea r8, [r8 + r3 * 4] - movq [r8], xm0 - movq [r8 + r3], xm6 - movhps [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm6 - lea r8, [r8 + r3 * 4] - movq [r8], xm7 - movq [r8 + r3], xm8 - movhps [r8 + r3 * 2], xm7 - movhps [r8 + r6], xm8 -%else - psubw m5, m14 ; m5 = word: row 0, row 1 - psubw m2, m14 ; m2 = word: row 2, row 3 - psubw m1, m14 ; m1 = word: row 4, row 5 - psubw m4, m14 ; m4 = word: row 6, row 7 - psubw m0, m14 ; m0 = word: row 8, row 9 - psubw m6, m14 ; m6 = word: row 10, row 11 - psubw m7, m14 ; m7 = word: row 12, row 13 - psubw m8, m14 ; m8 = word: row 14, row 15 - vextracti128 xm3, m5, 1 - movu [r2], xm5 - movu [r2 + r3], xm3 - vextracti128 xm3, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - vextracti128 xm3, m1, 1 - lea r8, [r2 + r3 * 4] - movu [r8], xm1 - movu [r8 + r3], xm3 - vextracti128 xm3, m4, 1 - movu [r8 + r3 * 2], xm4 - movu [r8 + r6], xm3 - vextracti128 xm3, m0, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm0 - movu [r8 + r3], xm3 - vextracti128 xm3, m6, 1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm3 - vextracti128 xm3, m7, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm7 - movu [r8 + r3], xm3 - vextracti128 xm3, m8, 1 - movu [r8 + r3 * 2], xm8 - movu [r8 + r6], xm3 -%endif - sub r7, r10 - lea r0, [r7 - 16] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_24x32 pp - FILTER_VER_LUMA_AVX2_24x32 ps - -%macro FILTER_VER_LUMA_AVX2_32xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %3,ps - add r3d, r3d - 
vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %3 -%ifidn %3,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 16] -%ifidn %3,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32xN 32, 32, pp - FILTER_VER_LUMA_AVX2_32xN 32, 64, pp - FILTER_VER_LUMA_AVX2_32xN 32, 32, ps - FILTER_VER_LUMA_AVX2_32xN 32, 64, ps - -%macro FILTER_VER_LUMA_AVX2_32x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d - vbroadcasti128 m14, [pw_2000] -%else - mova m14, [pw_512] -%endif - lea r6, [r3 * 3] - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x16 pp - FILTER_VER_LUMA_AVX2_32x16 ps - -%macro FILTER_VER_LUMA_AVX2_32x24 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d -%endif - lea r6, [r3 * 3] -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - lea r9, [r1 * 4] - sub r7, r9 - lea r0, [r7 - 16] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 16] -%else - lea r2, [r8 + r3 * 4 - 32] -%endif - mov r9d, 2 -.loop: - 
PROCESS_LUMA_AVX2_W16_8R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loop - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x24 pp - FILTER_VER_LUMA_AVX2_32x24 ps - -%macro FILTER_VER_LUMA_AVX2_32x8 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x8, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,ps - add r3d, r3d -%endif - lea r6, [r3 * 3] -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_8R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_32x8 pp - FILTER_VER_LUMA_AVX2_32x8 ps - -%macro FILTER_VER_LUMA_AVX2_48x64 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_48x64, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %1,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - lea r11, [r1 * 4] - -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, 4 -.loopH: - mov r10d, 3 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 32] -%ifidn %1,pp - lea r2, [r8 + r3 * 4 - 32] -%else - lea r2, [r8 + r3 * 4 - 64] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_48x64 pp - FILTER_VER_LUMA_AVX2_48x64 ps - -%macro FILTER_VER_LUMA_AVX2_64xN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - 
lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %3,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - lea r11, [r1 * 4] - -%ifidn %3,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %3 -%ifidn %3,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 48] -%ifidn %3,pp - lea r2, [r8 + r3 * 4 - 48] -%else - lea r2, [r8 + r3 * 4 - 96] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_64xN 64, 32, pp - FILTER_VER_LUMA_AVX2_64xN 64, 48, pp - FILTER_VER_LUMA_AVX2_64xN 64, 64, pp - FILTER_VER_LUMA_AVX2_64xN 64, 32, ps - FILTER_VER_LUMA_AVX2_64xN 64, 48, ps - FILTER_VER_LUMA_AVX2_64xN 64, 64, ps - -%macro FILTER_VER_LUMA_AVX2_64x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_64x16, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %1,ps - add r3d, r3d -%endif - - lea r6, [r3 * 3] - -%ifidn %1,pp - mova m14, [pw_512] -%else - vbroadcasti128 m14, [pw_2000] -%endif - - mov r9d, 4 -.loopW: - PROCESS_LUMA_AVX2_W16_16R %1 -%ifidn %1,pp - add r2, 16 -%else - add r2, 32 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_LUMA_AVX2_64x16 pp - FILTER_VER_LUMA_AVX2_64x16 ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, 
[tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif - -%ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/8) -.loopW: - PROCESS_LUMA_W8_4R -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 - - packuswb m7, m6 - packuswb m5, m4 - - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 -%endif - - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 -%else - add r2, 16 -%endif - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - %1] -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - %1] -%else - lea r2, [r2 + 4 * r3 - 2 * %1] -%endif - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - - FILTER_VER_LUMA 16, 4, pp - FILTER_VER_LUMA 16, 8, pp - FILTER_VER_LUMA 16, 12, pp - FILTER_VER_LUMA 16, 16, pp - FILTER_VER_LUMA 16, 32, pp - FILTER_VER_LUMA 16, 64, pp - FILTER_VER_LUMA 24, 32, pp - FILTER_VER_LUMA 32, 8, pp - FILTER_VER_LUMA 32, 16, pp - FILTER_VER_LUMA 32, 24, pp - FILTER_VER_LUMA 32, 32, pp - FILTER_VER_LUMA 32, 64, pp - FILTER_VER_LUMA 48, 64, pp - FILTER_VER_LUMA 64, 16, pp - FILTER_VER_LUMA 64, 32, pp - FILTER_VER_LUMA 64, 48, pp - FILTER_VER_LUMA 64, 64, pp - - FILTER_VER_LUMA 16, 4, ps - FILTER_VER_LUMA 16, 8, ps - FILTER_VER_LUMA 16, 12, ps - FILTER_VER_LUMA 16, 16, ps - FILTER_VER_LUMA 16, 32, ps - FILTER_VER_LUMA 16, 64, ps - FILTER_VER_LUMA 24, 32, ps - FILTER_VER_LUMA 32, 8, ps - FILTER_VER_LUMA 32, 16, ps - FILTER_VER_LUMA 32, 24, ps - FILTER_VER_LUMA 32, 32, ps - FILTER_VER_LUMA 32, 64, ps - FILTER_VER_LUMA 48, 64, ps - FILTER_VER_LUMA 64, 16, ps - FILTER_VER_LUMA 64, 32, ps - FILTER_VER_LUMA 64, 48, ps - FILTER_VER_LUMA 64, 64, ps - -%macro PROCESS_LUMA_SP_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - 
punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SP 2 -INIT_XMM sse4 -cglobal 
interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize - - add r1d, r1d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mova m7, [pd_526336] - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_SP_W4_4R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_SP 4, 4 - FILTER_VER_LUMA_SP 8, 8 - FILTER_VER_LUMA_SP 8, 4 - FILTER_VER_LUMA_SP 4, 8 - FILTER_VER_LUMA_SP 16, 16 - FILTER_VER_LUMA_SP 16, 8 - FILTER_VER_LUMA_SP 8, 16 - FILTER_VER_LUMA_SP 16, 12 - FILTER_VER_LUMA_SP 12, 16 - FILTER_VER_LUMA_SP 16, 4 - FILTER_VER_LUMA_SP 4, 16 - FILTER_VER_LUMA_SP 32, 32 - FILTER_VER_LUMA_SP 32, 16 - FILTER_VER_LUMA_SP 16, 32 - FILTER_VER_LUMA_SP 32, 24 - FILTER_VER_LUMA_SP 24, 32 - FILTER_VER_LUMA_SP 32, 8 - FILTER_VER_LUMA_SP 8, 32 - FILTER_VER_LUMA_SP 64, 64 - FILTER_VER_LUMA_SP 64, 32 - FILTER_VER_LUMA_SP 32, 64 - FILTER_VER_LUMA_SP 64, 48 - FILTER_VER_LUMA_SP 48, 64 - FILTER_VER_LUMA_SP 64, 16 - FILTER_VER_LUMA_SP 16, 64 - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, 
int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal filterPixelToShort_4x2, 3, 4, 3 - mov r3d, r3m - add r3d, r3d - - ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] - - movd m0, [r0] - pinsrd m0, [r0 + r1], 1 - punpcklbw m0, m1 - pmaddubsw m0, m2 - - movq [r2 + r3 * 0], m0 - movhps [r2 + r3 * 1], m0 - - RET - -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal filterPixelToShort_8x2, 3, 4, 3 - mov r3d, r3m - add r3d, r3d - - ; load constant - mova m1, [pb_128] - mova m2, [tab_c_64_n64] - - movh m0, [r0] - punpcklbw m0, m1 - pmaddubsw m0, m2 - movu [r2 + r3 * 0], m0 - - movh m0, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m0, m2 - movu [r2 + r3 * 1], m0 - - RET - -%macro PROCESS_CHROMA_SP_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 done - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m4, [r6 + 1 * 16] - paddd m2, m4 ;m2=[2+3+4+5] Row3 - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m5, [r6 + 1 * 16] - paddd m3, m5 ;m3=[3+4+5+6] Row4 -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_%1x%2(int16_t *src, 
intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mova m6, [pd_526336] - - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP 4, 4 - FILTER_VER_CHROMA_SP 4, 8 - FILTER_VER_CHROMA_SP 16, 16 - FILTER_VER_CHROMA_SP 16, 8 - FILTER_VER_CHROMA_SP 16, 12 - FILTER_VER_CHROMA_SP 12, 16 - FILTER_VER_CHROMA_SP 16, 4 - FILTER_VER_CHROMA_SP 4, 16 - FILTER_VER_CHROMA_SP 32, 32 - FILTER_VER_CHROMA_SP 32, 16 - FILTER_VER_CHROMA_SP 16, 32 - FILTER_VER_CHROMA_SP 32, 24 - FILTER_VER_CHROMA_SP 24, 32 - FILTER_VER_CHROMA_SP 32, 8 - - FILTER_VER_CHROMA_SP 16, 24 - FILTER_VER_CHROMA_SP 16, 64 - FILTER_VER_CHROMA_SP 12, 32 - FILTER_VER_CHROMA_SP 4, 32 - FILTER_VER_CHROMA_SP 32, 64 - FILTER_VER_CHROMA_SP 32, 48 - FILTER_VER_CHROMA_SP 24, 64 - - FILTER_VER_CHROMA_SP 64, 64 - FILTER_VER_CHROMA_SP 64, 32 - FILTER_VER_CHROMA_SP 64, 48 - FILTER_VER_CHROMA_SP 48, 64 - FILTER_VER_CHROMA_SP 64, 16 - - -%macro PROCESS_CHROMA_SP_W2_4R 1 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - - lea r0, [r0 + 2 * r1] - movd m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - 
punpcklqdq m0, m1 ;m0=[0 1 1 2] - pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m2, m1 ;m2=[2 3] - - lea r0, [r0 + 2 * r1] - movd m3, [r0] - punpcklwd m1, m3 ;m2=[3 4] - punpcklqdq m2, m1 ;m2=[2 3 3 4] - - pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 - pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 - paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m3, m1 ;m3=[4 5] - - movd m4, [r0 + 2 * r1] - punpcklwd m1, m4 ;m1=[5 6] - punpcklqdq m3, m1 ;m2=[4 5 5 6] - pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 - paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 -%endmacro - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W2_4R 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m5, [pd_526336] - - mov r4d, (%2/4) - -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 - - paddd m0, m5 - paddd m2, m5 - - psrad m0, 12 - psrad m2, 12 - - packssdw m0, m2 - packuswb m0, m0 - - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP_W2_4R 2, 4 - FILTER_VER_CHROMA_SP_W2_4R 2, 8 - - FILTER_VER_CHROMA_SP_W2_4R 2, 16 - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;-------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m4, [pd_526336] - - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 - - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - paddd m0, m4 - psrad m0, 12 - - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1 = [1+2+3+4] Row2 done - paddd m1, m4 - psrad m1, 12 - - packssdw m0, m1 - packuswb m0, m0 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - - RET - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mova m6, [pd_526336] - - mov r4d, %2/4 - -.loopH: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add 
r2, 4 - - PROCESS_CHROMA_SP_W2_4R r6 - - paddd m0, m6 - paddd m2, m6 - - psrad m0, 12 - psrad m2, 12 - - packssdw m0, m2 - packuswb m0, m0 - - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 - - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 4] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP_W6_H4 6, 8 - - FILTER_VER_CHROMA_SP_W6_H4 6, 16 - -%macro PROCESS_CHROMA_SP_W8_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h - - lea r0, [r0 + 2 * r1] - movu m5, [r0 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l - paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h - paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum - - movu m4, [r0 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l - paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h - paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mova m7, [pd_526336] - - mov r4d, %2/2 -.loopH: - 
PROCESS_CHROMA_SP_W8_2R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movlps [r2], m0 - movhps [r2 + r3], m0 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SP_W8_H2 8, 2 - FILTER_VER_CHROMA_SP_W8_H2 8, 4 - FILTER_VER_CHROMA_SP_W8_H2 8, 6 - FILTER_VER_CHROMA_SP_W8_H2 8, 8 - FILTER_VER_CHROMA_SP_W8_H2 8, 16 - FILTER_VER_CHROMA_SP_W8_H2 8, 32 - - FILTER_VER_CHROMA_SP_W8_H2 8, 12 - FILTER_VER_CHROMA_SP_W8_H2 8, 64 - - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W16n 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 - -.loop: - - mov r6d, %1/16 - -.loopW: - - movu m2, [r0] - movu m3, [r0 + r1] - - punpcklbw m4, m2, m3 - punpckhbw m2, m3 - - pmaddubsw m4, m1 - pmaddubsw m2, m1 - - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m7, [r5 + r1] - - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 - - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 - - mova m6, [pw_2000] - - psubw m4, m6 - psubw m2, m6 - - movu [r2], m4 - movu [r2 + 16], m2 - - punpcklbw m4, m3, m5 - punpckhbw m3, m5 - - pmaddubsw m4, m1 - pmaddubsw m3, m1 - - movu m5, [r5 + 2 * r1] - - punpcklbw m2, m7, m5 - punpckhbw m7, m5 - - pmaddubsw m2, m0 - pmaddubsw m7, m0 - - paddw m4, m2 - paddw m3, m7 - - psubw m4, m6 - psubw m3, m6 - - movu [r2 + r3], m4 - movu [r2 + 
r3 + 16], m3 - - add r0, 16 - add r2, 32 - dec r6d - jnz .loopW - - lea r0, [r0 + r1 * 2 - %1] - lea r2, [r2 + r3 * 2 - %1 * 2] - - dec r4d - jnz .loop - RET -%endmacro - - FILTER_V_PS_W16n 64, 64 - FILTER_V_PS_W16n 64, 32 - FILTER_V_PS_W16n 64, 48 - FILTER_V_PS_W16n 48, 64 - FILTER_V_PS_W16n 64, 16 - - -;------------------------------------------------------------------------------------------------------------ -;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m0, [tab_Cm] - - lea r5, [3 * r1] - - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m1, m5, m6 - punpcklbw m3, m1 - - pmaddubsw m3, m0 - phaddw m2, m3 - - mova m1, [pw_2000] - - psubw m2, m1 - - movd [r2], m2 - pextrd [r2 + r3], m2, 2 - - movd m2, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 - - pmaddubsw m4, m0 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 - - pmaddubsw m5, m0 - phaddw m4, m5 - psubw m4, m1 - - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pextrd [r2 + r3], m4, 2 - - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W2 2 -INIT_XMM sse4 
-cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 - - mov r4d, r4m - sub r0, r1 - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] -%else - movd m0, [tab_ChromaCoeff + r4 * 4] -%endif - - pshufb m0, [tab_Cm] - - mova m1, [pw_2000] - lea r5, [3 * r1] - mov r4d, %2/4 -.loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] - - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - - pmaddubsw m2, m0 - - lea r0, [r0 + 4 * r1] - movd m6, [r0] - - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 - - pmaddubsw m3, m0 - - phaddw m2, m3 - psubw m2, m1 - - - movd [r2], m2 - pshufd m2, m2, 2 - movd [r2 + r3], m2 - - movd m2, [r0 + r1] - - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 - - pmaddubsw m4, m0 - - movd m3, [r0 + 2 * r1] - - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 - - pmaddubsw m5, m0 - - phaddw m4, m5 - - psubw m4, m1 - - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pshufd m4 , m4 ,2 - movd [r2 + r3], m4 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loop - - RET -%endmacro - - FILTER_V_PS_W2 2, 8 - - FILTER_V_PS_W2 2, 16 - -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 - - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 
2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] - - dec dword [rsp] - jnz .loopH - - RET -%endmacro - - FILTER_VER_CHROMA_SS 4, 4 - FILTER_VER_CHROMA_SS 4, 8 - FILTER_VER_CHROMA_SS 16, 16 - FILTER_VER_CHROMA_SS 16, 8 - FILTER_VER_CHROMA_SS 16, 12 - FILTER_VER_CHROMA_SS 12, 16 - FILTER_VER_CHROMA_SS 16, 4 - FILTER_VER_CHROMA_SS 4, 16 - FILTER_VER_CHROMA_SS 32, 32 - FILTER_VER_CHROMA_SS 32, 16 - FILTER_VER_CHROMA_SS 16, 32 - FILTER_VER_CHROMA_SS 32, 24 - FILTER_VER_CHROMA_SS 24, 32 - FILTER_VER_CHROMA_SS 32, 8 - - FILTER_VER_CHROMA_SS 16, 24 - FILTER_VER_CHROMA_SS 12, 32 - FILTER_VER_CHROMA_SS 4, 32 - FILTER_VER_CHROMA_SS 32, 64 - FILTER_VER_CHROMA_SS 16, 64 - FILTER_VER_CHROMA_SS 32, 48 - FILTER_VER_CHROMA_SS 24, 64 - - FILTER_VER_CHROMA_SS 64, 64 - FILTER_VER_CHROMA_SS 64, 32 - FILTER_VER_CHROMA_SS 64, 48 - FILTER_VER_CHROMA_SS 48, 64 - FILTER_VER_CHROMA_SS 64, 16 - -%macro FILTER_VER_CHROMA_S_AVX2_4x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 - mov r4d, r4m - add r1d, r1d - shl r4d, 6 - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m6, [pd_526336] -%else - add r3d, r3d -%endif - - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - -%ifidn %1,sp - paddd m0, m6 - paddd m2, m6 - 
psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 - vextracti128 xm2, m0, 1 - lea r4, [r3 * 3] - -%ifidn %1,sp - packuswb xm0, xm2 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r4], xm0, 3 -%else - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_4x4 sp - FILTER_VER_CHROMA_S_AVX2_4x4 ss - -%macro FILTER_VER_CHROMA_S_AVX2_4x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - lea r4, [r3 * 3] - -%ifidn %1,sp - paddd m0, m7 - paddd m2, m7 - paddd m4, m7 - paddd m1, m7 - psrad m0, 12 - psrad m2, 12 - psrad m4, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad 
m2, 6 - psrad m4, 6 - psrad m1, 6 -%endif - packssdw m0, m2 - packssdw m4, m1 -%ifidn %1,sp - packuswb m0, m4 - vextracti128 xm2, m0, 1 - movd [r2], xm0 - movd [r2 + r3], xm2 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 3 - pextrd [r2 + r4], xm2, 3 -%else - vextracti128 xm2, m0, 1 - vextracti128 xm1, m4, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r4], xm1 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_4x8 sp - FILTER_VER_CHROMA_S_AVX2_4x8 ss - -%macro PROCESS_CHROMA_AVX2_W4_16R 1 - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m1, m3 - pmaddwd m6, [r5] - -%ifidn %1,sp - paddd m0, m7 - paddd m2, m7 - paddd m4, m7 - paddd m1, m7 - psrad m4, 12 - psrad m1, 12 - psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 - psrad m4, 6 - psrad m1, 6 -%endif - 
packssdw m0, m2 - packssdw m4, m1 -%ifidn %1,sp - packuswb m0, m4 - vextracti128 xm4, m0, 1 - movd [r2], xm0 - movd [r2 + r3], xm4 - pextrd [r2 + r3 * 2], xm0, 1 - pextrd [r2 + r6], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm4, 2 - pextrd [r2 + r3 * 2], xm0, 3 - pextrd [r2 + r6], xm4, 3 -%else - vextracti128 xm2, m0, 1 - vextracti128 xm1, m4, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 -%endif - - movq xm2, [r0 + r4] - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] - pmaddwd m2, m5, [r5 + 1 * mmsize] - paddd m6, m2 - pmaddwd m5, [r5] - movq xm2, [r0 + r1] - punpcklwd xm0, xm2 - movq xm3, [r0 + 2 * r1] - punpcklwd xm2, xm3 - vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m0, [r5] - movq xm4, [r0 + r4] - punpcklwd xm3, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] - punpcklwd xm4, xm1 - vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] - pmaddwd m4, m3, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m3, [r5] - movq xm4, [r0 + r1] - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] - pmaddwd m1, [r5 + 1 * mmsize] - paddd m3, m1 - -%ifidn %1,sp - paddd m6, m7 - paddd m5, m7 - paddd m0, m7 - paddd m3, m7 - psrad m6, 12 - psrad m5, 12 - psrad m0, 12 - psrad m3, 12 -%else - psrad m6, 6 - psrad m5, 6 - psrad m0, 6 - psrad m3, 6 -%endif - packssdw m6, m5 - packssdw m0, m3 - lea r2, [r2 + r3 * 4] - -%ifidn %1,sp - packuswb m6, m0 - vextracti128 xm0, m6, 1 - movd [r2], xm6 - movd [r2 + r3], xm0 - pextrd [r2 + r3 * 2], xm6, 1 - pextrd [r2 + r6], xm0, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm6, 2 - pextrd [r2 + r3], xm0, 2 - pextrd [r2 + r3 * 2], xm6, 3 - pextrd [r2 + r6], xm0, 3 
-%else - vextracti128 xm5, m6, 1 - vextracti128 xm3, m0, 1 - movq [r2], xm6 - movq [r2 + r3], xm5 - movhps [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm5 - lea r2, [r2 + r3 * 4] - movq [r2], xm0 - movq [r2 + r3], xm3 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm3 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_S_AVX2_4x16 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - PROCESS_CHROMA_AVX2_W4_16R %1 - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_4x16 sp - FILTER_VER_CHROMA_S_AVX2_4x16 ss - -%macro FILTER_VER_CHROMA_S_AVX2_4x32 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x32, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep 2 - PROCESS_CHROMA_AVX2_W4_16R %1 - lea r2, [r2 + r3 * 4] -%endrep - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_4x32 sp - FILTER_VER_CHROMA_S_AVX2_4x32 ss - -%macro FILTER_VER_CHROMA_S_AVX2_4x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m5, [pd_526336] -%else - add r3d, r3d -%endif - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - movq xm4, [r0 + 4 * r1] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 
-%ifidn %1,sp - paddd m0, m5 - psrad m0, 12 -%else - psrad m0, 6 -%endif - vextracti128 xm1, m0, 1 - packssdw xm0, xm1 -%ifidn %1,sp - packuswb xm0, xm0 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 -%else - movq [r2], xm0 - movhps [r2 + r3], xm0 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_4x2 sp - FILTER_VER_CHROMA_S_AVX2_4x2 ss - -%macro FILTER_VER_CHROMA_S_AVX2_2x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] -%ifidn %1,sp - mova m5, [pd_526336] -%else - add r3d, r3d -%endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 -%ifidn %1,sp - paddd m0, m5 - psrad m0, 12 -%else - psrad m0, 6 -%endif - vextracti128 xm1, m0, 1 - packssdw xm0, xm1 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb xm0, xm0 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm0, 2 - pextrw [r2 + r4], xm0, 3 -%else - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - pextrd [r2 + 2 * r3], xm0, 2 - pextrd [r2 + r4], xm0, 3 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_2x4 sp - FILTER_VER_CHROMA_S_AVX2_2x4 ss - -%macro FILTER_VER_CHROMA_S_AVX2_8x8 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] 
-%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] - paddd m1, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm3, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm3, 1 - pmaddwd m3, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m3 - - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 
2], xm2 - movu [r2 + r4], xm3 -%endif - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 - -%ifidn %1,sp - paddd m6, m7 - paddd m1, m7 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r4], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm1 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x8 sp - FILTER_VER_CHROMA_S_AVX2_8x8 ss - -%macro PROCESS_CHROMA_S_AVX2_W8_16R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * 
mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] -%ifidn %1,sp - paddd m0, m9 - paddd m1, m9 - paddd m2, m9 - paddd m3, m9 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 -%else - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m0, m1 - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhwd xm0, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm0, 1 - pmaddwd m0, m7, [r5 + 1 * mmsize] - paddd m5, m0 - pmaddwd m7, [r5] - movu xm0, [r7 + r1] ; m0 = row 9 - punpckhwd xm1, xm8, xm0 - punpcklwd xm8, xm0 - vinserti128 m8, m8, xm1, 1 - pmaddwd m1, m8, [r5 + 1 * mmsize] - paddd m6, m1 - pmaddwd m8, [r5] - movu xm1, [r7 + r1 * 2] ; m1 = row 10 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m7, m2 - pmaddwd m0, [r5] -%ifidn %1,sp - paddd m4, m9 - paddd m5, m9 - psrad m4, 12 - psrad m5, 12 - paddd m6, m9 - paddd m7, m9 - psrad m6, 12 - psrad m7, 12 -%else - psrad m4, 6 - psrad m5, 6 - psrad m6, 6 - psrad 
m7, 6 -%endif - packssdw m4, m5 - packssdw m6, m7 - lea r8, [r2 + r3 * 4] -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm5 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%endif - - movu xm2, [r7 + r4] ; m2 = row 11 - punpckhwd xm4, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm4, 1 - pmaddwd m4, m1, [r5 + 1 * mmsize] - paddd m8, m4 - pmaddwd m1, [r5] - lea r7, [r7 + r1 * 4] - movu xm4, [r7] ; m4 = row 12 - punpckhwd xm5, xm2, xm4 - punpcklwd xm2, xm4 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m0, m5 - pmaddwd m2, [r5] - movu xm5, [r7 + r1] ; m5 = row 13 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m1, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 14 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m2, m7 - pmaddwd m5, [r5] -%ifidn %1,sp - paddd m8, m9 - paddd m0, m9 - paddd m1, m9 - paddd m2, m9 - psrad m8, 12 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 -%else - psrad m8, 6 - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 -%endif - packssdw m8, m0 - packssdw m1, m2 - lea r8, [r8 + r3 * 4] -%ifidn %1,sp - packuswb m8, m1 - vpermd m8, m3, m8 - vextracti128 xm1, m8, 1 - movq [r8], xm8 - movhps [r8 + r3], xm8 - movq [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm1 -%else - vpermq m8, m8, 11011000b - vpermq m1, m1, 11011000b - vextracti128 xm0, m8, 1 - vextracti128 xm2, m1, 1 - movu [r8], xm8 - movu [r8 + r3], xm0 - movu [r8 + r3 * 2], xm1 - movu [r8 + r6], xm2 -%endif - lea r8, [r8 + r3 * 4] - - movu xm7, [r7 + r4] ; m7 = row 15 - punpckhwd xm2, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddwd m2, 
m6, [r5 + 1 * mmsize] - paddd m4, m2 - pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhwd xm1, xm7, xm2 - punpcklwd xm7, xm2 - vinserti128 m7, m7, xm1, 1 - pmaddwd m1, m7, [r5 + 1 * mmsize] - paddd m5, m1 - pmaddwd m7, [r5] - movu xm1, [r7 + r1] ; m1 = row 17 - punpckhwd xm0, xm2, xm1 - punpcklwd xm2, xm1 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m6, m2 - movu xm0, [r7 + r1 * 2] ; m0 = row 18 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m7, m1 - -%ifidn %1,sp - paddd m4, m9 - paddd m5, m9 - paddd m6, m9 - paddd m7, m9 - psrad m4, 12 - psrad m5, 12 - psrad m6, 12 - psrad m7, 12 -%else - psrad m4, 6 - psrad m5, 6 - psrad m6, 6 - psrad m7, 6 -%endif - packssdw m4, m5 - packssdw m6, m7 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm5 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - mov r9d, %2 / 8 -.loopW: - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - dec r9d - jnz .loopW - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64 - FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 - FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 - 
FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64 - -%macro FILTER_VER_CHROMA_S_AVX2_NxN 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %3,sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 8 -.loopW: - PROCESS_CHROMA_S_AVX2_W8_16R %3 -%ifidn %3,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - dec r10d - jnz .loopW - lea r0, [r7 - 2 * %1 + 16] -%ifidn %3,sp - lea r2, [r8 + r3 * 4 - %1 + 8] -%else - lea r2, [r8 + r3 * 4 - 2 * %1 + 16] -%endif - dec r9d - jnz .loopH - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, sp - FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, ss - FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp - FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp - FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss - FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss - FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss - -%macro PROCESS_CHROMA_S_AVX2_W8_4R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - 
vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 -%else - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m0, m1 - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_S_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - - PROCESS_CHROMA_S_AVX2_W8_4R %1 - lea r4, [r3 * 3] -%ifidn %1,sp - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 -%else - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x4 sp - FILTER_VER_CHROMA_S_AVX2_8x4 ss - -%macro FILTER_VER_CHROMA_S_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 
1 -cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - mova m7, m9 - PROCESS_CHROMA_AVX2_W4_16R %1 - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_12x16 sp - FILTER_VER_CHROMA_S_AVX2_12x16 ss - -%macro FILTER_VER_CHROMA_S_AVX2_12x32 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_12x32, 4, 9, 10 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1, sp - mova m9, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep 2 - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1, sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 - mova m7, m9 - PROCESS_CHROMA_AVX2_W4_16R %1 - sub r0, 16 -%ifidn %1, sp - lea r2, [r2 + r3 * 4 - 8] -%else - lea r2, [r2 + r3 * 4 - 16] -%endif -%endrep - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_12x32 sp - FILTER_VER_CHROMA_S_AVX2_12x32 ss - -%macro FILTER_VER_CHROMA_S_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m8, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep 2 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd 
m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, m8 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - lea r8, [r2 + r3 * 4] - - movu xm1, [r7 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r7, [r7 + r1 * 4] - movu xm0, [r7] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r7 + 
r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r7 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] - -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm1 -%endif - lea r8, [r8 + r3 * 4] - - movu xm7, [r7 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r7, [r7 + r1 * 4] - movu xm1, [r7] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m2, m8 - psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 - - movu xm4, [r7 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 - movu xm2, [r7 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m7, m4 -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 -%else - psrad m5, 6 - psrad m7, 6 -%endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 - movq [r8], xm0 - movhps [r8 + r3], xm0 - movq [r8 + 
r3 * 2], xm5 - movhps [r8 + r6], xm5 - add r2, 8 -%else - vpermq m0, m0, 11011000b - vpermq m5, m5, 11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r8], xm0 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm5 - movu [r8 + r6], xm6 - add r2, 16 -%endif - add r0, 16 -%endrep - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_16x12 sp - FILTER_VER_CHROMA_S_AVX2_16x12 ss - -%macro FILTER_VER_CHROMA_S_AVX2_8x12 1 -%if ARCH_X86_64 == 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m8, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, 
m8 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] - -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm1 -%endif - lea r2, [r2 + r3 * 4] - - movu xm7, [r0 + r4] ; m7 = row 11 - punpckhwd 
xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r0, [r0 + r1 * 4] - movu xm1, [r0] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m2, m8 - psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 - - movu xm4, [r0 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 - movu xm2, [r0 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m7, m4 -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 -%else - psrad m5, 6 - psrad m7, 6 -%endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 -%else - vpermq m0, m0, 11011000b - vpermq m5, m5, 11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r2], xm0 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm5 - movu [r2 + r6], xm6 -%endif - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x12 sp - FILTER_VER_CHROMA_S_AVX2_8x12 ss - -%macro FILTER_VER_CHROMA_S_AVX2_16x4 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif -%rep 2 - PROCESS_CHROMA_S_AVX2_W8_4R %1 - lea r6, [r3 * 3] -%ifidn %1,sp - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 - add r2, 8 -%else - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 
- movu [r2 + r6], xm3 - add r2, 16 -%endif - lea r6, [4 * r1 - 16] - sub r0, r6 -%endrep - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_16x4 sp - FILTER_VER_CHROMA_S_AVX2_16x4 ss - -%macro PROCESS_CHROMA_S_AVX2_W8_8R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 -%endif - lea r8, [r2 + r3 * 4] - - movu xm1, [r7 + r4] ; m1 = row 7 
- punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r7, [r7 + r1 * 4] - movu xm0, [r7] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r7 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r7 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 - -%ifidn %1,sp - paddd m6, m7 - paddd m1, m7 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r8], xm4 - movhps [r8 + r3], xm4 - movq [r8 + r3 * 2], xm6 - movhps [r8 + r6], xm6 -%else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm7 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm1 -%endif -%endmacro - -%macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep %2 / 8 - PROCESS_CHROMA_S_AVX2_W8_8R %1 -%ifidn %1,sp - add r2, 8 -%else - add r2, 16 -%endif - add r0, 16 -%endrep - RET -%endif -%endmacro - - FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 - 
FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 - -%macro FILTER_VER_CHROMA_S_AVX2_8x2 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m5, [pd_526336] -%else - add r3d, r3d -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movu xm4, [r0 + r1 * 4] ; m4 = row 4 - punpckhwd xm2, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddwd m3, [r5 + 1 * mmsize] - paddd m1, m3 -%ifidn %1,sp - paddd m0, m5 - paddd m1, m5 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 -%ifidn %1,sp - vextracti128 xm1, m0, 1 - packuswb xm0, xm1 - pshufd xm0, xm0, 11011000b - movq [r2], xm0 - movhps [r2 + r3], xm0 -%else - vpermq m0, m0, 11011000b - vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x2 sp - FILTER_VER_CHROMA_S_AVX2_8x2 ss - -%macro FILTER_VER_CHROMA_S_AVX2_8x6 1 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 -%else - lea r5, [pw_ChromaCoeffV + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else - add r3d, r3d -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 
vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] - paddd m1, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r7 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - 
vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm3, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm3, 1 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m4, m6 - movu xm6, [r0 + r1 * 4] ; m6 = row 8 - punpckhwd xm3, xm1, xm6 - punpcklwd xm1, xm6 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m5, m1 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - vextracti128 xm5, m4, 1 - packuswb xm4, xm5 - pshufd xm4, xm4, 11011000b - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movhps [r2 + r3], xm4 -%else +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = 
word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2 + r3], xm0 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 -%endif - RET -%endmacro - - FILTER_VER_CHROMA_S_AVX2_8x6 sp - FILTER_VER_CHROMA_S_AVX2_8x6 ss - -%macro FILTER_VER_CHROMA_S_AVX2_8xN 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - -%ifdef PIC - lea r5, [pw_ChromaCoeffV] - add r5, r4 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 %else - lea r5, [pw_ChromaCoeffV + r4] + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r8], m8 + movu [r8 + r3], m9 + movu [r8 + r3 * 2], m10 + movu [r8 + r6], m11 + lea r8, [r8 + r3 * 4] + movu [r8], m12 + movu [r8 + r3], m13 + movu [r8 + r3 * 2], m0 + movu [r8 + r6], m1 %endif +%endmacro - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m8, [pd_526336] -%else - add r3d, r3d -%endif - lea r6, [r3 * 3] -%rep %2 / 16 +%macro PROCESS_LUMA_AVX2_W16_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 + punpckhbw xm2, xm0, xm1 + 
punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] + pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] + pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, m8 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - mova m3, [interp8_hps_shuf] - vpermd m0, m3, m0 - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 -%else + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw 
m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + 
pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - movu [r2], xm0 - vextracti128 xm0, m0, 1 + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - movu [r2 + r3], xm0 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 -%endif - lea r2, [r2 + r3 * 4] - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m0 - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] - -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 -%else - psrad m6, 6 - psrad m1, 6 -%endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vpermd m4, m3, m4 - vextracti128 xm6, m4, 1 - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 %else - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b - vextracti128 xm7, m4, 1 - vextracti128 xm1, m6, 1 - movu [r2], xm4 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm1 + psubw 
m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 %endif - lea r2, [r2 + r3 * 4] - movu xm7, [r0 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r0, [r0 + r1 * 4] - movu xm1, [r0] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m2, m8 - psrad m0, 12 - psrad m2, 12 -%else - psrad m0, 6 - psrad m2, 6 -%endif - packssdw m0, m2 + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 - movu xm4, [r0 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m1, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m7, m6 - pmaddwd m4, [r5] -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %else - psrad m5, 6 - psrad m7, 6 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + 
movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 %endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vpermd m0, m3, m0 - vextracti128 xm5, m0, 1 - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 +%endmacro + +%macro FILTER_VER_LUMA_AVX2_24x32 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - vpermq m0, m0, 11011000b - vpermq m5, m5, 11011000b - vextracti128 xm7, m0, 1 - vextracti128 xm6, m5, 1 - movu [r2], xm0 - movu [r2 + r3], xm7 - movu [r2 + r3 * 2], xm5 - movu [r2 + r6], xm6 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r2, [r2 + r3 * 4] - - movu xm6, [r0 + r4] ; m6 = row 15 - punpckhwd xm5, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 16 - punpckhwd xm5, xm6, xm0 - punpcklwd xm6, xm0 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m6, [r5] -%ifidn %1,sp - paddd m1, m8 - paddd m4, m8 - psrad m1, 12 - psrad m4, 12 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] %else - psrad m1, 6 - psrad m4, 6 + mova m14, [pw_512] %endif - packssdw m1, m4 - - movu xm5, [r0 + r1] ; m5 = row 17 - punpckhwd xm4, xm0, xm5 - punpcklwd xm0, xm5 - vinserti128 m0, m0, xm4, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m2, m0 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm0, xm5, xm4 - punpcklwd xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m6, m5 -%ifidn %1,sp - paddd m2, m8 - paddd m6, m8 - psrad m2, 12 - psrad m6, 12 + lea r6, [r3 * 3] + lea r10, [r1 * 4] + mov r9d, 2 +.loopH: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - psrad m2, 6 - psrad m6, 6 + add r2, 32 %endif - packssdw m2, m6 -%ifidn %1,sp - packuswb m1, m2 - vpermd m1, m3, m1 - 
vextracti128 xm2, m1, 1 - movq [r2], xm1 - movhps [r2 + r3], xm1 - movq [r2 + r3 * 2], xm2 + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm7, [r7] ; m7 = row 12 + punpcklbw xm3, xm7 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m3, m6, [r5 + 2 * mmsize] + paddw m4, m3 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm7, xm3 + movq xm8, [r7 + r1 * 2] ; m8 = row 14 + punpcklbw xm3, xm8 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m3, m7, [r5 + 3 * 
mmsize] + paddw m4, m3 + pmaddubsw m3, m7, [r5 + 2 * mmsize] + paddw m0, m3 + pmaddubsw m3, m7, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m7, [r5] + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm8, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 16 + punpcklbw xm3, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddubsw m3, m8, [r5 + 3 * mmsize] + paddw m0, m3 + pmaddubsw m3, m8, [r5 + 2 * mmsize] + paddw m6, m3 + pmaddubsw m3, m8, [r5 + 1 * mmsize] + paddw m7, m3 + pmaddubsw m8, [r5] + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 18 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m6, m3 + pmaddubsw m3, m9, [r5 + 2 * mmsize] + paddw m7, m3 + pmaddubsw m3, m9, [r5 + 1 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r4] ; m3 = row 19 + punpcklbw xm10, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 20 + punpcklbw xm3, xm9 + vinserti128 m10, m10, xm3, 1 + pmaddubsw m3, m10, [r5 + 3 * mmsize] + paddw m7, m3 + pmaddubsw m3, m10, [r5 + 2 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r1] ; m3 = row 21 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 22 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m8, m3 +%ifidn %1,pp + pmulhrsw m5, m14 ; m5 = word: row 0, row 1 + pmulhrsw m2, m14 ; m2 = word: row 2, row 3 + pmulhrsw m1, m14 ; m1 = word: row 4, row 5 + pmulhrsw m4, m14 ; m4 = word: row 6, row 7 + pmulhrsw m0, m14 ; m0 = word: row 8, row 9 + pmulhrsw m6, m14 ; m6 = word: row 10, row 11 + pmulhrsw m7, m14 ; m7 = word: row 12, row 13 + pmulhrsw m8, m14 ; m8 = word: row 14, row 15 + packuswb m5, m2 + packuswb m1, m4 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m0, 1 + vextracti128 xm8, m7, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + 
r3 * 2], xm1 + movhps [r8 + r6], xm4 + lea r8, [r8 + r3 * 4] + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm7 + movq [r8 + r3], xm8 + movhps [r8 + r3 * 2], xm7 + movhps [r8 + r6], xm8 %else - vpermq m1, m1, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm6, m1, 1 - vextracti128 xm4, m2, 1 - movu [r2], xm1 - movu [r2 + r3], xm6 + psubw m5, m14 ; m5 = word: row 0, row 1 + psubw m2, m14 ; m2 = word: row 2, row 3 + psubw m1, m14 ; m1 = word: row 4, row 5 + psubw m4, m14 ; m4 = word: row 6, row 7 + psubw m0, m14 ; m0 = word: row 8, row 9 + psubw m6, m14 ; m6 = word: row 10, row 11 + psubw m7, m14 ; m7 = word: row 12, row 13 + psubw m8, m14 ; m8 = word: row 14, row 15 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm4 + movu [r2 + r6], xm3 + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 + vextracti128 xm3, m0, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m7, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm7 + movu [r8 + r3], xm3 + vextracti128 xm3, m8, 1 + movu [r8 + r3 * 2], xm8 + movu [r8 + r6], xm3 %endif - lea r2, [r2 + r3 * 4] -%endrep + sub r7, r10 + lea r0, [r7 - 16] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + dec r9d + jnz .loopH RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 - FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 - FILTER_VER_CHROMA_S_AVX2_8xN sp, 64 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 - FILTER_VER_CHROMA_S_AVX2_8xN ss, 64 + FILTER_VER_LUMA_AVX2_24x32 pp + FILTER_VER_LUMA_AVX2_24x32 ps -%macro FILTER_VER_CHROMA_S_AVX2_Nx24 2 -%if ARCH_X86_64 == 1 +%macro 
FILTER_VER_LUMA_AVX2_32xN 3 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m9, [pd_526336] -%else + sub r0, r4 +%ifidn %3,ps add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] %endif lea r6, [r3 * 3] - mov r9d, %2 / 8 + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 .loopW: - PROCESS_CHROMA_S_AVX2_W8_16R %1 -%ifidn %1,sp - add r2, 8 -%else + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp add r2, 16 +%else + add r2, 32 %endif add r0, 16 - dec r9d + dec r10d jnz .loopW -%ifidn %1,sp - lea r2, [r8 + r3 * 4 - %2 + 8] -%else - lea r2, [r8 + r3 * 4 - 2 * %2 + 16] -%endif - lea r0, [r7 - 2 * %2 + 16] - mova m7, m9 - mov r9d, %2 / 8 -.loop: - PROCESS_CHROMA_S_AVX2_W8_8R %1 -%ifidn %1,sp - add r2, 8 + sub r7, r11 + lea r0, [r7 - 16] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 16] %else - add r2, 16 + lea r2, [r8 + r3 * 4 - 32] %endif - add r0, 16 dec r9d - jnz .loop + jnz .loopH RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32 - FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16 - FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32 - FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16 + FILTER_VER_LUMA_AVX2_32xN 32, 32, pp + FILTER_VER_LUMA_AVX2_32xN 32, 64, pp + FILTER_VER_LUMA_AVX2_32xN 32, 32, ps + FILTER_VER_LUMA_AVX2_32xN 32, 64, ps -%macro FILTER_VER_CHROMA_S_AVX2_2x8 1 +%macro FILTER_VER_LUMA_AVX2_32x16 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea 
r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] -%ifidn %1,sp - mova m6, [pd_526336] -%else + sub r0, r4 +%ifidn %1,ps add r3d, r3d -%endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movd xm1, [r0 + r4] - punpcklwd xm3, xm1 - lea r0, [r0 + 4 * r1] - movd xm2, [r0] - punpcklwd xm1, xm2 - punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] - vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] - movd xm1, [r0 + r1] - punpcklwd xm2, xm1 - movd xm5, [r0 + r1 * 2] - punpcklwd xm1, xm5 - punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] - vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] - pmaddwd m4, [r5] - pmaddwd m3, [r5 + 1 * mmsize] - paddd m4, m3 -%ifidn %1,sp - paddd m0, m6 - paddd m4, m6 - psrad m0, 12 - psrad m4, 12 + vbroadcasti128 m14, [pw_2000] %else - psrad m0, 6 - psrad m4, 6 + mova m14, [pw_512] %endif - packssdw m0, m4 - vextracti128 xm4, m0, 1 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb xm0, xm4 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm0, 4 - pextrw [r2 + r4], xm0, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 2 - pextrw [r2 + r3], xm0, 3 - pextrw [r2 + 2 * r3], xm0, 6 - pextrw [r2 + r4], xm0, 7 + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + 2 * r3], xm4 - pextrd [r2 + r4], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 - pextrd [r2 + 2 * r3], xm4, 2 - 
pextrd [r2 + r4], xm4, 3 + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_2x8 sp - FILTER_VER_CHROMA_S_AVX2_2x8 ss + FILTER_VER_LUMA_AVX2_32x16 pp + FILTER_VER_LUMA_AVX2_32x16 ps -%macro FILTER_VER_CHROMA_S_AVX2_2x16 1 -%if ARCH_X86_64 == 1 +%macro FILTER_VER_LUMA_AVX2_32x24 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d - sub r0, r1 - + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - lea r4, [r1 * 3] -%ifidn %1,sp - mova m6, [pd_526336] -%else + sub r0, r4 +%ifidn %1,ps add r3d, r3d %endif - movd xm0, [r0] - movd xm1, [r0 + r1] - punpcklwd xm0, xm1 - movd xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] - movd xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movd xm4, [r0] - punpcklwd xm3, xm4 - punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] - vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] - movd xm1, [r0 + r1] - punpcklwd xm4, xm1 - movd xm3, [r0 + r1 * 2] - punpcklwd xm1, xm3 - punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] - vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] - pmaddwd m0, [r5] - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movd xm1, [r0 + r4] - punpcklwd xm3, xm1 - lea r0, [r0 + 4 * r1] - movd xm2, [r0] - punpcklwd xm1, xm2 - punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] - vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] - movd xm1, [r0 + r1] - punpcklwd xm2, xm1 - movd xm5, [r0 + r1 * 2] - punpcklwd xm1, xm5 - punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] - vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] - pmaddwd m4, [r5] - pmaddwd m3, [r5 + 1 * mmsize] - paddd m4, m3 - movd xm1, [r0 + r4] - punpcklwd xm5, xm1 - lea r0, [r0 + 4 * r1] - movd xm3, [r0] - punpcklwd xm1, xm3 - punpcklqdq xm5, xm1 ; m5 = [12 11 11 10] - 
vinserti128 m2, m2, xm5, 1 ; m2 = [12 11 11 10 10 9 9 8] - movd xm1, [r0 + r1] - punpcklwd xm3, xm1 - movd xm7, [r0 + r1 * 2] - punpcklwd xm1, xm7 - punpcklqdq xm3, xm1 ; m3 = [14 13 13 12] - vinserti128 m5, m5, xm3, 1 ; m5 = [14 13 13 12 12 11 11 10] - pmaddwd m2, [r5] - pmaddwd m5, [r5 + 1 * mmsize] - paddd m2, m5 - movd xm5, [r0 + r4] - punpcklwd xm7, xm5 - lea r0, [r0 + 4 * r1] - movd xm1, [r0] - punpcklwd xm5, xm1 - punpcklqdq xm7, xm5 ; m7 = [16 15 15 14] - vinserti128 m3, m3, xm7, 1 ; m3 = [16 15 15 14 14 13 13 12] - movd xm5, [r0 + r1] - punpcklwd xm1, xm5 - movd xm8, [r0 + r1 * 2] - punpcklwd xm5, xm8 - punpcklqdq xm1, xm5 ; m1 = [18 17 17 16] - vinserti128 m7, m7, xm1, 1 ; m7 = [18 17 17 16 16 15 15 14] - pmaddwd m3, [r5] - pmaddwd m7, [r5 + 1 * mmsize] - paddd m3, m7 -%ifidn %1,sp - paddd m0, m6 - paddd m4, m6 - paddd m2, m6 - paddd m3, m6 - psrad m0, 12 - psrad m4, 12 - psrad m2, 12 - psrad m3, 12 + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] %else - psrad m0, 6 - psrad m4, 6 - psrad m2, 6 - psrad m3, 6 + vbroadcasti128 m14, [pw_2000] %endif - packssdw m0, m4 - packssdw m2, m3 - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - vextracti128 xm2, m0, 1 - pextrw [r2], xm0, 0 - pextrw [r2 + r3], xm0, 1 - pextrw [r2 + 2 * r3], xm2, 0 - pextrw [r2 + r4], xm2, 1 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 2 - pextrw [r2 + r3], xm0, 3 - pextrw [r2 + 2 * r3], xm2, 2 - pextrw [r2 + r4], xm2, 3 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 4 - pextrw [r2 + r3], xm0, 5 - pextrw [r2 + 2 * r3], xm2, 4 - pextrw [r2 + r4], xm2, 5 - lea r2, [r2 + r3 * 4] - pextrw [r2], xm0, 6 - pextrw [r2 + r3], xm0, 7 - pextrw [r2 + 2 * r3], xm2, 6 - pextrw [r2 + r4], xm2, 7 + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - vextracti128 xm4, m0, 1 - vextracti128 xm3, m2, 1 - movd [r2], xm0 - pextrd [r2 + r3], xm0, 1 - movd [r2 + 2 * r3], xm4 - pextrd [r2 + r4], xm4, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm0, 2 - pextrd [r2 + r3], xm0, 3 
- pextrd [r2 + 2 * r3], xm4, 2 - pextrd [r2 + r4], xm4, 3 - lea r2, [r2 + r3 * 4] - movd [r2], xm2 - pextrd [r2 + r3], xm2, 1 - movd [r2 + 2 * r3], xm3 - pextrd [r2 + r4], xm3, 1 - lea r2, [r2 + r3 * 4] - pextrd [r2], xm2, 2 - pextrd [r2 + r3], xm2, 3 - pextrd [r2 + 2 * r3], xm3, 2 - pextrd [r2 + r4], xm3, 3 + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 16] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + mov r9d, 2 +.loop: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loop RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_2x16 sp - FILTER_VER_CHROMA_S_AVX2_2x16 ss + FILTER_VER_LUMA_AVX2_32x24 pp + FILTER_VER_LUMA_AVX2_32x24 ps -%macro FILTER_VER_CHROMA_S_AVX2_6x8 1 +%macro FILTER_VER_LUMA_AVX2_32x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x8, 4, 10, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m7, [pd_526336] -%else + sub r0, r4 +%ifidn %1,ps add r3d, r3d %endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, 
[r5] - paddd m1, m5 -%ifidn %1,sp - paddd m0, m7 - paddd m1, m7 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m7 - paddd m3, m7 - psrad m2, 12 - psrad m3, 12 -%else - psrad m2, 6 - psrad m3, 6 -%endif - packssdw m2, m3 - - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm3, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm3, 1 - pmaddwd m3, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m3 - - lea r4, [r3 * 3] -%ifidn %1,sp - packuswb m0, m2 - vextracti128 xm2, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r4], xm0, 3 - pextrw [r2 + r4 + 4], xm2, 6 -%else - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r4], xm2 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm3 - pextrd [r2 + r4 + 8], xm3, 2 -%endif - lea r2, [r2 + r3 * 4] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m7 - paddd m5, m7 - psrad m4, 12 - psrad m5, 12 -%else - psrad m4, 6 - psrad m5, 6 -%endif - packssdw m4, m5 - - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m6, m0 - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm0, xm2, 
xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m1, m2 - -%ifidn %1,sp - paddd m6, m7 - paddd m1, m7 - psrad m6, 12 - psrad m1, 12 + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] %else - psrad m6, 6 - psrad m1, 6 + vbroadcasti128 m14, [pw_2000] %endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vextracti128 xm6, m4, 1 - movd [r2], xm4 - pextrw [r2 + 4], xm6, 0 - pextrd [r2 + r3], xm4, 1 - pextrw [r2 + r3 + 4], xm6, 2 - pextrd [r2 + r3 * 2], xm4, 2 - pextrw [r2 + r3 * 2 + 4], xm6, 4 - pextrd [r2 + r4], xm4, 3 - pextrw [r2 + r4 + 4], xm6, 6 + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 %else - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r4], xm6 - vextracti128 xm5, m4, 1 - vextracti128 xm1, m6, 1 - movd [r2 + 8], xm5 - pextrd [r2 + r3 + 8], xm5, 2 - movd [r2 + r3 * 2 + 8], xm1 - pextrd [r2 + r4 + 8], xm1, 2 + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro - FILTER_VER_CHROMA_S_AVX2_6x8 sp - FILTER_VER_CHROMA_S_AVX2_6x8 ss + FILTER_VER_LUMA_AVX2_32x8 pp + FILTER_VER_LUMA_AVX2_32x8 ps -%macro FILTER_VER_CHROMA_S_AVX2_6x16 1 -%if ARCH_X86_64 == 1 +%macro FILTER_VER_LUMA_AVX2_48x64 1 INIT_YMM avx2 -cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_48x64, 4, 12, 15 mov r4d, r4m - shl r4d, 6 - add r1d, r1d + shl r4d, 7 %ifdef PIC - lea r5, [pw_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else - lea r5, [pw_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,sp - mova m8, [pd_526336] -%else + sub r0, r4 + +%ifidn %1,ps add r3d, r3d %endif + lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 
m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m1, m8 - psrad m0, 12 - psrad m1, 12 -%else - psrad m0, 6 - psrad m1, 6 -%endif - packssdw m0, m1 + lea r11, [r1 * 4] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm1, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m1 -%ifidn %1,sp - paddd m2, m8 - paddd m3, m8 - psrad m2, 12 - psrad m3, 12 +%ifidn %1,pp + mova m14, [pw_512] %else - psrad m2, 6 - psrad m3, 6 + vbroadcasti128 m14, [pw_2000] %endif - packssdw m2, m3 -%ifidn %1,sp - packuswb m0, m2 - vextracti128 xm2, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r6], xm0, 3 - pextrw [r2 + r6 + 4], xm2, 6 + + mov r9d, 4 +.loopH: + mov r10d, 3 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 - vextracti128 xm0, m0, 1 - vextracti128 xm3, m2, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm3 - pextrd [r2 + r6 + 8], xm3, 2 + add r2, 32 %endif - lea r2, [r2 + r3 * 4] - movu xm1, [r0 + r4] ; m1 = row 7 - punpckhwd xm0, xm6, xm1 - punpcklwd xm6, xm1 - vinserti128 m6, m6, xm0, 1 - pmaddwd m0, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - 
paddd m4, m0 - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 8 - punpckhwd xm2, xm1, xm0 - punpcklwd xm1, xm0 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - pmaddwd m1, [r5] - paddd m5, m2 -%ifidn %1,sp - paddd m4, m8 - paddd m5, m8 - psrad m4, 12 - psrad m5, 12 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 32] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 32] %else - psrad m4, 6 - psrad m5, 6 + lea r2, [r8 + r3 * 4 - 64] %endif - packssdw m4, m5 + dec r9d + jnz .loopH + RET +%endif +%endmacro - movu xm2, [r0 + r1] ; m2 = row 9 - punpckhwd xm5, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm5, 1 - pmaddwd m5, m0, [r5 + 1 * mmsize] - paddd m6, m5 - pmaddwd m0, [r5] - movu xm5, [r0 + r1 * 2] ; m5 = row 10 - punpckhwd xm7, xm2, xm5 - punpcklwd xm2, xm5 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 1 * mmsize] - paddd m1, m7 - pmaddwd m2, [r5] + FILTER_VER_LUMA_AVX2_48x64 pp + FILTER_VER_LUMA_AVX2_48x64 ps -%ifidn %1,sp - paddd m6, m8 - paddd m1, m8 - psrad m6, 12 - psrad m1, 12 +%macro FILTER_VER_LUMA_AVX2_64xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - psrad m6, 6 - psrad m1, 6 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - packssdw m6, m1 -%ifidn %1,sp - packuswb m4, m6 - vextracti128 xm6, m4, 1 - movd [r2], xm4 - pextrw [r2 + 4], xm6, 0 - pextrd [r2 + r3], xm4, 1 - pextrw [r2 + r3 + 4], xm6, 2 - pextrd [r2 + r3 * 2], xm4, 2 - pextrw [r2 + r3 * 2 + 4], xm6, 4 - pextrd [r2 + r6], xm4, 3 - pextrw [r2 + r6 + 4], xm6, 6 -%else - movq [r2], xm4 - movhps [r2 + r3], xm4 - movq [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm6 - vextracti128 xm4, m4, 1 - vextracti128 xm1, m6, 1 - movd [r2 + 8], xm4 - pextrd [r2 + r3 + 8], xm4, 2 - movd [r2 + r3 * 2 + 8], xm1 - pextrd [r2 + r6 + 8], xm1, 2 + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %3,ps + add r3d, r3d %endif - lea r2, [r2 + r3 * 4] - movu xm7, 
[r0 + r4] ; m7 = row 11 - punpckhwd xm1, xm5, xm7 - punpcklwd xm5, xm7 - vinserti128 m5, m5, xm1, 1 - pmaddwd m1, m5, [r5 + 1 * mmsize] - paddd m0, m1 - pmaddwd m5, [r5] - lea r0, [r0 + r1 * 4] - movu xm1, [r0] ; m1 = row 12 - punpckhwd xm4, xm7, xm1 - punpcklwd xm7, xm1 - vinserti128 m7, m7, xm4, 1 - pmaddwd m4, m7, [r5 + 1 * mmsize] - paddd m2, m4 - pmaddwd m7, [r5] -%ifidn %1,sp - paddd m0, m8 - paddd m2, m8 - psrad m0, 12 - psrad m2, 12 + + lea r6, [r3 * 3] + lea r11, [r1 * 4] + +%ifidn %3,pp + mova m14, [pw_512] %else - psrad m0, 6 - psrad m2, 6 + vbroadcasti128 m14, [pw_2000] %endif - packssdw m0, m2 - movu xm4, [r0 + r1] ; m4 = row 13 - punpckhwd xm2, xm1, xm4 - punpcklwd xm1, xm4 - vinserti128 m1, m1, xm2, 1 - pmaddwd m2, m1, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m1, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 14 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m7, m6 - pmaddwd m4, [r5] -%ifidn %1,sp - paddd m5, m8 - paddd m7, m8 - psrad m5, 12 - psrad m7, 12 + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp + add r2, 16 %else - psrad m5, 6 - psrad m7, 6 + add r2, 32 %endif - packssdw m5, m7 -%ifidn %1,sp - packuswb m0, m5 - vextracti128 xm5, m0, 1 - movd [r2], xm0 - pextrw [r2 + 4], xm5, 0 - pextrd [r2 + r3], xm0, 1 - pextrw [r2 + r3 + 4], xm5, 2 - pextrd [r2 + r3 * 2], xm0, 2 - pextrw [r2 + r3 * 2 + 4], xm5, 4 - pextrd [r2 + r6], xm0, 3 - pextrw [r2 + r6 + 4], xm5, 6 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 48] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 48] %else - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm5 - movhps [r2 + r6], xm5 - vextracti128 xm0, m0, 1 - vextracti128 xm7, m5, 1 - movd [r2 + 8], xm0 - pextrd [r2 + r3 + 8], xm0, 2 - movd [r2 + r3 * 2 + 8], xm7 - pextrd [r2 + r6 + 8], xm7, 2 + lea r2, [r8 + r3 * 4 - 96] %endif - lea r2, [r2 + r3 * 4] + dec r9d + jnz .loopH + RET +%endif 
+%endmacro - movu xm6, [r0 + r4] ; m6 = row 15 - punpckhwd xm5, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm5, 1 - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm0, [r0] ; m0 = row 16 - punpckhwd xm5, xm6, xm0 - punpcklwd xm6, xm0 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m6, [r5] -%ifidn %1,sp - paddd m1, m8 - paddd m4, m8 - psrad m1, 12 - psrad m4, 12 + FILTER_VER_LUMA_AVX2_64xN 64, 32, pp + FILTER_VER_LUMA_AVX2_64xN 64, 48, pp + FILTER_VER_LUMA_AVX2_64xN 64, 64, pp + FILTER_VER_LUMA_AVX2_64xN 64, 32, ps + FILTER_VER_LUMA_AVX2_64xN 64, 48, ps + FILTER_VER_LUMA_AVX2_64xN 64, 64, ps + +%macro FILTER_VER_LUMA_AVX2_64x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_64x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 %else - psrad m1, 6 - psrad m4, 6 + lea r5, [tab_LumaCoeffVer_32 + r4] %endif - packssdw m1, m4 - - movu xm5, [r0 + r1] ; m5 = row 17 - punpckhwd xm4, xm0, xm5 - punpcklwd xm0, xm5 - vinserti128 m0, m0, xm4, 1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m2, m0 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm0, xm5, xm4 - punpcklwd xm5, xm4 - vinserti128 m5, m5, xm0, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m6, m5 -%ifidn %1,sp - paddd m2, m8 - paddd m6, m8 - psrad m2, 12 - psrad m6, 12 + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,ps + add r3d, r3d +%endif + + lea r6, [r3 * 3] + +%ifidn %1,pp + mova m14, [pw_512] %else - psrad m2, 6 - psrad m6, 6 + vbroadcasti128 m14, [pw_2000] %endif - packssdw m2, m6 -%ifidn %1,sp - packuswb m1, m2 - vextracti128 xm2, m1, 1 - movd [r2], xm1 - pextrw [r2 + 4], xm2, 0 - pextrd [r2 + r3], xm1, 1 - pextrw [r2 + r3 + 4], xm2, 2 - pextrd [r2 + r3 * 2], xm1, 2 - pextrw [r2 + r3 * 2 + 4], xm2, 4 - pextrd [r2 + r6], xm1, 3 - pextrw [r2 + r6 + 4], xm2, 6 + + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 %else - 
movq [r2], xm1 - movhps [r2 + r3], xm1 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r6], xm2 - vextracti128 xm4, m1, 1 - vextracti128 xm6, m2, 1 - movd [r2 + 8], xm4 - pextrd [r2 + r3 + 8], xm4, 2 - movd [r2 + r3 * 2 + 8], xm6 - pextrd [r2 + r6 + 8], xm6, 2 + add r2, 32 %endif + add r0, 16 + dec r9d + jnz .loopW RET %endif %endmacro - FILTER_VER_CHROMA_S_AVX2_6x16 sp - FILTER_VER_CHROMA_S_AVX2_6x16 ss + FILTER_VER_LUMA_AVX2_64x16 pp + FILTER_VER_LUMA_AVX2_64x16 ps -;--------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W2_4R 2 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA 3 INIT_XMM sse4 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 - - add r1d, r1d +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps add r3d, r3d - sub r0, r1 - shl r4d, 5 +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + +%ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/8) +.loopW: + PROCESS_LUMA_W8_4R +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + 
psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 %else - lea r5, [tab_ChromaCoeffV + r4] + add r2, 16 %endif + dec r4d + jnz .loopW - mov r4d, (%2/4) - -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 - - psrad m0, 6 - psrad m2, 6 - - packssdw m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] + lea r0, [r0 + 4 * r1 - %1] +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - %1] +%else + lea r2, [r2 + 4 * r3 - 2 * %1] +%endif - dec r4d + dec dword [rsp] jnz .loopH RET %endmacro - FILTER_VER_CHROMA_SS_W2_4R 2, 4 - FILTER_VER_CHROMA_SS_W2_4R 2, 8 - - FILTER_VER_CHROMA_SS_W2_4R 2, 16 - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + FILTER_VER_LUMA 16, 4, pp + FILTER_VER_LUMA 16, 8, pp + FILTER_VER_LUMA 16, 12, pp + FILTER_VER_LUMA 16, 16, pp + FILTER_VER_LUMA 16, 32, pp + FILTER_VER_LUMA 16, 64, pp + FILTER_VER_LUMA 24, 32, pp + FILTER_VER_LUMA 32, 8, pp + FILTER_VER_LUMA 32, 16, pp + FILTER_VER_LUMA 32, 24, pp + FILTER_VER_LUMA 32, 32, pp + FILTER_VER_LUMA 32, 64, pp + FILTER_VER_LUMA 48, 64, pp + FILTER_VER_LUMA 64, 16, pp + FILTER_VER_LUMA 64, 32, pp + FILTER_VER_LUMA 64, 48, pp + FILTER_VER_LUMA 64, 64, pp -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif + FILTER_VER_LUMA 16, 4, ps + FILTER_VER_LUMA 16, 8, ps + 
FILTER_VER_LUMA 16, 12, ps + FILTER_VER_LUMA 16, 16, ps + FILTER_VER_LUMA 16, 32, ps + FILTER_VER_LUMA 16, 64, ps + FILTER_VER_LUMA 24, 32, ps + FILTER_VER_LUMA 32, 8, ps + FILTER_VER_LUMA 32, 16, ps + FILTER_VER_LUMA 32, 24, ps + FILTER_VER_LUMA 32, 32, ps + FILTER_VER_LUMA 32, 64, ps + FILTER_VER_LUMA 48, 64, ps + FILTER_VER_LUMA 64, 16, ps + FILTER_VER_LUMA 64, 32, ps + FILTER_VER_LUMA 64, 48, ps + FILTER_VER_LUMA 64, 64, ps +%macro PROCESS_LUMA_SP_W4_4R 0 movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 - - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - psrad m0, 6 + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1=[1+2+3+4] Row2 done - psrad m1, 6 + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 - packssdw m0, m1 + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 - movlps [r2], m0 - movhps [r2 + r3], m0 + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - RET + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 
3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end +%endmacro -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W6_H4 2 +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SP 2 INIT_XMM sse4 -cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeffV] + lea r5, [tab_LumaCoeffV] lea r6, [r5 + r4] %else - lea r6, [tab_ChromaCoeffV + r4] + lea r6, [tab_LumaCoeffV + r4] %endif - mov r4d, %2/4 + mova m7, [pd_526336] + mov dword [rsp], %2/4 .loopH: - PROCESS_CHROMA_SP_W4_4R + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_SP_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad 
m3, 12 packssdw m0, m1 packssdw m2, m3 - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - PROCESS_CHROMA_SP_W2_4R r6 - - psrad m0, 6 - psrad m2, 6 - - packssdw m0, m2 + packuswb m0, m2 movd [r2], m0 pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 2 * 4] + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] jnz .loopH RET %endmacro - FILTER_VER_CHROMA_SS_W6_H4 6, 8 - - FILTER_VER_CHROMA_SS_W6_H4 6, 16 - +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 -;---------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, 
int16_t *dst, intptr_t dstStride, int coeffIdx) -;---------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal filterPixelToShort_4x2, 3, 4, 3 + mov r3d, r3m + add r3d, r3d - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif + movd m0, [r0] + pinsrd m0, [r0 + r1], 1 + punpcklbw m0, m1 + pmaddubsw m0, m2 - mov r4d, %2/2 -.loopH: - PROCESS_CHROMA_SP_W8_2R + movq [r2 + r3 * 0], m0 + movhps [r2 + r3 * 1], m0 - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 + RET - packssdw m0, m1 - packssdw m2, m3 +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal filterPixelToShort_8x2, 3, 4, 3 + mov r3d, r3m + add r3d, r3d - movu [r2], m0 - movu [r2 + r3], m2 + ; load constant + mova m1, [pb_128] + mova m2, [tab_c_64_n64] - lea r2, [r2 + 2 * r3] + movh m0, [r0] + punpcklbw m0, m1 + pmaddubsw m0, m2 + movu [r2 + r3 * 0], m0 - dec r4d - jnz .loopH + movh m0, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m0, m2 + movu [r2 + r3 * 1], m0 RET -%endmacro - - FILTER_VER_CHROMA_SS_W8_H2 8, 2 - FILTER_VER_CHROMA_SS_W8_H2 8, 4 - FILTER_VER_CHROMA_SS_W8_H2 8, 6 - FILTER_VER_CHROMA_SS_W8_H2 8, 8 - FILTER_VER_CHROMA_SS_W8_H2 8, 16 - FILTER_VER_CHROMA_SS_W8_H2 8, 32 - - 
FILTER_VER_CHROMA_SS_W8_H2 8, 12 - FILTER_VER_CHROMA_SS_W8_H2 8, 64 ;----------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) diff --git a/source/common/x86/v4-ipfilter8.asm b/source/common/x86/v4-ipfilter8.asm new file mode 100644 index 0000000000..bfdc0fada1 --- /dev/null +++ b/source/common/x86/v4-ipfilter8.asm @@ -0,0 +1,12799 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Min Chen +;* Nabajit Deka +;* Praveen Kumar Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA 32 + +const v4_pd_526336, times 8 dd 8192*64+2048 + +const tab_Vm, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 + db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 + +const tab_Cm, db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 + +const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9 + times 2 db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13 + +const v4_interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15 + +const v4_interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 + dd 2, 3, 3, 4, 4, 5, 5, 6 + +const tab_ChromaCoeff, db 0, 64, 0, 0 + db -2, 58, 10, -2 + db -4, 54, 16, -2 + db -6, 46, 28, -4 + db -4, 36, 36, -4 + db -4, 28, 46, -6 + db -2, 16, 54, -4 + db -2, 10, 58, -2 + +const tabw_ChromaCoeff, dw 0, 64, 0, 0 + dw -2, 58, 10, -2 + dw -4, 54, 16, -2 + dw -6, 46, 28, -4 + dw -4, 36, 36, -4 + dw -4, 28, 46, -6 + dw -2, 16, 54, -4 + dw -2, 10, 58, -2 + +const tab_ChromaCoeffV, times 4 dw 0, 64 + times 4 dw 0, 0 + + times 4 dw -2, 58 + times 4 dw 10, -2 + + times 4 dw -4, 54 + times 4 dw 16, -2 + + times 4 dw -6, 46 + times 4 dw 28, -4 + + times 4 dw -4, 36 + times 4 dw 36, -4 + + times 4 dw -4, 28 + times 4 dw 46, -6 + + times 4 dw -2, 16 + times 4 dw 54, -4 + + times 4 dw -2, 10 + times 4 dw 58, -2 + +const tab_ChromaCoeff_V, times 8 db 0, 64 + times 8 db 0, 0 + + times 8 db -2, 58 + times 8 db 10, -2 + + times 8 db -4, 54 + times 8 db 16, -2 + + times 8 db -6, 46 + times 8 db 28, -4 + + times 8 db -4, 36 + times 8 db 36, -4 + + times 8 db -4, 28 + times 8 db 46, -6 + + times 8 db -2, 16 + times 8 db 54, -4 + + times 8 db -2, 10 + times 8 db 58, -2 + +const tab_ChromaCoeffVer_32, times 16 db 0, 64 + times 16 db 0, 0 + + times 16 db -2, 58 + times 16 db 10, -2 + + times 16 db -4, 54 + times 16 db 16, -2 + + times 16 db -6, 46 + times 16 db 28, -4 + + 
times 16 db -4, 36 + times 16 db 36, -4 + + times 16 db -4, 28 + times 16 db 46, -6 + + times 16 db -2, 16 + times 16 db 54, -4 + + times 16 db -2, 10 + times 16 db 58, -2 + +const pw_ChromaCoeffV, times 8 dw 0, 64 + times 8 dw 0, 0 + + times 8 dw -2, 58 + times 8 dw 10, -2 + + times 8 dw -4, 54 + times 8 dw 16, -2 + + times 8 dw -6, 46 + times 8 dw 28, -4 + + times 8 dw -4, 36 + times 8 dw 36, -4 + + times 8 dw -4, 28 + times 8 dw 46, -6 + + times 8 dw -2, 16 + times 8 dw 54, -4 + + times 8 dw -2, 10 + times 8 dw 58, -2 + +const v4_interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + +SECTION .text + +cextern pw_32 +cextern pw_512 +cextern pw_2000 + +%macro WORD_TO_DOUBLE 1 +%if ARCH_X86_64 + punpcklbw %1, m8 +%else + punpcklbw %1, %1 + psrlw %1, 8 +%endif +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W2_H4_sse2 2 +INIT_XMM sse2 +%if ARCH_X86_64 +cglobal interp_4tap_vert_%1_2x%2, 4, 6, 9 + pxor m8, m8 +%else +cglobal interp_4tap_vert_%1_2x%2, 4, 6, 8 +%endif + mov r4d, r4m + sub r0, r1 + +%ifidn %1,pp + mova m1, [pw_32] +%elifidn %1,ps + mova m1, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tabw_ChromaCoeff] + movh m0, [r5 + r4 * 8] +%else + movh m0, [tabw_ChromaCoeff + r4 * 8] +%endif + + punpcklqdq m0, m0 + lea r5, [3 * r1] + +%assign x 1 +%rep %2/4 + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklwd m2, m6 + + WORD_TO_DOUBLE m2 + pmaddwd m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklwd m3, m7 + + WORD_TO_DOUBLE m3 + pmaddwd m3, m0 + + packssdw m2, m3 + pshuflw m3, m2, q2301 + pshufhw m3, m3, q2301 + paddw m2, m3 + + movd m7, [r0 + r1] + + punpcklbw m4, m5 + 
punpcklbw m3, m6, m7 + punpcklwd m4, m3 + + WORD_TO_DOUBLE m4 + pmaddwd m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklwd m5, m7 + + WORD_TO_DOUBLE m5 + pmaddwd m5, m0 + + packssdw m4, m5 + pshuflw m5, m4, q2301 + pshufhw m5, m5, q2301 + paddw m4, m5 + +%ifidn %1,pp + psrld m2, 16 + psrld m4, 16 + packssdw m2, m4 + paddw m2, m1 + psraw m2, 6 + packuswb m2, m2 + +%if ARCH_X86_64 + movq r4, m2 + mov [r2], r4w + shr r4, 16 + mov [r2 + r3], r4w + lea r2, [r2 + 2 * r3] + shr r4, 16 + mov [r2], r4w + shr r4, 16 + mov [r2 + r3], r4w +%else + movd r4, m2 + mov [r2], r4w + shr r4, 16 + mov [r2 + r3], r4w + lea r2, [r2 + 2 * r3] + psrldq m2, 4 + movd r4, m2 + mov [r2], r4w + shr r4, 16 + mov [r2 + r3], r4w +%endif +%elifidn %1,ps + psrldq m2, 2 + psrldq m4, 2 + pshufd m2, m2, q3120 + pshufd m4, m4, q3120 + psubw m4, m1 + psubw m2, m1 + + movd [r2], m2 + psrldq m2, 4 + movd [r2 + r3], m2 + lea r2, [r2 + 2 * r3] + movd [r2], m4 + psrldq m4, 4 + movd [r2 + r3], m4 +%endif + +%if x < %2/4 + lea r2, [r2 + 2 * r3] +%endif +%assign x x+1 +%endrep + RET + +%endmacro + + FILTER_V4_W2_H4_sse2 pp, 4 + FILTER_V4_W2_H4_sse2 pp, 8 + FILTER_V4_W2_H4_sse2 pp, 16 + + FILTER_V4_W2_H4_sse2 ps, 4 + FILTER_V4_W2_H4_sse2 ps, 8 + FILTER_V4_W2_H4_sse2 ps, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V2_W4_H4_sse2 1 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 8 + mov r4d, r4m + sub r0, r1 + pxor m7, m7 + +%ifdef PIC + lea r5, [tabw_ChromaCoeff] + movh m0, [r5 + r4 * 8] +%else + movh m0, [tabw_ChromaCoeff + r4 * 8] +%endif + + lea r5, [r0 + 2 * r1] + punpcklqdq m0, m0 + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r5] + movd m5, [r5 + r1] + + punpcklbw m2, m3 + punpcklbw m1, m4, m5 + punpcklwd m2, m1 + + movhlps 
m6, m2 + punpcklbw m2, m7 + punpcklbw m6, m7 + pmaddwd m2, m0 + pmaddwd m6, m0 + packssdw m2, m6 + + movd m1, [r0 + 4 * r1] + + punpcklbw m3, m4 + punpcklbw m5, m1 + punpcklwd m3, m5 + + movhlps m6, m3 + punpcklbw m3, m7 + punpcklbw m6, m7 + pmaddwd m3, m0 + pmaddwd m6, m0 + packssdw m3, m6 + + pshuflw m4, m2, q2301 + pshufhw m4, m4, q2301 + paddw m2, m4 + pshuflw m5, m3, q2301 + pshufhw m5, m5, q2301 + paddw m3, m5 + +%ifidn %1, pp + psrld m2, 16 + psrld m3, 16 + packssdw m2, m3 + + paddw m2, [pw_32] + psraw m2, 6 + packuswb m2, m2 + + movd [r2], m2 + psrldq m2, 4 + movd [r2 + r3], m2 +%elifidn %1, ps + psrldq m2, 2 + psrldq m3, 2 + pshufd m2, m2, q3120 + pshufd m3, m3, q3120 + punpcklqdq m2, m3 + + add r3d, r3d + psubw m2, [pw_2000] + movh [r2], m2 + movhps [r2 + r3], m2 +%endif + RET + +%endmacro + + FILTER_V2_W4_H4_sse2 pp + FILTER_V2_W4_H4_sse2 ps + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W4_H4_sse2 2 +INIT_XMM sse2 +%if ARCH_X86_64 +cglobal interp_4tap_vert_%1_4x%2, 4, 6, 9 + pxor m8, m8 +%else +cglobal interp_4tap_vert_%1_4x%2, 4, 6, 8 +%endif + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tabw_ChromaCoeff] + movh m0, [r5 + r4 * 8] +%else + movh m0, [tabw_ChromaCoeff + r4 * 8] +%endif + +%ifidn %1,pp + mova m1, [pw_32] +%elifidn %1,ps + add r3d, r3d + mova m1, [pw_2000] +%endif + + lea r5, [3 * r1] + lea r4, [3 * r3] + punpcklqdq m0, m0 + +%assign x 1 +%rep %2/4 + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklwd m2, m6 + + movhlps m6, m2 + WORD_TO_DOUBLE m2 + WORD_TO_DOUBLE m6 + pmaddwd m2, m0 + pmaddwd m6, m0 + packssdw m2, m6 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + 
punpcklwd m3, m7 + + movhlps m7, m3 + WORD_TO_DOUBLE m3 + WORD_TO_DOUBLE m7 + pmaddwd m3, m0 + pmaddwd m7, m0 + packssdw m3, m7 + + pshuflw m7, m2, q2301 + pshufhw m7, m7, q2301 + paddw m2, m7 + pshuflw m7, m3, q2301 + pshufhw m7, m7, q2301 + paddw m3, m7 + +%ifidn %1,pp + psrld m2, 16 + psrld m3, 16 + packssdw m2, m3 + paddw m2, m1 + psraw m2, 6 +%elifidn %1,ps + psrldq m2, 2 + psrldq m3, 2 + pshufd m2, m2, q3120 + pshufd m3, m3, q3120 + punpcklqdq m2, m3 + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 +%endif + + movd m7, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklwd m4, m3 + + movhlps m3, m4 + WORD_TO_DOUBLE m4 + WORD_TO_DOUBLE m3 + pmaddwd m4, m0 + pmaddwd m3, m0 + packssdw m4, m3 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklwd m5, m7 + + movhlps m3, m5 + WORD_TO_DOUBLE m5 + WORD_TO_DOUBLE m3 + pmaddwd m5, m0 + pmaddwd m3, m0 + packssdw m5, m3 + + pshuflw m7, m4, q2301 + pshufhw m7, m7, q2301 + paddw m4, m7 + pshuflw m7, m5, q2301 + pshufhw m7, m7, q2301 + paddw m5, m7 + +%ifidn %1,pp + psrld m4, 16 + psrld m5, 16 + packssdw m4, m5 + + paddw m4, m1 + psraw m4, 6 + packuswb m2, m4 + + movd [r2], m2 + psrldq m2, 4 + movd [r2 + r3], m2 + psrldq m2, 4 + movd [r2 + 2 * r3], m2 + psrldq m2, 4 + movd [r2 + r4], m2 +%elifidn %1,ps + psrldq m4, 2 + psrldq m5, 2 + pshufd m4, m4, q3120 + pshufd m5, m5, q3120 + punpcklqdq m4, m5 + psubw m4, m1 + movh [r2 + 2 * r3], m4 + movhps [r2 + r4], m4 +%endif + +%if x < %2/4 + lea r2, [r2 + 4 * r3] +%endif + +%assign x x+1 +%endrep + RET + +%endmacro + + FILTER_V4_W4_H4_sse2 pp, 4 + FILTER_V4_W4_H4_sse2 pp, 8 + FILTER_V4_W4_H4_sse2 pp, 16 + FILTER_V4_W4_H4_sse2 pp, 32 + + FILTER_V4_W4_H4_sse2 ps, 4 + FILTER_V4_W4_H4_sse2 ps, 8 + FILTER_V4_W4_H4_sse2 ps, 16 + FILTER_V4_W4_H4_sse2 ps, 32 + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_%1_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;----------------------------------------------------------------------------- +%macro FILTER_V4_W6_H4_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_6x%2, 4, 7, 10 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m6, [r5 + r4] + mova m5, [r5 + r4 + 16] +%else + mova m6, [tab_ChromaCoeffV + r4] + mova m5, [tab_ChromaCoeffV + r4 + 16] +%endif + +%ifidn %1,pp + mova m4, [pw_32] +%elifidn %1,ps + mova m4, [pw_2000] + add r3d, r3d +%endif + lea r5, [3 * r1] + +%assign x 1 +%rep %2/4 + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + movhlps m7, m0 + punpcklbw m0, m9 + punpcklbw m7, m9 + pmaddwd m0, m6 + pmaddwd m7, m6 + packssdw m0, m7 + + movhlps m8, m2 + movq m7, m2 + punpcklbw m8, m9 + punpcklbw m7, m9 + pmaddwd m8, m5 + pmaddwd m7, m5 + packssdw m7, m8 + + paddw m0, m7 + +%ifidn %1,pp + paddw m0, m4 + psraw m0, 6 + packuswb m0, m0 + + movd [r2], m0 + pextrw r6d, m0, 2 + mov [r2 + 4], r6w +%elifidn %1,ps + psubw m0, m4 + movh [r2], m0 + pshufd m0, m0, 2 + movd [r2 + 8], m0 +%endif + + lea r0, [r0 + 4 * r1] + + movq m0, [r0] + punpcklbw m3, m0 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m6 + pmaddwd m8, m6 + packssdw m1, m8 + + movhlps m8, m3 + movq m7, m3 + punpcklbw m8, m9 + punpcklbw m7, m9 + pmaddwd m8, m5 + pmaddwd m7, m5 + packssdw m7, m8 + + paddw m1, m7 + +%ifidn %1,pp + paddw m1, m4 + psraw m1, 6 + packuswb m1, m1 + + movd [r2 + r3], m1 + pextrw r6d, m1, 2 + mov [r2 + r3 + 4], r6w +%elifidn %1,ps + psubw m1, m4 + movh [r2 + r3], m1 + pshufd m1, m1, 2 + movd [r2 + r3 + 8], m1 +%endif + + movq m1, [r0 + r1] + punpcklbw m7, m0, m1 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m6 + pmaddwd m8, m6 + packssdw m2, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + pmaddwd m8, m5 + packssdw m7, m8 + + paddw m2, m7 + lea r2, [r2 + 2 * 
r3] + +%ifidn %1,pp + paddw m2, m4 + psraw m2, 6 + packuswb m2, m2 + movd [r2], m2 + pextrw r6d, m2, 2 + mov [r2 + 4], r6w +%elifidn %1,ps + psubw m2, m4 + movh [r2], m2 + pshufd m2, m2, 2 + movd [r2 + 8], m2 +%endif + + movq m2, [r0 + 2 * r1] + punpcklbw m1, m2 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m6 + pmaddwd m8, m6 + packssdw m3, m8 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m5 + pmaddwd m8, m5 + packssdw m1, m8 + + paddw m3, m1 + +%ifidn %1,pp + paddw m3, m4 + psraw m3, 6 + packuswb m3, m3 + + movd [r2 + r3], m3 + pextrw r6d, m3, 2 + mov [r2 + r3 + 4], r6w +%elifidn %1,ps + psubw m3, m4 + movh [r2 + r3], m3 + pshufd m3, m3, 2 + movd [r2 + r3 + 8], m3 +%endif + +%if x < %2/4 + lea r2, [r2 + 2 * r3] +%endif + +%assign x x+1 +%endrep + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W6_H4_sse2 pp, 8 + FILTER_V4_W6_H4_sse2 pp, 16 + FILTER_V4_W6_H4_sse2 ps, 8 + FILTER_V4_W6_H4_sse2 ps, 16 +%endif + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 12 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m4, [pw_32] +%elifidn %1,ps + mova m4, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r6, [tab_ChromaCoeffV] + mova m6, [r6 + r4] + mova m5, [r6 + r4 + 16] +%else + mova m6, [tab_ChromaCoeffV + r4] + mova m5, [tab_ChromaCoeffV + r4 + 16] +%endif + + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + lea r5, [r0 + 2 * r1] + movq m3, [r5 + r1] + + punpcklbw m0, m1 + punpcklbw m7, m2, m3 + + movhlps m8, m0 + punpcklbw m0, m9 + punpcklbw m8, m9 + pmaddwd m0, m6 + pmaddwd m8, m6 + packssdw m0, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + 
pmaddwd m8, m5 + packssdw m7, m8 + + paddw m0, m7 + +%ifidn %1,pp + paddw m0, m4 + psraw m0, 6 +%elifidn %1,ps + psubw m0, m4 + movu [r2], m0 +%endif + + movq m11, [r0 + 4 * r1] + + punpcklbw m1, m2 + punpcklbw m7, m3, m11 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m6 + pmaddwd m8, m6 + packssdw m1, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + pmaddwd m8, m5 + packssdw m7, m8 + + paddw m1, m7 + +%ifidn %1,pp + paddw m1, m4 + psraw m1, 6 + packuswb m1, m0 + + movhps [r2], m1 + movh [r2 + r3], m1 +%elifidn %1,ps + psubw m1, m4 + movu [r2 + r3], m1 +%endif +%if %2 == 2 ;end of 8x2 + RET + +%else + lea r6, [r0 + 4 * r1] + movq m1, [r6 + r1] + + punpcklbw m2, m3 + punpcklbw m7, m11, m1 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m6 + pmaddwd m8, m6 + packssdw m2, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + pmaddwd m8, m5 + packssdw m7, m8 + + paddw m2, m7 + +%ifidn %1,pp + paddw m2, m4 + psraw m2, 6 +%elifidn %1,ps + psubw m2, m4 + movu [r2 + 2 * r3], m2 +%endif + + movq m10, [r6 + 2 * r1] + + punpcklbw m3, m11 + punpcklbw m7, m1, m10 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m6 + pmaddwd m8, m6 + packssdw m3, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + pmaddwd m8, m5 + packssdw m7, m8 + + paddw m3, m7 + lea r5, [r2 + 2 * r3] + +%ifidn %1,pp + paddw m3, m4 + psraw m3, 6 + packuswb m3, m2 + + movhps [r2 + 2 * r3], m3 + movh [r5 + r3], m3 +%elifidn %1,ps + psubw m3, m4 + movu [r5 + r3], m3 +%endif +%if %2 == 4 ;end of 8x4 + RET + +%else + lea r6, [r6 + 2 * r1] + movq m3, [r6 + r1] + + punpcklbw m11, m1 + punpcklbw m7, m10, m3 + + movhlps m8, m11 + punpcklbw m11, m9 + punpcklbw m8, m9 + pmaddwd m11, m6 + pmaddwd m8, m6 + packssdw m11, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m5 + pmaddwd m8, m5 + packssdw m7, m8 + + paddw m11, m7 + +%ifidn %1, pp + paddw 
m11, m4 + psraw m11, 6 +%elifidn %1,ps + psubw m11, m4 + movu [r2 + 4 * r3], m11 +%endif + + movq m7, [r0 + 8 * r1] + + punpcklbw m1, m10 + punpcklbw m3, m7 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m6 + pmaddwd m8, m6 + packssdw m1, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m5 + pmaddwd m8, m5 + packssdw m3, m8 + + paddw m1, m3 + lea r5, [r2 + 4 * r3] + +%ifidn %1,pp + paddw m1, m4 + psraw m1, 6 + packuswb m1, m11 + + movhps [r2 + 4 * r3], m1 + movh [r5 + r3], m1 +%elifidn %1,ps + psubw m1, m4 + movu [r5 + r3], m1 +%endif +%if %2 == 6 + RET + +%else + %error INVALID macro argument, only 2, 4 or 6! +%endif +%endif +%endif +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W8_sse2 pp, 2 + FILTER_V4_W8_sse2 pp, 4 + FILTER_V4_W8_sse2 pp, 6 + FILTER_V4_W8_sse2 ps, 2 + FILTER_V4_W8_sse2 ps, 4 + FILTER_V4_W8_sse2 ps, 6 +%endif + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8_H8_H16_H32_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_8x%2, 4, 6, 11 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m6, [r5 + r4] + mova m5, [r5 + r4 + 16] +%else + mova m6, [tab_ChromaCoeffV + r4] + mova m5, [tab_ChromaCoeffV + r4 + 16] +%endif + +%ifidn %1,pp + mova m4, [pw_32] +%elifidn %1,ps + mova m4, [pw_2000] + add r3d, r3d +%endif + + lea r5, [r1 * 3] + +%assign x 1 +%rep %2/4 + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + movhlps m7, m0 + punpcklbw m0, m9 + punpcklbw m7, m9 + pmaddwd m0, m6 + pmaddwd m7, m6 + packssdw m0, m7 + + movhlps m8, m2 + movq m7, m2 + punpcklbw m8, m9 + punpcklbw m7, m9 + pmaddwd m8, m5 + pmaddwd m7, m5 + packssdw 
m7, m8 + + paddw m0, m7 + +%ifidn %1,pp + paddw m0, m4 + psraw m0, 6 +%elifidn %1,ps + psubw m0, m4 + movu [r2], m0 +%endif + + lea r0, [r0 + 4 * r1] + movq m10, [r0] + punpcklbw m3, m10 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m6 + pmaddwd m8, m6 + packssdw m1, m8 + + movhlps m8, m3 + movq m7, m3 + punpcklbw m8, m9 + punpcklbw m7, m9 + pmaddwd m8, m5 + pmaddwd m7, m5 + packssdw m7, m8 + + paddw m1, m7 + +%ifidn %1,pp + paddw m1, m4 + psraw m1, 6 + + packuswb m0, m1 + movh [r2], m0 + movhps [r2 + r3], m0 +%elifidn %1,ps + psubw m1, m4 + movu [r2 + r3], m1 +%endif + + movq m1, [r0 + r1] + punpcklbw m10, m1 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m6 + pmaddwd m8, m6 + packssdw m2, m8 + + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m5 + pmaddwd m8, m5 + packssdw m10, m8 + + paddw m2, m10 + lea r2, [r2 + 2 * r3] + +%ifidn %1,pp + paddw m2, m4 + psraw m2, 6 +%elifidn %1,ps + psubw m2, m4 + movu [r2], m2 +%endif + + movq m7, [r0 + 2 * r1] + punpcklbw m1, m7 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m6 + pmaddwd m8, m6 + packssdw m3, m8 + + movhlps m8, m1 + punpcklbw m1, m9 + punpcklbw m8, m9 + pmaddwd m1, m5 + pmaddwd m8, m5 + packssdw m1, m8 + + paddw m3, m1 + +%ifidn %1,pp + paddw m3, m4 + psraw m3, 6 + + packuswb m2, m3 + movh [r2], m2 + movhps [r2 + r3], m2 +%elifidn %1,ps + psubw m3, m4 + movu [r2 + r3], m3 +%endif + +%if x < %2/4 + lea r2, [r2 + 2 * r3] +%endif +%endrep + RET +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W8_H8_H16_H32_sse2 pp, 8 + FILTER_V4_W8_H8_H16_H32_sse2 pp, 16 + FILTER_V4_W8_H8_H16_H32_sse2 pp, 32 + + FILTER_V4_W8_H8_H16_H32_sse2 pp, 12 + FILTER_V4_W8_H8_H16_H32_sse2 pp, 64 + + FILTER_V4_W8_H8_H16_H32_sse2 ps, 8 + FILTER_V4_W8_H8_H16_H32_sse2 ps, 16 + FILTER_V4_W8_H8_H16_H32_sse2 ps, 32 + + FILTER_V4_W8_H8_H16_H32_sse2 ps, 12 + FILTER_V4_W8_H8_H16_H32_sse2 ps, 64 +%endif + 
+;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W12_H2_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_12x%2, 4, 6, 11 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m6, [pw_32] +%elifidn %1,ps + mova m6, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m1, [r5 + r4] + mova m0, [r5 + r4 + 16] +%else + mova m1, [tab_ChromaCoeffV + r4] + mova m0, [tab_ChromaCoeffV + r4 + 16] +%endif + +%assign x 1 +%rep %2/2 + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m10, m5, m7 + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + + paddw m4, m10 + + punpckhbw m10, m5, m7 + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + + paddw m2, m10 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m2, m6 + psraw m2, 6 + + packuswb m4, m2 + movh [r2], m4 + psrldq m4, 8 + movd [r2 + 8], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m2, m6 + movu [r2], m4 + movh [r2 + 16], m2 +%endif + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m4 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m1 + pmaddwd m8, m1 + packssdw m3, m8 + + movu m5, [r0 + 2 * r1] + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + movhlps m8, m2 + punpcklbw m2, m9 + 
punpcklbw m8, m9 + pmaddwd m2, m0 + pmaddwd m8, m0 + packssdw m2, m8 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + + paddw m4, m2 + paddw m3, m7 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m3, m6 + psraw m3, 6 + + packuswb m4, m3 + movh [r2 + r3], m4 + psrldq m4, 8 + movd [r2 + r3 + 8], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m3, m6 + movu [r2 + r3], m4 + movh [r2 + r3 + 16], m3 +%endif + +%if x < %2/2 + lea r2, [r2 + 2 * r3] +%endif +%assign x x+1 +%endrep + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W12_H2_sse2 pp, 16 + FILTER_V4_W12_H2_sse2 pp, 32 + FILTER_V4_W12_H2_sse2 ps, 16 + FILTER_V4_W12_H2_sse2 ps, 32 +%endif + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16_H2_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_16x%2, 4, 6, 11 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m6, [pw_32] +%elifidn %1,ps + mova m6, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m1, [r5 + r4] + mova m0, [r5 + r4 + 16] +%else + mova m1, [tab_ChromaCoeffV + r4] + mova m0, [tab_ChromaCoeffV + r4 + 16] +%endif + +%assign x 1 +%rep %2/2 + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m10, [r0 + r1] + + punpckhbw m7, m5, m10 + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + paddw m2, m7 + + punpcklbw m7, m5, m10 + movhlps m8, m7 + punpcklbw m7, 
m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + paddw m4, m7 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m2, m6 + psraw m2, 6 + + packuswb m4, m2 + movu [r2], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m2, m6 + movu [r2], m4 + movu [r2 + 16], m2 +%endif + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m1 + pmaddwd m8, m1 + packssdw m3, m8 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m10, m5 + punpckhbw m10, m5 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m0 + pmaddwd m8, m0 + packssdw m2, m8 + + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + + paddw m4, m2 + paddw m3, m10 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m3, m6 + psraw m3, 6 + + packuswb m4, m3 + movu [r2 + r3], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m3, m6 + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 +%endif + +%if x < %2/2 + lea r2, [r2 + 2 * r3] +%endif +%assign x x+1 +%endrep + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W16_H2_sse2 pp, 4 + FILTER_V4_W16_H2_sse2 pp, 8 + FILTER_V4_W16_H2_sse2 pp, 12 + FILTER_V4_W16_H2_sse2 pp, 16 + FILTER_V4_W16_H2_sse2 pp, 32 + + FILTER_V4_W16_H2_sse2 pp, 24 + FILTER_V4_W16_H2_sse2 pp, 64 + + FILTER_V4_W16_H2_sse2 ps, 4 + FILTER_V4_W16_H2_sse2 ps, 8 + FILTER_V4_W16_H2_sse2 ps, 12 + FILTER_V4_W16_H2_sse2 ps, 16 + FILTER_V4_W16_H2_sse2 ps, 32 + + FILTER_V4_W16_H2_sse2 ps, 24 + FILTER_V4_W16_H2_sse2 ps, 64 +%endif + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_%1_24%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W24_sse2 2 +INIT_XMM sse2 +cglobal 
interp_4tap_vert_%1_24x%2, 4, 6, 11 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m6, [pw_32] +%elifidn %1,ps + mova m6, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m1, [r5 + r4] + mova m0, [r5 + r4 + 16] +%else + mova m1, [tab_ChromaCoeffV + r4] + mova m0, [tab_ChromaCoeffV + r4 + 16] +%endif + +%assign x 1 +%rep %2/2 + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m10, [r5 + r1] + punpcklbw m7, m5, m10 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + paddw m4, m7 + + punpckhbw m7, m5, m10 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + + paddw m2, m7 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m2, m6 + psraw m2, 6 + + packuswb m4, m2 + movu [r2], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m2, m6 + movu [r2], m4 + movu [r2 + 16], m2 +%endif + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m1 + pmaddwd m8, m1 + packssdw m3, m8 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m10, m2 + punpckhbw m10, m2 + + movhlps m8, m5 + punpcklbw m5, m9 + punpcklbw m8, m9 + pmaddwd m5, m0 + pmaddwd m8, m0 + packssdw m5, m8 + + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + + paddw m4, m5 + paddw m3, m10 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m3, m6 + psraw m3, 6 + + packuswb m4, m3 + movu [r2 + r3], m4 +%elifidn %1,ps + psubw m4, m6 + psubw 
m3, m6 + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 +%endif + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + + punpcklbw m2, m3 + punpcklbw m4, m5 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m0 + pmaddwd m8, m0 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + paddw m2, m4 + +%ifidn %1,pp + paddw m2, m6 + psraw m2, 6 +%elifidn %1,ps + psubw m2, m6 + movu [r2 + 32], m2 +%endif + + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + movq m7, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m7 + + movhlps m8, m5 + punpcklbw m5, m9 + punpcklbw m8, m9 + pmaddwd m5, m0 + pmaddwd m8, m0 + packssdw m5, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m1 + pmaddwd m8, m1 + packssdw m3, m8 + + paddw m3, m5 + +%ifidn %1,pp + paddw m3, m6 + psraw m3, 6 + + packuswb m2, m3 + movh [r2 + 16], m2 + movhps [r2 + r3 + 16], m2 +%elifidn %1,ps + psubw m3, m6 + movu [r2 + r3 + 32], m3 +%endif + +%if x < %2/2 + mov r0, r5 + lea r2, [r2 + 2 * r3] +%endif +%assign x x+1 +%endrep + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W24_sse2 pp, 32 + FILTER_V4_W24_sse2 pp, 64 + FILTER_V4_W24_sse2 ps, 32 + FILTER_V4_W24_sse2 ps, 64 +%endif + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W32_sse2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_32x%2, 4, 6, 10 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m6, [pw_32] +%elifidn %1,ps + mova m6, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m1, [r5 + r4] + mova m0, [r5 + r4 + 16] +%else + mova m1, [tab_ChromaCoeffV + r4] + mova m0, 
[tab_ChromaCoeffV + r4 + 16] +%endif + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] + + punpcklbw m7, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m0 + pmaddwd m8, m0 + packssdw m3, m8 + + paddw m4, m7 + paddw m2, m3 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m2, m6 + psraw m2, 6 + + packuswb m4, m2 + movu [r2], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m2, m6 + movu [r2], m4 + movu [r2 + 16], m2 +%endif + + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] + + punpcklbw m7, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m7 + punpcklbw m7, m9 + punpcklbw m8, m9 + pmaddwd m7, m0 + pmaddwd m8, m0 + packssdw m7, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m0 + pmaddwd m8, m0 + packssdw m3, m8 + + paddw m4, m7 + paddw m2, m3 + +%ifidn %1,pp + paddw m4, m6 + psraw m4, 6 + paddw m2, m6 + psraw m2, 6 + + packuswb m4, m2 + movu [r2 + 16], m4 +%elifidn %1,ps + psubw m4, m6 + psubw m2, m6 + movu [r2 + 32], m4 + movu [r2 + 48], m2 +%endif + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + dec r4 + jnz .loop + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W32_sse2 pp, 8 + FILTER_V4_W32_sse2 pp, 16 + FILTER_V4_W32_sse2 pp, 24 + FILTER_V4_W32_sse2 pp, 32 + 
+ FILTER_V4_W32_sse2 pp, 48 + FILTER_V4_W32_sse2 pp, 64 + + FILTER_V4_W32_sse2 ps, 8 + FILTER_V4_W32_sse2 ps, 16 + FILTER_V4_W32_sse2 ps, 24 + FILTER_V4_W32_sse2 ps, 32 + + FILTER_V4_W32_sse2 ps, 48 + FILTER_V4_W32_sse2 ps, 64 +%endif + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_%1_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16n_H2_sse2 3 +INIT_XMM sse2 +cglobal interp_4tap_vert_%1_%2x%3, 4, 7, 11 + mov r4d, r4m + sub r0, r1 + shl r4d, 5 + pxor m9, m9 + +%ifidn %1,pp + mova m7, [pw_32] +%elifidn %1,ps + mova m7, [pw_2000] + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + mova m1, [r5 + r4] + mova m0, [r5 + r4 + 16] +%else + mova m1, [tab_ChromaCoeffV + r4] + mova m0, [tab_ChromaCoeffV + r4 + 16] +%endif + + mov r4d, %3/2 + +.loop: + + mov r6d, %2/16 + +.loopW: + + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + packssdw m4, m8 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m1 + pmaddwd m8, m1 + packssdw m2, m8 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m6, [r5 + r1] + + punpckhbw m10, m5, m6 + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + paddw m2, m10 + + punpcklbw m10, m5, m6 + movhlps m8, m10 + punpcklbw m10, m9 + punpcklbw m8, m9 + pmaddwd m10, m0 + pmaddwd m8, m0 + packssdw m10, m8 + paddw m4, m10 + +%ifidn %1,pp + paddw m4, m7 + psraw m4, 6 + paddw m2, m7 + psraw m2, 6 + + packuswb m4, m2 + movu [r2], m4 +%elifidn %1,ps + psubw m4, m7 + psubw m2, m7 + movu [r2], m4 + movu [r2 + 16], m2 +%endif + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + movhlps m8, m4 + punpcklbw m4, m9 + punpcklbw m8, m9 + pmaddwd m4, m1 + pmaddwd m8, m1 + 
packssdw m4, m8 + + movhlps m8, m3 + punpcklbw m3, m9 + punpcklbw m8, m9 + pmaddwd m3, m1 + pmaddwd m8, m1 + packssdw m3, m8 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m6, m5 + punpckhbw m6, m5 + + movhlps m8, m2 + punpcklbw m2, m9 + punpcklbw m8, m9 + pmaddwd m2, m0 + pmaddwd m8, m0 + packssdw m2, m8 + + movhlps m8, m6 + punpcklbw m6, m9 + punpcklbw m8, m9 + pmaddwd m6, m0 + pmaddwd m8, m0 + packssdw m6, m8 + + paddw m4, m2 + paddw m3, m6 + +%ifidn %1,pp + paddw m4, m7 + psraw m4, 6 + paddw m3, m7 + psraw m3, 6 + + packuswb m4, m3 + movu [r2 + r3], m4 + add r2, 16 +%elifidn %1,ps + psubw m4, m7 + psubw m3, m7 + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + add r2, 32 +%endif + + add r0, 16 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %2] + +%ifidn %1,pp + lea r2, [r2 + r3 * 2 - %2] +%elifidn %1,ps + lea r2, [r2 + r3 * 2 - (%2 * 2)] +%endif + + dec r4d + jnz .loop + RET + +%endmacro + +%if ARCH_X86_64 + FILTER_V4_W16n_H2_sse2 pp, 64, 64 + FILTER_V4_W16n_H2_sse2 pp, 64, 32 + FILTER_V4_W16n_H2_sse2 pp, 64, 48 + FILTER_V4_W16n_H2_sse2 pp, 48, 64 + FILTER_V4_W16n_H2_sse2 pp, 64, 16 + FILTER_V4_W16n_H2_sse2 ps, 64, 64 + FILTER_V4_W16n_H2_sse2 ps, 64, 32 + FILTER_V4_W16n_H2_sse2 ps, 64, 48 + FILTER_V4_W16n_H2_sse2 ps, 48, 64 + FILTER_V4_W16n_H2_sse2 ps, 64, 16 +%endif + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x4, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + lea r4, [r1 * 3] + lea r5, [r0 + 4 * r1] + pshufb m0, [tab_Cm] + mova m1, [pw_512] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, 
m6 + + pmaddubsw m2, m0 + + movd m6, [r5] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + pmulhrsw m2, m1 + + movd m7, [r5 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r5 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + pmulhrsw m4, m1 + packuswb m2, m4 + + pextrw [r2], m2, 0 + pextrw [r2 + r3], m2, 2 + lea r2, [r2 + 2 * r3] + pextrw [r2], m2, 4 + pextrw [r2 + r3], m2, 6 + + RET + +%macro FILTER_VER_CHROMA_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 + mov r4d, r4m + shl r4d, 5 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff_V] + add r5, r4 +%else + lea r5, [tab_ChromaCoeff_V + r4] +%endif + + lea r4, [r1 * 3] + + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + + pshufb xm0, xm1, [interp_vert_shuf] + pshufb xm1, [interp_vert_shuf + 32] + vinserti128 m0, m0, xm1, 1 + pmaddubsw m0, [r5] + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +%ifidn %1,pp + pmulhrsw xm0, [pw_512] + packuswb xm0, xm0 + lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 +%else + add r3d, r3d + lea r4, [r3 * 3] + psubw xm0, [pw_2000] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_2x4 pp + FILTER_VER_CHROMA_AVX2_2x4 ps + +%macro FILTER_VER_CHROMA_AVX2_2x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 
* 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + movhlps xm0, xm1 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + vinserti128 m1, m1, xm0, 1 + + pshufb m0, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m0, m1 +%ifidn %1,pp + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + r3 * 2], xm0, 6 + pextrw [r2 + r4], xm0, 7 +%else + add r3d, r3d + lea r4, [r3 * 3] + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_2x8 pp + FILTER_VER_CHROMA_AVX2_2x8 ps + +%macro FILTER_VER_CHROMA_AVX2_2x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + pinsrw xm0, [r0 + r4], 7 + punpckhqdq xm0, xm1, xm0 + vinserti128 m1, m1, xm0, 1 + + pshufb m2, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m2, 
[r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 + + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + punpckhqdq xm1, xm0, xm1 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + punpckhqdq xm0, xm1, xm0 + vinserti128 m1, m1, xm0, 1 + + pshufb m0, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m0, m1 +%ifidn %1,pp + mova m1, [pw_512] + pmulhrsw m2, m1 + pmulhrsw m0, m1 + packuswb m2, m0 + lea r4, [r3 * 3] + pextrw [r2], xm2, 0 + pextrw [r2 + r3], xm2, 1 + pextrw [r2 + r3 * 2], xm2, 2 + pextrw [r2 + r4], xm2, 3 + vextracti128 xm0, m2, 1 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm2, 4 + pextrw [r2 + r3], xm2, 5 + pextrw [r2 + r3 * 2], xm2, 6 + pextrw [r2 + r4], xm2, 7 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + r3 * 2], xm0, 6 + pextrw [r2 + r4], xm0, 7 +%else + add r3d, r3d + lea r4, [r3 * 3] + vbroadcasti128 m1, [pw_2000] + psubw m2, m1 + psubw m0, m1 + vextracti128 xm1, m2, 1 + movd [r2], xm2 + pextrd [r2 + r3], xm2, 1 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 + vextracti128 xm1, m0, 1 + lea r2, [r2 + r3 * 4] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_2x16 pp + FILTER_VER_CHROMA_AVX2_2x16 ps + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_2x8(pixel 
*src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W2_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_512] + + mov r4d, %2 + lea r5, [3 * r1] + +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + pmulhrsw m2, m1 + + movd m7, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + pmulhrsw m4, m1 + packuswb m2, m4 + + pextrw [r2], m2, 0 + pextrw [r2 + r3], m2, 2 + lea r2, [r2 + 2 * r3] + pextrw [r2], m2, 4 + pextrw [r2 + r3], m2, 6 + + lea r2, [r2 + 2 * r3] + + sub r4, 4 + jnz .loop + RET +%endmacro + + FILTER_V4_W2_H4 2, 8 + FILTER_V4_W2_H4 2, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + lea r5, [r0 + 2 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r5] + movd m5, [r5 + r1] + + punpcklbw m2, m3 + punpcklbw m1, m4, m5 + punpcklbw m2, m1 + + pmaddubsw m2, 
m0 + + movd m1, [r0 + 4 * r1] + + punpcklbw m3, m4 + punpcklbw m5, m1 + punpcklbw m3, m5 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + pmulhrsw m2, [pw_512] + packuswb m2, m2 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 + + RET + +%macro FILTER_VER_CHROMA_AVX2_4x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 + mov r4d, r4m + shl r4d, 5 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff_V] + add r5, r4 +%else + lea r5, [tab_ChromaCoeff_V + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + movd xm2, [r0 + r1] + punpcklbw xm1, xm2 + movd xm3, [r0 + r1 * 2] + punpcklbw xm2, xm3 + movlhps xm1, xm2 + movd xm0, [r0 + r4] + punpcklbw xm3, xm0 + movd xm2, [r0 + r1 * 4] + punpcklbw xm0, xm2 + movlhps xm3, xm0 + vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] + + pmaddubsw m1, [r5] + vextracti128 xm3, m1, 1 + paddw xm1, xm3 +%ifidn %1,pp + pmulhrsw xm1, [pw_512] + packuswb xm1, xm1 + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 +%else + add r3d, r3d + psubw xm1, [pw_2000] + movq [r2], xm1 + movhps [r2 + r3], xm1 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_4x2 pp + FILTER_VER_CHROMA_AVX2_4x2 ps + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + mova m1, [pw_512] + lea r5, [r0 + 4 * r1] + lea r4, [r1 * 3] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + movd m6, [r5] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + pmulhrsw m2, 
m1 + + movd m7, [r5 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r5 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + pmulhrsw m4, m1 + + packuswb m2, m4 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m2, 2 + pextrd [r2 + r3], m2, 3 + RET + +%macro FILTER_VER_CHROMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 3 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] + mova m2, [v4_interp4_vpp_shuf1] + vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] + mova m2, [v4_interp4_vpp_shuf1 + mmsize] + vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] + + mova m2, [v4_interp4_vpp_shuf] + pshufb m0, m0, m2 + pshufb m1, m1, m2 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] +%ifidn %1,pp + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r5], xm0, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 +%endif + RET +%endmacro + FILTER_VER_CHROMA_AVX2_4x4 pp + FILTER_VER_CHROMA_AVX2_4x4 ps + +%macro FILTER_VER_CHROMA_AVX2_4x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x8, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, 
[tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [v4_interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [v4_interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] + + mova m3, [v4_interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m2, m2, m3 + pshufb m4, m4, m3 + pmaddubsw m0, [r5] + pmaddubsw m4, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m2, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] +%ifidn %1,pp + pmulhrsw m0, [pw_512] + pmulhrsw m4, [pw_512] + packuswb m0, m4 + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + psubw m4, [pw_2000] + vextracti128 xm1, m0, 1 + vextracti128 xm2, m4, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_4x8 pp + FILTER_VER_CHROMA_AVX2_4x8 ps + +%macro FILTER_VER_CHROMA_AVX2_4xN 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 
+cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + mova m10, [r5] + mova m11, [r5 + mmsize] +%ifidn %1,pp + mova m9, [pw_512] +%else + add r3d, r3d + mova m9, [pw_2000] +%endif + lea r5, [r3 * 3] +%rep %2 / 16 + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 + pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] + lea r0, [r0 + r1 * 4] + movd xm5, [r0] + pinsrd xm5, [r0 + r1], 1 + pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] + vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] + mova m5, [v4_interp4_vpp_shuf1] + vpermd m0, m5, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] + vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] + vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] + mova m5, [v4_interp4_vpp_shuf1 + mmsize] + vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] + vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] + + mova m5, [v4_interp4_vpp_shuf] + pshufb m0, m0, m5 + pshufb m1, m1, m5 + pshufb m2, m2, m5 + pshufb m4, m4, m5 + pshufb m3, m3, m5 + pshufb m6, m6, m5 + pshufb m7, m7, m5 + pshufb m8, m8, m5 
+ pmaddubsw m0, m10 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + pmaddubsw m8, m10 + pmaddubsw m1, m11 + pmaddubsw m2, m11 + pmaddubsw m3, m11 + pmaddubsw m4, m11 + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] + paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] + paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] +%ifidn %1,pp + pmulhrsw m0, m9 + pmulhrsw m6, m9 + pmulhrsw m7, m9 + pmulhrsw m8, m9 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m7, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm7 + pextrd [r2 + r3], xm7, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r5], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm7, 2 + pextrd [r2 + r3], xm7, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r5], xm2, 3 +%else + psubw m0, m9 + psubw m6, m9 + psubw m7, m9 + psubw m8, m9 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m6, 1 + vextracti128 xm3, m7, 1 + vextracti128 xm4, m8, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movhps [r2 + r3], xm6 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm7 + movhps [r2 + r3], xm7 + movq [r2 + r3 * 2], xm3 + movhps [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm4 + movhps [r2 + r5], xm4 +%endif + lea r2, [r2 + r3 * 4] +%endrep + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_4xN pp, 16 + FILTER_VER_CHROMA_AVX2_4xN ps, 16 + FILTER_VER_CHROMA_AVX2_4xN pp, 32 + FILTER_VER_CHROMA_AVX2_4xN ps, 32 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;----------------------------------------------------------------------------- +%macro FILTER_V4_W4_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_512] + + mov r4d, %2 + + lea r5, [3 * r1] + +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + pmulhrsw m2, m1 + + movd m7, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m7 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m7, m3 + punpcklbw m5, m7 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + pmulhrsw m4, m1 + packuswb m2, m4 + movd [r2], m2 + pextrd [r2 + r3], m2, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m2, 2 + pextrd [r2 + r3], m2, 3 + + lea r2, [r2 + 2 * r3] + + sub r4, 4 + jnz .loop + RET +%endmacro + + FILTER_V4_W4_H4 4, 8 + FILTER_V4_W4_H4 4, 16 + + FILTER_V4_W4_H4 4, 32 + +%macro FILTER_V4_W8_H2 0 + punpcklbw m1, m2 + punpcklbw m7, m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m5 + + paddw m1, m7 + + pmulhrsw m1, m4 + packuswb m1, m1 +%endmacro + +%macro FILTER_V4_W8_H3 0 + punpcklbw m2, m3 + punpcklbw m7, m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m7, m5 + + paddw m2, m7 + + pmulhrsw m2, m4 + packuswb m2, m2 +%endmacro + +%macro FILTER_V4_W8_H4 0 + punpcklbw m3, m0 + punpcklbw m7, m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m7, m5 + + paddw m3, m7 + + pmulhrsw m3, m4 + packuswb m3, m3 +%endmacro + +%macro FILTER_V4_W8_H5 0 + punpcklbw m0, m1 + punpcklbw m7, m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m5 + + paddw m0, m7 + + pmulhrsw m0, m4 + packuswb m0, m0 +%endmacro + +%macro FILTER_V4_W8_8x2 2 
+ FILTER_V4_W8 %1, %2 + movq m0, [r0 + 4 * r1] + + FILTER_V4_W8_H2 + + movh [r2 + r3], m1 +%endmacro + +%macro FILTER_V4_W8_8x4 2 + FILTER_V4_W8_8x2 %1, %2 +;8x3 + lea r6, [r0 + 4 * r1] + movq m1, [r6 + r1] + + FILTER_V4_W8_H3 + + movh [r2 + 2 * r3], m2 + +;8x4 + movq m2, [r6 + 2 * r1] + + FILTER_V4_W8_H4 + + lea r5, [r2 + 2 * r3] + movh [r5 + r3], m3 +%endmacro + +%macro FILTER_V4_W8_8x6 2 + FILTER_V4_W8_8x4 %1, %2 +;8x5 + lea r6, [r6 + 2 * r1] + movq m3, [r6 + r1] + + FILTER_V4_W8_H5 + + movh [r2 + 4 * r3], m0 + +;8x6 + movq m0, [r0 + 8 * r1] + + FILTER_V4_W8_H2 + + lea r5, [r2 + 4 * r3] + movh [r5 + r3], m1 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 + + mov r4d, r4m + + sub r0, r1 + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + lea r5, [r0 + 2 * r1] + movq m3, [r5 + r1] + + punpcklbw m0, m1 + punpcklbw m4, m2, m3 + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd m5, [r6 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pmaddubsw m0, m6 + + pshufb m5, [tab_Vm + 16] + pmaddubsw m4, m5 + + paddw m0, m4 + + mova m4, [pw_512] + + pmulhrsw m0, m4 + packuswb m0, m0 + movh [r2], m0 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + FILTER_V4_W8_8x2 8, 2 + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;----------------------------------------------------------------------------- + FILTER_V4_W8_8x4 8, 4 + + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + FILTER_V4_W8_8x6 8, 6 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + movd m2, [r0] + movd m3, [r0 + r1] + lea r5, [r0 + 2 * r1] + movd m4, [r5] + movd m5, [r5 + r1] + + punpcklbw m2, m3 + punpcklbw m1, m4, m5 + punpcklbw m2, m1 + + pmaddubsw m2, m0 + + movd m1, [r0 + 4 * r1] + + punpcklbw m3, m4 + punpcklbw m5, m1 + punpcklbw m3, m5 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + psubw m2, [pw_2000] + movh [r2], m2 + movhps [r2 + r3], m2 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + lea r4, [r1 * 3] + lea r5, [r0 
+ 4 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r4] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + movd m6, [r5] + + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + mova m1, [pw_2000] + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 + + movd m2, [r5 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r5 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + RET + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W4_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_2000] + + mov r4d, %2/4 + lea r5, [3 * r1] + +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + + psubw m2, m1 + movh [r2], m2 + movhps [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw 
m4, m1 + lea r2, [r2 + 2 * r3] + movh [r2], m4 + movhps [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W4_H4 4, 8 + FILTER_V_PS_W4_H4 4, 16 + + FILTER_V_PS_W4_H4 4, 32 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + + mov r4d, %2/2 + lea r5, [3 * r1] + +.loopH: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m2, m5 + + paddw m0, m2 + + psubw m0, m4 + movu [r2], m0 + + movq m0, [r0 + 4 * r1] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m3, m5 + + paddw m1, m3 + psubw m1, m4 + + movu [r2 + r3], m1 + + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_V_PS_W8_H8_H16_H2 8, 2 + FILTER_V_PS_W8_H8_H16_H2 8, 4 + FILTER_V_PS_W8_H8_H16_H2 8, 6 + + FILTER_V_PS_W8_H8_H16_H2 8, 12 + FILTER_V_PS_W8_H8_H16_H2 8, 64 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W8_H8_H16_H32 2 +INIT_XMM sse4 +cglobal 
interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + + mov r4d, %2/4 + lea r5, [3 * r1] + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + + psubw m0, m4 + movu [r2], m0 + + lea r0, [r0 + 4 * r1] + movq m0, [r0] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + + psubw m1, m4 + movu [r2 + r3], m1 + + movq m1, [r0 + r1] + + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + + psubw m2, m4 + lea r2, [r2 + 2 * r3] + movu [r2], m2 + + movq m2, [r0 + 2 * r1] + + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + psubw m3, m4 + + movu [r2 + r3], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W8_H8_H16_H32 8, 8 + FILTER_V_PS_W8_H8_H16_H32 8, 16 + FILTER_V_PS_W8_H8_H16_H32 8, 32 + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_V_PS_W6 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 
+ punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + psubw m0, m4 + + movh [r2], m0 + pshufd m0, m0, 2 + movd [r2 + 8], m0 + + lea r0, [r0 + 4 * r1] + movq m0, [r0] + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + psubw m1, m4 + + movh [r2 + r3], m1 + pshufd m1, m1, 2 + movd [r2 + r3 + 8], m1 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + psubw m2, m4 + + lea r2,[r2 + 2 * r3] + movh [r2], m2 + pshufd m2, m2, 2 + movd [r2 + 8], m2 + + movq m2,[r0 + 2 * r1] + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + psubw m3, m4 + + movh [r2 + r3], m3 + pshufd m3, m3, 2 + movd [r2 + r3 + 8], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W6 6, 8 + FILTER_V_PS_W6 6, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W12 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movh [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw 
m4, m1 + pmaddubsw m3, m1 + + movu m2, [r0 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movh [r2 + r3 + 16], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W12 12, 16 + FILTER_V_PS_W12 12, 32 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W16 16, 4 + FILTER_V_PS_W16 16, 8 + FILTER_V_PS_W16 16, 12 + FILTER_V_PS_W16 16, 16 + FILTER_V_PS_W16 16, 32 + + FILTER_V_PS_W16 16, 24 + FILTER_V_PS_W16 16, 64 + 
+;-------------------------------------------------------------------------------------------------------------- +;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_V4_PS_W24 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + + punpcklbw m2, m3 + punpcklbw m7, m4, m5 + + pmaddubsw m2, m1 + pmaddubsw m7, m0 + + paddw m2, m7 + psubw m2, m6 + + movu [r2 + 32], m2 + + movq m2, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m2 + + pmaddubsw m3, m1 + pmaddubsw m5, m0 + + paddw m3, m5 + psubw m3, m6 + + movu [r2 + r3 + 32], m3 + + mov r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V4_PS_W24 24, 32 + + FILTER_V4_PS_W24 24, 64 + 
+;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mova m7, [pw_2000] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + psubw m4, m7 + psubw m2, m7 + + movu [r2], m4 + movu [r2 + 16], m2 + + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + psubw m4, m7 + psubw m2, m7 + + movu [r2 + 32], m4 + movu [r2 + 48], m2 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W32 32, 8 + FILTER_V_PS_W32 32, 16 + FILTER_V_PS_W32 32, 24 + FILTER_V_PS_W32 32, 32 + + FILTER_V_PS_W32 32, 48 + FILTER_V_PS_W32 32, 64 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W8_H8_H16_H32 2 +INIT_XMM sse4 
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_512] + lea r5, [r1 * 3] + + mov r4d, %2 + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + + pmulhrsw m0, m4 + packuswb m0, m0 + movh [r2], m0 + + lea r0, [r0 + 4 * r1] + movq m0, [r0] + + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + + pmulhrsw m1, m4 + packuswb m1, m1 + movh [r2 + r3], m1 + + movq m1, [r0 + r1] + + punpcklbw m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m0, m5 + + paddw m2, m0 + + pmulhrsw m2, m4 + + movq m7, [r0 + 2 * r1] + punpcklbw m1, m7 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + + pmulhrsw m3, m4 + packuswb m2, m3 + + lea r2, [r2 + 2 * r3] + movh [r2], m2 + movhps [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + sub r4, 4 + jnz .loop + RET +%endmacro + + FILTER_V4_W8_H8_H16_H32 8, 8 + FILTER_V4_W8_H8_H16_H32 8, 16 + FILTER_V4_W8_H8_H16_H32 8, 32 + + FILTER_V4_W8_H8_H16_H32 8, 12 + FILTER_V4_W8_H8_H16_H32 8, 64 + +%macro PROCESS_CHROMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 
32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 +%endmacro + +%macro FILTER_VER_CHROMA_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + 
pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x8 pp + FILTER_VER_CHROMA_AVX2_8x8 ps + +%macro FILTER_VER_CHROMA_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 
35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m4, [r5 + 1 * mmsize] + paddw m1, m4 +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + packuswb m5, m2 + packuswb m1, m1 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 +%else + add r3d, r3d + mova m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + vextracti128 xm4, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm4 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x6 pp + FILTER_VER_CHROMA_AVX2_8x6 ps + +%macro 
PROCESS_CHROMA_AVX2_W8_16R 1 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm5, m1, 1 + vextracti128 xm3, m4, 
1 + movu [r2], xm1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm3 +%endif + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r0, [r0 + r1 * 4] + movq xm5, [r0] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r0 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] + movq xm3, [r0 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, [r5 + 1 * mmsize] + paddw m5, m3 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 18 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 + lea r2, [r2 + r3 * 4] +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r2], xm0 + movq [r2 + r3], xm6 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm6 + lea r2, [r2 + r3 * 4] + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m5, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm5 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif +%endmacro + 
+%macro FILTER_VER_CHROMA_AVX2_8x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + PROCESS_CHROMA_AVX2_W8_16R %1 + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x16 pp + FILTER_VER_CHROMA_AVX2_8x16 ps + +%macro FILTER_VER_CHROMA_AVX2_8x12 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1, pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] +%ifidn %1, pp + 
pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm5, m1, 1 + vextracti128 xm3, m4, 1 + movu [r2], xm1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm3 +%endif + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r0, [r0 + r1 * 4] + movq xm5, [r0] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r0 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m6, m3 + lea r2, [r2 + r3 * 4] +%ifidn %1, pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + packuswb m0, m6 + vextracti128 xm6, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm6 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm6 +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x12 pp + FILTER_VER_CHROMA_AVX2_8x12 ps + +%macro 
FILTER_VER_CHROMA_AVX2_8xN 2 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + PROCESS_CHROMA_AVX2_W8_16R %1 + lea r2, [r2 + r3 * 4] +%endrep + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8xN pp, 32 + FILTER_VER_CHROMA_AVX2_8xN ps, 32 + FILTER_VER_CHROMA_AVX2_8xN pp, 64 + FILTER_VER_CHROMA_AVX2_8xN ps, 64 + +%macro PROCESS_CHROMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m0, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 +%endmacro + +%macro FILTER_VER_CHROMA_AVX2_8x4 1 
+INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_4R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m0, m3 ; m0 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m0, m3 ; m0 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + vextracti128 xm1, m0, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm4 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x4 pp + FILTER_VER_CHROMA_AVX2_8x4 ps + +%macro FILTER_VER_CHROMA_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m1, [r5] + movq xm2, [r0 + r4] ; m2 = row 3 + punpcklbw xm3, xm2 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + movq xm0, [r0 + r1 * 4] ; m0 = row 4 + punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m3, 
[r5 + 1 * mmsize] + paddw m1, m3 +%ifidn %1,pp + pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 + packuswb m1, m1 + vextracti128 xm0, m1, 1 + movq [r2], xm1 + movq [r2 + r3], xm0 +%else + add r3d, r3d + psubw m1, [pw_2000] ; m1 = word: row 0, row 1 + vextracti128 xm0, m1, 1 + movu [r2], xm1 + movu [r2 + r3], xm0 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_8x2 pp + FILTER_VER_CHROMA_AVX2_8x2 ps + +%macro FILTER_VER_CHROMA_AVX2_6x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movd [r2], xm5 + pextrw [r2 + 4], xm5, 2 + movd [r2 + r3], xm2 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm5, 2 + pextrw [r2 + r3 * 2 + 4], xm5, 6 + pextrd [r2 + r4], xm2, 2 + pextrw [r2 + r4 + 4], xm2, 6 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrw [r2 + 4], xm1, 2 + movd [r2 + r3], xm4 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r3 * 2 + 4], xm1, 6 + pextrd [r2 + r4], xm4, 2 + pextrw [r2 + r4 + 4], xm4, 6 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movq [r2], xm5 + pextrd [r2 + 8], xm5, 2 + movq [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm6, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r4], xm3 + pextrd [r2 + r4 
+ 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + pextrd [r2 + 8], xm1, 2 + movq [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm4 + pextrd [r2 + r3 * 2 + 8], xm4, 2 + vextracti128 xm4, m4, 1 + movq [r2 + r4], xm4 + pextrd [r2 + r4 + 8], xm4, 2 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_6x8 pp + FILTER_VER_CHROMA_AVX2_6x8 ps + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m5, [r5 + r4 * 4] +%else + movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m6, m5, [tab_Vm] + pshufb m5, [tab_Vm + 16] + mova m4, [pw_512] + + mov r4d, %2 + lea r5, [3 * r1] + +.loop: + movq m0, [r0] + movq m1, [r0 + r1] + movq m2, [r0 + 2 * r1] + movq m3, [r0 + r5] + + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + + pmaddubsw m0, m6 + pmaddubsw m7, m2, m5 + + paddw m0, m7 + + pmulhrsw m0, m4 + packuswb m0, m0 + movd [r2], m0 + pextrw [r2 + 4], m0, 2 + + lea r0, [r0 + 4 * r1] + + movq m0, [r0] + punpcklbw m3, m0 + + pmaddubsw m1, m6 + pmaddubsw m7, m3, m5 + + paddw m1, m7 + + pmulhrsw m1, m4 + packuswb m1, m1 + movd [r2 + r3], m1 + pextrw [r2 + r3 + 4], m1, 2 + + movq m1, [r0 + r1] + punpcklbw m7, m0, m1 + + pmaddubsw m2, m6 + pmaddubsw m7, m5 + + paddw m2, m7 + + pmulhrsw m2, m4 + packuswb m2, m2 + lea r2, [r2 + 2 * r3] + movd [r2], m2 + pextrw [r2 + 4], m2, 2 + + movq m2, [r0 + 2 * r1] + punpcklbw m1, m2 + + pmaddubsw m3, m6 + pmaddubsw m1, m5 + + paddw m3, m1 + + pmulhrsw m3, m4 + packuswb m3, m3 + + movd [r2 + r3], m3 + pextrw [r2 + r3 + 4], m3, 2 + + lea r2, [r2 + 2 * r3] + + sub r4, 4 + jnz .loop + RET +%endmacro + + FILTER_V4_W6_H4 6, 8 + + FILTER_V4_W6_H4 
6, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W12_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m7, [r0 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_512] + + pmulhrsw m4, m6 + pmulhrsw m2, m6 + + packuswb m4, m2 + + movh [r2], m4 + pextrd [r2 + 8], m4, 2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + pmulhrsw m4, m6 + pmulhrsw m3, m6 + + packuswb m4, m3 + + movh [r2 + r3], m4 + pextrd [r2 + r3 + 8], m4, 2 + + lea r2, [r2 + 2 * r3] + + sub r4, 2 + jnz .loop + RET +%endmacro + + FILTER_V4_W12_H2 12, 16 + + FILTER_V4_W12_H2 12, 32 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + 
pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r0, [r0 + 2 * r1] + movu m5, [r0] + movu m6, [r0 + r1] + + punpckhbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m2, m7 + + punpcklbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m4, m7 + + mova m7, [pw_512] + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2], m4 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r0 + 2 * r1] + + punpcklbw m2, m6, m5 + punpckhbw m6, m5 + + pmaddubsw m2, m0 + pmaddubsw m6, m0 + + paddw m4, m2 + paddw m3, m6 + + pmulhrsw m4, m7 + pmulhrsw m3, m7 + + packuswb m4, m3 + + movu [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V4_W16_H2 16, 4 + FILTER_V4_W16_H2 16, 8 + FILTER_V4_W16_H2 16, 12 + FILTER_V4_W16_H2 16, 16 + FILTER_V4_W16_H2 16, 32 + + FILTER_V4_W16_H2 16, 24 + FILTER_V4_W16_H2 16, 64 + +%macro FILTER_VER_CHROMA_AVX2_16x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m12, [r5] + mova m13, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r5, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m12 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m12 + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 + lea r0, [r0 + r1 * 4] + movu 
xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, m13 + paddw m4, m8 + pmaddubsw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, m13 + paddw m5, m9 + pmaddubsw m7, m12 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, m13 + paddw m6, m10 + pmaddubsw m8, m12 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, m13 + paddw m7, m11 + pmaddubsw m9, m12 + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + 
movu [r2 + r5], xm7 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 + movu [r2 + r3 * 2], m6 + movu [r2 + r5], m7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm6, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm6, 1 + pmaddubsw m6, m10, m13 + paddw m8, m6 + pmaddubsw m10, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm7, xm11, xm6 + punpcklbw xm11, xm6 + vinserti128 m11, m11, xm7, 1 + pmaddubsw m7, m11, m13 + paddw m9, m7 + pmaddubsw m11, m12 + + movu xm7, [r0 + r1] ; m7 = row 13 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m10, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m11, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, m13 + paddw m6, m2 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m13 + paddw m7, m3 + pmaddubsw m1, m12 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, m13 + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m3, m13 + paddw m1, m3 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = 
word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m6, m14 ; m6 = word: row 12 + pmulhrsw m7, m14 ; m7 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m6, m7 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m6, m6, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r5], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m6, m14 ; m6 = word: row 12 + psubw m7, m14 ; m7 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r5], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_16x16 pp + FILTER_VER_CHROMA_AVX2_16x16 ps +%macro FILTER_VER_CHROMA_AVX2_16x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m6, [pw_512] +%else + add r3d, r3d + mova m6, [pw_2000] +%endif + lea r6, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw 
m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] +%ifidn %1,pp + pmulhrsw m0, m6 ; m0 = word: row 0 + pmulhrsw m1, m6 ; m1 = word: row 1 + packuswb m0, m1 + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 +%else + psubw m0, m6 ; m0 = word: row 0 + psubw m1, m6 ; m1 = word: row 1 + movu [r2], m0 + movu [r2 + r3], m1 +%endif + + movu xm0, [r0 + r1] ; m0 = row 5 + punpckhbw xm1, xm4, xm0 + punpcklbw xm4, xm0 + vinserti128 m4, m4, xm1, 1 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m4, [r5] + movu xm1, [r0 + r1 * 2] ; m1 = row 6 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + mmsize] + paddw m3, m5 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m2, m6 ; m2 = word: row 2 + pmulhrsw m3, m6 ; m3 = word: row 3 + packuswb m2, m3 + vpermq m2, m2, 11011000b + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%else + psubw m2, m6 ; m2 = word: row 2 + psubw m3, m6 ; m3 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 +%endif + + movu xm2, [r0 + r4] ; m2 = row 7 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m1, [r5] + lea r0, [r0 + r1 * 4] + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm5, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm5, 1 + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 + pmaddubsw m2, [r5] + lea r2, [r2 + r3 * 4] +%ifidn %1,pp + pmulhrsw m4, m6 ; m4 = word: row 4 + pmulhrsw m0, m6 ; m0 = word: row 5 + packuswb m4, m0 + vpermq m4, m4, 11011000b + vextracti128 xm0, m4, 1 + movu [r2], xm4 + 
movu [r2 + r3], xm0 +%else + psubw m4, m6 ; m4 = word: row 4 + psubw m0, m6 ; m0 = word: row 5 + movu [r2], m4 + movu [r2 + r3], m0 +%endif + + movu xm5, [r0 + r1] ; m5 = row 9 + punpckhbw xm4, xm3, xm5 + punpcklbw xm3, xm5 + vinserti128 m3, m3, xm4, 1 + pmaddubsw m3, [r5 + mmsize] + paddw m1, m3 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhbw xm0, xm5, xm4 + punpcklbw xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m5, [r5 + mmsize] + paddw m2, m5 +%ifidn %1,pp + pmulhrsw m1, m6 ; m1 = word: row 6 + pmulhrsw m2, m6 ; m2 = word: row 7 + packuswb m1, m2 + vpermq m1, m1, 11011000b + vextracti128 xm2, m1, 1 + movu [r2 + r3 * 2], xm1 + movu [r2 + r6], xm2 +%else + psubw m1, m6 ; m1 = word: row 6 + psubw m2, m6 ; m2 = word: row 7 + movu [r2 + r3 * 2], m1 + movu [r2 + r6], m2 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_16x8 pp + FILTER_VER_CHROMA_AVX2_16x8 ps + +%macro FILTER_VER_CHROMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m8, [r5] + mova m9, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r5, [r3 * 3] + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, m8 + pmaddubsw m3, m2, m9 + paddw m4, m3 + pmaddubsw m2, m8 + + vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 + + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, m8 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb 
m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, m9 + paddw m5, m1 + pmaddubsw m0, m8 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m5 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 + + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, m9 + paddw m6, m5 + pmaddubsw m3, m8 + pmaddubsw m5, m2, m9 + paddw m0, m5 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 +%endif + + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, m9 + paddw m2, m4 + pmaddubsw m0, m8 + pmaddubsw m4, m5, m9 + paddw m3, m4 + pmaddubsw m5, m8 + +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 
+ packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r5], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r5], m2 +%endif + lea r2, [r2 + r3 * 4] + + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 + + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m5, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m0, m1 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m1, m0, m9 + paddw m4, m1 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r5], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r5], m2 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_16x12 pp + FILTER_VER_CHROMA_AVX2_16x12 ps + +%macro FILTER_VER_CHROMA_AVX2_16xN 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else 
+ add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + mov r7d, %2 / 16 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m2, [r5] + + vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 + + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m5 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 + + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m6, m5 + 
pmaddubsw m3, [r5] + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 + pmaddubsw m2, [r5] + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 +%endif + + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, [r5 + mmsize] + paddw m2, m4 + pmaddubsw m0, [r5] + pmaddubsw m4, m5, [r5 + mmsize] + paddw m3, m4 + pmaddubsw m5, [r5] + +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r6], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] + + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 + + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m2, [r5] + +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, 
m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + pmaddubsw m1, m0, [r5 + mmsize] + paddw m4, m1 + pmaddubsw m0, [r5] + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] + + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m5, m5, xm3, 1 + + punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m6, m1 + pmaddubsw m2, [r5] + +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm6 +%else + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r2], m0 + movu [r2 + r3], m6 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 + pmaddubsw m0, [r5 + mmsize] + paddw m4, m0 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 14 + pmulhrsw m2, m7 ; m2 = word: row 15 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 14 + psubw m2, m7 ; m2 = word: row 15 + movu [r2 + r3 * 2], m4 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] + dec r7d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_16xN pp, 32 + 
FILTER_VER_CHROMA_AVX2_16xN ps, 32 + FILTER_VER_CHROMA_AVX2_16xN pp, 64 + FILTER_VER_CHROMA_AVX2_16xN ps, 64 + +%macro FILTER_VER_CHROMA_AVX2_16x24 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m12, [r5] + mova m13, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r5, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m12 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m12 + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, m13 + paddw m4, m8 + pmaddubsw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, m13 + paddw m5, m9 + pmaddubsw m7, m12 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + 
punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, m13 + paddw m6, m10 + pmaddubsw m8, m12 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, m13 + paddw m7, m11 + pmaddubsw m9, m12 + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + vpermq m6, m6, q3120 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r5], xm7 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 + movu [r2 + r3 * 2], m6 + movu [r2 + r5], m7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm6, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm6, 1 + pmaddubsw m6, m10, m13 + paddw m8, m6 + pmaddubsw m10, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm7, xm11, xm6 + punpcklbw xm11, xm6 + vinserti128 m11, m11, xm7, 1 + pmaddubsw m7, m11, m13 + paddw m9, m7 + pmaddubsw m11, m12 + + movu xm7, [r0 + r1] ; m7 = 
row 13 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m10, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m11, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, m13 + paddw m6, m2 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m13 + paddw m7, m3 + pmaddubsw m1, m12 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m13 + paddw m0, m4 + pmaddubsw m2, m12 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m13 + paddw m1, m5 + pmaddubsw m3, m12 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m6, m14 ; m6 = word: row 12 + pmulhrsw m7, m14 ; m7 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m6, m7 + packuswb m0, m1 + vpermq m8, m8, q3120 + vpermq m10, m10, q3120 + vpermq m6, m6, q3120 + vpermq m0, m0, q3120 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r5], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m6, m14 
; m6 = word: row 12 + psubw m7, m14 ; m7 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r5], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 +%endif + lea r2, [r2 + r3 * 4] + + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, m13 + paddw m2, m6 + pmaddubsw m4, m12 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, m13 + paddw m3, m7 + pmaddubsw m5, m12 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm0, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm0, 1 + pmaddubsw m0, m6, m13 + paddw m4, m0 + pmaddubsw m6, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 22 + punpckhbw xm1, xm7, xm0 + punpcklbw xm7, xm0 + vinserti128 m7, m7, xm1, 1 + pmaddubsw m1, m7, m13 + paddw m5, m1 + pmaddubsw m7, m12 + movu xm1, [r0 + r4] ; m1 = row 23 + punpckhbw xm8, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm8, 1 + pmaddubsw m8, m0, m13 + paddw m6, m8 + pmaddubsw m0, m12 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 24 + punpckhbw xm9, xm1, xm8 + punpcklbw xm1, xm8 + vinserti128 m1, m1, xm9, 1 + pmaddubsw m9, m1, m13 + paddw m7, m9 + pmaddubsw m1, m12 + movu xm9, [r0 + r1] ; m9 = row 25 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m8, m13 + paddw m0, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 26 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m9, m13 + paddw m1, m9 + +%ifidn %1,pp + pmulhrsw m2, m14 ; m2 = word: row 16 + pmulhrsw m3, m14 ; m3 = word: row 17 + pmulhrsw m4, m14 ; m4 = word: row 18 + pmulhrsw m5, m14 ; m5 = word: row 19 + pmulhrsw m6, m14 ; m6 = word: row 20 + pmulhrsw m7, m14 ; m7 = word: row 21 + pmulhrsw m0, m14 ; 
m0 = word: row 22 + pmulhrsw m1, m14 ; m1 = word: row 23 + packuswb m2, m3 + packuswb m4, m5 + packuswb m6, m7 + packuswb m0, m1 + vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + vpermq m6, m6, q3120 + vpermq m0, m0, q3120 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm2 + movu [r2 + r3], xm3 + movu [r2 + r3 * 2], xm4 + movu [r2 + r5], xm5 + lea r2, [r2 + r3 * 4] + movu [r2], xm6 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm0 + movu [r2 + r5], xm1 +%else + psubw m2, m14 ; m2 = word: row 16 + psubw m3, m14 ; m3 = word: row 17 + psubw m4, m14 ; m4 = word: row 18 + psubw m5, m14 ; m5 = word: row 19 + psubw m6, m14 ; m6 = word: row 20 + psubw m7, m14 ; m7 = word: row 21 + psubw m0, m14 ; m0 = word: row 22 + psubw m1, m14 ; m1 = word: row 23 + movu [r2], m2 + movu [r2 + r3], m3 + movu [r2 + r3 * 2], m4 + movu [r2 + r5], m5 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_16x24 pp + FILTER_VER_CHROMA_AVX2_16x24 ps + +%macro FILTER_VER_CHROMA_AVX2_24x32 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m8, [r5] + mova m9, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r6, [r3 * 3] + mov r5d, 2 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, m8 + pmaddubsw m3, m2, m9 + paddw m4, m3 + pmaddubsw m2, m8 + + vextracti128 xm0, m0, 1 + lea r7, [r0 + r1 * 4] + vinserti128 m0, m0, [r7], 1 + + punpcklbw m5, m1, m0 + 
punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, m8 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r7 + r1 * 2] + vinserti128 m4, m4, [r7 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, m9 + paddw m5, m1 + pmaddubsw m0, m8 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m5 +%endif + lea r8, [r2 + r3 * 4] + + movu xm1, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m1, m1, [r7], 1 + vinserti128 m4, m4, xm1, 1 + + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, m9 + paddw m6, m5 + pmaddubsw m3, m8 + pmaddubsw m5, m2, m9 + paddw m0, m5 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r8], xm6 + movu [r8 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r8], m6 + movu [r8 + r3], m0 +%endif + + movu xm6, [r7 + r1 * 2] + vinserti128 m6, m6, [r7 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + 
vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, m9 + paddw m2, m4 + pmaddubsw m0, m8 + pmaddubsw m4, m5, m9 + paddw m3, m4 + pmaddubsw m5, m8 + +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r8 + r3 * 2], xm3 + movu [r8 + r6], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r8 + r3 * 2], m3 + movu [r8 + r6], m2 +%endif + lea r8, [r8 + r3 * 4] + + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 + vinserti128 m6, m6, xm3, 1 + + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m5, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m0, m1 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r8], xm5 + movu [r8 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r8], m5 + movu [r8 + r3], m0 +%endif + + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + pmaddubsw m1, m0, m9 + paddw m4, m1 + pmaddubsw m0, m8 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 +%endif + lea r8, [r8 + r3 * 4] + + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 + vinserti128 m5, m5, xm3, 1 + + 
punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m0, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m6, m1 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm6 +%else + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r8], m0 + movu [r8 + r3], m6 +%endif + + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m6, m9 + paddw m2, m6 + pmaddubsw m0, m9 + paddw m4, m0 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 14 + pmulhrsw m2, m7 ; m2 = word: row 15 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 + add r2, 16 +%else + psubw m4, m7 ; m4 = word: row 14 + psubw m2, m7 ; m2 = word: row 15 + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 + add r2, 32 +%endif + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, m8 + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, m9 + paddw m5, m0 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, m9 + paddw m2, m0 + pmaddubsw m1, m8 + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + 
pmaddubsw m3, m4, m9 + paddw m1, m3 + pmaddubsw m4, m8 + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, m9 + paddw m4, m3 + pmaddubsw m0, m8 + +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 +%endif + lea r8, [r8 + r3 * 4] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm5, [r7] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, m9 + paddw m0, m3 + pmaddubsw m6, m8 + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r7 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r7, [r7 + r1 * 4] + movq xm1, [r7] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, m9 + paddw m5, m3 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 18 + 
punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m9 + paddw m2, m3 +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm5 + movq [r8 + r3], xm2 + movhps [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm2 + lea r2, [r8 + r3 * 4 - 16] +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 xm3, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m5, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm5 + movu [r8 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r8 + r3 * 2], xm2 + movu [r8 + r6], xm3 + lea r2, [r8 + r3 * 4 - 32] +%endif + lea r0, [r7 - 16] + dec r5d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_24x32 pp + FILTER_VER_CHROMA_AVX2_24x32 ps + +%macro FILTER_VER_CHROMA_AVX2_24x64 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_24x64, 4, 7, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + lea r5, [r3 * 3] + mov r6d, 16 +.loopH: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + 
pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + 16], xm2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], xm2 +%endif + lea r0, [r0 + r1 * 4] + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 +%ifidn %1,pp + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + r3 + 16], xm4 +%else + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], xm4 +%endif + + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], xm6 +%endif + + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r5 + 16], xm2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], xm2 +%endif + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopH 
+ RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_24x64 pp + FILTER_VER_CHROMA_AVX2_24x64 ps + +%macro FILTER_VER_CHROMA_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m2, [r5] + + vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 + + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + lea r2, [r2 + r3 * 2] + + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m0, [r5 + mmsize] + paddw m5, m0 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2], xm2 + movu [r2 + r3], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2], m2 
+ movu [r2 + r3], m5 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_16x4 pp + FILTER_VER_CHROMA_AVX2_16x4 ps + +%macro FILTER_VER_CHROMA_AVX2_12xN 2 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 0 + pmulhrsw m1, m7 ; m1 = word: row 1 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm1, 2 +%else + psubw m0, m7 ; m0 = word: row 0 + psubw m1, m7 ; m1 = word: row 1 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 +%endif + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm0, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m0, m5, [r5 + 1 * mmsize] + 
paddw m3, m0 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m3, m7 ; m3 = word: row 3 + packuswb m2, m3 + vextracti128 xm3, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm3 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm3, 2 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m3, m7 ; m3 = word: row 3 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 +%endif + lea r2, [r2 + r3 * 4] + + movu xm0, [r0 + r4] ; m0 = row 7 + punpckhbw xm3, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm1, xm0, xm3 + punpcklbw xm0, xm3 + vinserti128 m0, m0, xm1, 1 + pmaddubsw m1, m0, [r5 + 1 * mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 4 + pmulhrsw m5, m7 ; m5 = word: row 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [r2], xm4 + movd [r2 + 8], xm5 + movhps [r2 + r3], xm4 + pextrd [r2 + r3 + 8], xm5, 2 +%else + psubw m4, m7 ; m4 = word: row 4 + psubw m5, m7 ; m5 = word: row 5 + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 +%endif + + movu xm1, [r0 + r1] ; m1 = row 9 + punpckhbw xm2, xm3, xm1 + punpcklbw xm3, xm1 + vinserti128 m3, m3, xm2, 1 + pmaddubsw m2, m3, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m3, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 10 + punpckhbw xm4, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm4, 1 + pmaddubsw m4, m1, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m1, [r5] + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 6 + pmulhrsw m0, m7 ; m0 = word: row 7 + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movq [r2 + r3 * 2], xm6 + movd [r2 + r3 * 2 + 8], xm0 + movhps [r2 + r6], xm6 + pextrd [r2 + r6 + 8], 
xm0, 2 +%else + psubw m6, m7 ; m6 = word: row 6 + psubw m0, m7 ; m0 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r6 + 16], xm0 +%endif + lea r2, [r2 + r3 * 4] + + movu xm4, [r0 + r4] ; m4 = row 11 + punpckhbw xm6, xm2, xm4 + punpcklbw xm2, xm4 + vinserti128 m2, m2, xm6, 1 + pmaddubsw m6, m2, [r5 + 1 * mmsize] + paddw m3, m6 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm0, xm4, xm6 + punpcklbw xm4, xm6 + vinserti128 m4, m4, xm0, 1 + pmaddubsw m0, m4, [r5 + 1 * mmsize] + paddw m1, m0 + pmaddubsw m4, [r5] +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 8 + pmulhrsw m1, m7 ; m1 = word: row 9 + packuswb m3, m1 + vextracti128 xm1, m3, 1 + movq [r2], xm3 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm3 + pextrd [r2 + r3 + 8], xm1, 2 +%else + psubw m3, m7 ; m3 = word: row 8 + psubw m1, m7 ; m1 = word: row 9 + movu [r2], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + 16], xm3 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 +%endif + + movu xm0, [r0 + r1] ; m0 = row 13 + punpckhbw xm1, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm1, 1 + pmaddubsw m1, m6, [r5 + 1 * mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + movu xm1, [r0 + r1 * 2] ; m1 = row 14 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + 1 * mmsize] + paddw m4, m5 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 10 + pmulhrsw m4, m7 ; m4 = word: row 11 + packuswb m2, m4 + vextracti128 xm4, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm4 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm4, 2 +%else + psubw m2, m7 ; m2 = word: row 10 + psubw m4, m7 ; m4 = word: row 11 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + r6 + 16], xm4 +%endif + lea r2, [r2 + r3 * 
4] + + movu xm5, [r0 + r4] ; m5 = row 15 + punpckhbw xm2, xm1, xm5 + punpcklbw xm1, xm5 + vinserti128 m1, m1, xm2, 1 + pmaddubsw m2, m1, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m1, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm5, xm2 + punpcklbw xm5, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m5, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m1, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm2, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m5, m3 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 12 + pmulhrsw m0, m7 ; m0 = word: row 13 + pmulhrsw m1, m7 ; m1 = word: row 14 + pmulhrsw m5, m7 ; m5 = word: row 15 + packuswb m6, m0 + packuswb m1, m5 + vextracti128 xm0, m6, 1 + vextracti128 xm5, m1, 1 + movq [r2], xm6 + movd [r2 + 8], xm0 + movhps [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm1 + movd [r2 + r3 * 2 + 8], xm5 + movhps [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm5, 2 +%else + psubw m6, m7 ; m6 = word: row 12 + psubw m0, m7 ; m0 = word: row 13 + psubw m1, m7 ; m1 = word: row 14 + psubw m5, m7 ; m5 = word: row 15 + movu [r2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + 16], xm6 + movu [r2 + r3], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 + 16], xm0 + movu [r2 + r3 * 2], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 * 2 + 16], xm1 + movu [r2 + r6], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r6 + 16], xm5 +%endif + lea r2, [r2 + r3 * 4] +%endrep + RET +%endmacro + + FILTER_VER_CHROMA_AVX2_12xN pp, 16 + FILTER_VER_CHROMA_AVX2_12xN ps, 16 + FILTER_VER_CHROMA_AVX2_12xN pp, 32 + FILTER_VER_CHROMA_AVX2_12xN ps, 32 + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, 
intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W24 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_512] + + pmulhrsw m4, m6 + pmulhrsw m2, m6 + + packuswb m4, m2 + + movu [r2], m4 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m2, [r5 + 2 * r1] + + punpcklbw m5, m7, m2 + punpckhbw m7, m2 + + pmaddubsw m5, m0 + pmaddubsw m7, m0 + + paddw m4, m5 + paddw m3, m7 + + pmulhrsw m4, m6 + pmulhrsw m3, m6 + + packuswb m4, m3 + + movu [r2 + r3], m4 + + movq m2, [r0 + 16] + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + + punpcklbw m2, m3 + punpcklbw m4, m5 + + pmaddubsw m2, m1 + pmaddubsw m4, m0 + + paddw m2, m4 + + pmulhrsw m2, m6 + + movq m3, [r0 + r1 + 16] + movq m4, [r5 + 16] + movq m5, [r5 + r1 + 16] + movq m7, [r5 + 2 * r1 + 16] + + punpcklbw m3, m4 + punpcklbw m5, m7 + + pmaddubsw m3, m1 + pmaddubsw m5, m0 + + paddw m3, m5 + + pmulhrsw m3, m6 + packuswb m2, m3 + + movh [r2 + 16], m2 + movhps [r2 + r3 + 16], m2 + + mov r0, r5 + lea r2, [r2 + 2 * r3] + + sub r4, 2 + jnz .loop + RET +%endmacro + + FILTER_V4_W24 24, 32 + + FILTER_V4_W24 24, 64 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;----------------------------------------------------------------------------- +%macro FILTER_V4_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mova m7, [pw_512] + + mov r4d, %2 + +.loop: + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m3, [r5] + movu m5, [r5 + r1] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2], m4 + + movu m2, [r0 + 16] + movu m3, [r0 + r1 + 16] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + movu m3, [r5 + 16] + movu m5, [r5 + r1 + 16] + + punpcklbw m6, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m6, m0 + pmaddubsw m3, m0 + + paddw m4, m6 + paddw m2, m3 + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2 + 16], m4 + + lea r0, [r0 + r1] + lea r2, [r2 + r3] + + dec r4 + jnz .loop + RET +%endmacro + + FILTER_V4_W32 32, 8 + FILTER_V4_W32 32, 16 + FILTER_V4_W32 32, 24 + FILTER_V4_W32 32, 32 + + FILTER_V4_W32 32, 48 + FILTER_V4_W32 32, 64 + +%macro FILTER_VER_CHROMA_AVX2_32xN 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + lea r5, [r3 * 3] + mov r6d, %2 / 4 +.loopW: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw 
m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], m2 +%endif + lea r0, [r0 + r1 * 4] + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 +%ifidn %1,pp + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 +%else + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 +%endif + + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 +%endif + + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 +%endif + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopW + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_32xN pp, 64 + 
FILTER_VER_CHROMA_AVX2_32xN pp, 48 + FILTER_VER_CHROMA_AVX2_32xN pp, 32 + FILTER_VER_CHROMA_AVX2_32xN pp, 24 + FILTER_VER_CHROMA_AVX2_32xN pp, 16 + FILTER_VER_CHROMA_AVX2_32xN pp, 8 + FILTER_VER_CHROMA_AVX2_32xN ps, 64 + FILTER_VER_CHROMA_AVX2_32xN ps, 48 + FILTER_VER_CHROMA_AVX2_32xN ps, 32 + FILTER_VER_CHROMA_AVX2_32xN ps, 24 + FILTER_VER_CHROMA_AVX2_32xN ps, 16 + FILTER_VER_CHROMA_AVX2_32xN ps, 8 + +%macro FILTER_VER_CHROMA_AVX2_48x64 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_48x64, 4, 8, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + lea r5, [r3 * 3] + lea r7, [r1 * 4] + mov r6d, 16 +.loopH: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], m2 +%endif + lea r0, [r0 + r1 * 4] + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 +%ifidn %1,pp + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 +%else + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 
m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 +%endif + + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 +%endif + + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 + add r2, 32 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 + add r2, 64 +%endif + sub r0, r7 + + movu xm0, [r0 + 32] ; m0 = row 0 + movu xm1, [r0 + r1 + 32] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, m10 + movu xm2, [r0 + r1 * 2 + 32] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, m10 + movu xm3, [r0 + r4 + 32] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, m11 + paddw m0, m4 + pmaddubsw m2, m10 + lea r0, [r0 + r1 * 4] + movu xm4, [r0 + 32] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, m11 + paddw m1, m5 + pmaddubsw m3, m10 + movu xm5, [r0 + r1 + 32] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m4, m11 + paddw m2, m4 + movu xm6, [r0 + r1 * 2 + 32] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m5, m11 + paddw m3, m5 +%ifidn %1,pp + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = 
word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm3 + lea r2, [r2 + r3 * 4 - 32] +%else + psubw m0, m12 ; m0 = word: row 0 + psubw m1, m12 ; m1 = word: row 1 + psubw m2, m12 ; m2 = word: row 2 + psubw m3, m12 ; m3 = word: row 3 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4 - 64] +%endif + dec r6d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_48x64 pp + FILTER_VER_CHROMA_AVX2_48x64 ps + +%macro FILTER_VER_CHROMA_AVX2_64xN 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_64x%2, 4, 8, 13 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + lea r5, [r3 * 3] + lea r7, [r1 * 4] + mov r6d, %2 / 4 +.loopH: +%assign x 0 +%rep 2 + movu m0, [r0 + x] ; m0 = row 0 + movu m1, [r0 + r1 + x] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2 + x] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4 + x] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], m2 +%endif + lea r0, [r0 + r1 * 
4] + movu m0, [r0 + x] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 +%ifidn %1,pp + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 +%else + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 +%endif + + movu m1, [r0 + r1 + x] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 +%endif + + movu m0, [r0 + r1 * 2 + x] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 + add r2, 32 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 + add r2, 64 +%endif + sub r0, r7 +%assign x x+32 +%endrep +%ifidn %1,pp + lea r2, [r2 + r3 * 4 - 64] +%else + lea r2, [r2 + r3 * 4 - 128] +%endif + add r0, r7 + dec r6d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_AVX2_64xN pp, 64 + FILTER_VER_CHROMA_AVX2_64xN pp, 48 + FILTER_VER_CHROMA_AVX2_64xN pp, 32 + FILTER_VER_CHROMA_AVX2_64xN pp, 16 + FILTER_VER_CHROMA_AVX2_64xN ps, 64 + FILTER_VER_CHROMA_AVX2_64xN ps, 48 + FILTER_VER_CHROMA_AVX2_64xN ps, 32 + FILTER_VER_CHROMA_AVX2_64xN ps, 16 + +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;----------------------------------------------------------------------------- +%macro FILTER_V4_W16n_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + + mov r4d, %2/2 + +.loop: + + mov r6d, %1/16 + +.loopW: + + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m6, [r5 + r1] + + punpckhbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m2, m7 + + punpcklbw m7, m5, m6 + pmaddubsw m7, m0 + paddw m4, m7 + + mova m7, [pw_512] + + pmulhrsw m4, m7 + pmulhrsw m2, m7 + + packuswb m4, m2 + + movu [r2], m4 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m6, m5 + punpckhbw m6, m5 + + pmaddubsw m2, m0 + pmaddubsw m6, m0 + + paddw m4, m2 + paddw m3, m6 + + pmulhrsw m4, m7 + pmulhrsw m3, m7 + + packuswb m4, m3 + + movu [r2 + r3], m4 + + add r0, 16 + add r2, 16 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V4_W16n_H2 64, 64 + FILTER_V4_W16n_H2 64, 32 + FILTER_V4_W16n_H2 64, 48 + FILTER_V4_W16n_H2 48, 64 + FILTER_V4_W16n_H2 64, 16 + +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + 
paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 16] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 16] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, [v4_pd_526336] + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP 4, 4 + FILTER_VER_CHROMA_SP 4, 8 + FILTER_VER_CHROMA_SP 16, 16 + FILTER_VER_CHROMA_SP 16, 8 + FILTER_VER_CHROMA_SP 16, 12 + FILTER_VER_CHROMA_SP 12, 16 + FILTER_VER_CHROMA_SP 16, 4 + FILTER_VER_CHROMA_SP 4, 16 + FILTER_VER_CHROMA_SP 32, 32 + FILTER_VER_CHROMA_SP 32, 16 + FILTER_VER_CHROMA_SP 16, 32 + FILTER_VER_CHROMA_SP 32, 24 + FILTER_VER_CHROMA_SP 24, 32 + FILTER_VER_CHROMA_SP 32, 8 + + FILTER_VER_CHROMA_SP 16, 24 + FILTER_VER_CHROMA_SP 16, 64 + FILTER_VER_CHROMA_SP 12, 32 + 
FILTER_VER_CHROMA_SP 4, 32 + FILTER_VER_CHROMA_SP 32, 64 + FILTER_VER_CHROMA_SP 32, 48 + FILTER_VER_CHROMA_SP 24, 64 + + FILTER_VER_CHROMA_SP 64, 64 + FILTER_VER_CHROMA_SP 64, 32 + FILTER_VER_CHROMA_SP 64, 48 + FILTER_VER_CHROMA_SP 48, 64 + FILTER_VER_CHROMA_SP 64, 16 + + +%macro PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m5, [v4_pd_526336] + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + paddd m0, m5 + paddd m2, m5 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 
+ + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP_W2_4R 2, 4 + FILTER_VER_CHROMA_SP_W2_4R 2, 8 + + FILTER_VER_CHROMA_SP_W2_4R 2, 16 + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m4, [v4_pd_526336] + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + paddd m0, m4 + psrad m0, 12 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1 = [1+2+3+4] Row2 done + paddd m1, m4 + psrad m1, 12 + + packssdw m0, m1 + packuswb m0, m0 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, 
[v4_pd_526336] + + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + paddd m0, m6 + paddd m2, m6 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 4] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP_W6_H4 6, 8 + + FILTER_VER_CHROMA_SP_W6_H4 6, 16 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m7, [v4_pd_526336] + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP_W8_H2 8, 2 + FILTER_VER_CHROMA_SP_W8_H2 8, 4 + FILTER_VER_CHROMA_SP_W8_H2 8, 6 + FILTER_VER_CHROMA_SP_W8_H2 8, 8 + FILTER_VER_CHROMA_SP_W8_H2 8, 16 + FILTER_VER_CHROMA_SP_W8_H2 8, 32 + + FILTER_VER_CHROMA_SP_W8_H2 8, 12 + FILTER_VER_CHROMA_SP_W8_H2 8, 64 + + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16n 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + + mov r6d, %1/16 + +.loopW: + + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + 
pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + add r0, 16 + add r2, 32 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1 * 2] + + dec r4d + jnz .loop + RET +%endmacro + + FILTER_V_PS_W16n 64, 64 + FILTER_V_PS_W16n 64, 32 + FILTER_V_PS_W16n 64, 48 + FILTER_V_PS_W16n 48, 64 + FILTER_V_PS_W16n 64, 16 + + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + lea r5, [3 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 + + pmaddubsw m3, m0 + phaddw m2, m3 + + mova m1, [pw_2000] + + psubw m2, m1 + + movd [r2], m2 + pextrd [r2 + r3], m2, 2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + phaddw m4, m5 + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd 
[r2], m4 + pextrd [r2 + r3], m4, 2 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + psubw m2, m1 + + + movd [r2], m2 + pshufd m2, m2, 2 + movd [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pshufd m4 , m4 ,2 + movd [r2 + r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + + RET +%endmacro + + FILTER_V_PS_W2 2, 8 + + FILTER_V_PS_W2 2, 16 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 
,0-gprsize + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS 4, 4 + FILTER_VER_CHROMA_SS 4, 8 + FILTER_VER_CHROMA_SS 16, 16 + FILTER_VER_CHROMA_SS 16, 8 + FILTER_VER_CHROMA_SS 16, 12 + FILTER_VER_CHROMA_SS 12, 16 + FILTER_VER_CHROMA_SS 16, 4 + FILTER_VER_CHROMA_SS 4, 16 + FILTER_VER_CHROMA_SS 32, 32 + FILTER_VER_CHROMA_SS 32, 16 + FILTER_VER_CHROMA_SS 16, 32 + FILTER_VER_CHROMA_SS 32, 24 + FILTER_VER_CHROMA_SS 24, 32 + FILTER_VER_CHROMA_SS 32, 8 + + FILTER_VER_CHROMA_SS 16, 24 + FILTER_VER_CHROMA_SS 12, 32 + FILTER_VER_CHROMA_SS 4, 32 + FILTER_VER_CHROMA_SS 32, 64 + FILTER_VER_CHROMA_SS 16, 64 + FILTER_VER_CHROMA_SS 32, 48 + FILTER_VER_CHROMA_SS 24, 64 + + FILTER_VER_CHROMA_SS 64, 64 + FILTER_VER_CHROMA_SS 64, 32 + FILTER_VER_CHROMA_SS 64, 48 + FILTER_VER_CHROMA_SS 48, 64 + FILTER_VER_CHROMA_SS 64, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m6, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] 
+ punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + +%ifidn %1,sp + paddd m0, m6 + paddd m2, m6 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 +%else + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x4 sp + FILTER_VER_CHROMA_S_AVX2_4x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + 
pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + lea r4, [r3 * 3] + +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + paddd m4, m7 + paddd m1, m7 + psrad m0, 12 + psrad m2, 12 + psrad m4, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m2, 6 + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m0, m2 + packssdw m4, m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r4], xm2, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r4], xm1 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x8 sp + FILTER_VER_CHROMA_S_AVX2_4x8 ss + +%macro PROCESS_CHROMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 
+ pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] + +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m0, m2 + packssdw m4, m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm4, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm4 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm4, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r6], xm4, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 +%endif + + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + +%ifidn %1,sp + paddd m6, m7 + paddd m5, m7 + paddd m0, m7 + paddd m3, m7 + psrad m6, 12 + psrad m5, 12 + psrad m0, 12 + 
psrad m3, 12 +%else + psrad m6, 6 + psrad m5, 6 + psrad m0, 6 + psrad m3, 6 +%endif + packssdw m6, m5 + packssdw m0, m3 + lea r2, [r2 + r3 * 4] + +%ifidn %1,sp + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movd [r2], xm6 + movd [r2 + r3], xm0 + pextrd [r2 + r3 * 2], xm6, 1 + pextrd [r2 + r6], xm0, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm6, 2 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm6, 3 + pextrd [r2 + r6], xm0, 3 +%else + vextracti128 xm5, m6, 1 + vextracti128 xm3, m0, 1 + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_4x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + PROCESS_CHROMA_AVX2_W4_16R %1 + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x16 sp + FILTER_VER_CHROMA_S_AVX2_4x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x32 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x32, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep 2 + PROCESS_CHROMA_AVX2_W4_16R %1 + lea r2, [r2 + r3 * 4] +%endrep + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x32 sp + FILTER_VER_CHROMA_S_AVX2_4x32 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + 
lea r4, [r1 * 3] +%ifidn %1,sp + mova m5, [v4_pd_526336] +%else + add r3d, r3d +%endif + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + movq xm4, [r0 + 4 * r1] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 +%ifidn %1,sp + paddd m0, m5 + psrad m0, 12 +%else + psrad m0, 6 +%endif + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 +%ifidn %1,sp + packuswb xm0, xm0 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_4x2 sp + FILTER_VER_CHROMA_S_AVX2_4x2 ss + +%macro FILTER_VER_CHROMA_S_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m5, [v4_pd_526336] +%else + add r3d, r3d +%endif + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 +%ifidn %1,sp + paddd m0, m5 + psrad m0, 12 +%else + psrad m0, 6 +%endif + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb xm0, xm0 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 2 + pextrw [r2 + r4], xm0, 
3 +%else + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + 2 * r3], xm0, 2 + pextrd [r2 + r4], xm0, 3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_2x4 sp + FILTER_VER_CHROMA_S_AVX2_2x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, 
[r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 + + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm1 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x8 sp + FILTER_VER_CHROMA_S_AVX2_8x8 ss + +%macro PROCESS_CHROMA_S_AVX2_W8_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = 
row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] +%ifidn %1,sp + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + paddd m3, m9 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm0, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm0, 1 + pmaddwd m0, m7, [r5 + 1 * mmsize] + paddd m5, m0 + pmaddwd m7, [r5] + movu xm0, [r7 + r1] ; m0 = row 9 + punpckhwd xm1, xm8, xm0 + punpcklwd xm8, xm0 + vinserti128 m8, m8, xm1, 1 + 
pmaddwd m1, m8, [r5 + 1 * mmsize] + paddd m6, m1 + pmaddwd m8, [r5] + movu xm1, [r7 + r1 * 2] ; m1 = row 10 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m7, m2 + pmaddwd m0, [r5] +%ifidn %1,sp + paddd m4, m9 + paddd m5, m9 + psrad m4, 12 + psrad m5, 12 + paddd m6, m9 + paddd m7, m9 + psrad m6, 12 + psrad m7, 12 +%else + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m4, m5 + packssdw m6, m7 + lea r8, [r2 + r3 * 4] +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif + + movu xm2, [r7 + r4] ; m2 = row 11 + punpckhwd xm4, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm4, 1 + pmaddwd m4, m1, [r5 + 1 * mmsize] + paddd m8, m4 + pmaddwd m1, [r5] + lea r7, [r7 + r1 * 4] + movu xm4, [r7] ; m4 = row 12 + punpckhwd xm5, xm2, xm4 + punpcklwd xm2, xm4 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m0, m5 + pmaddwd m2, [r5] + movu xm5, [r7 + r1] ; m5 = row 13 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 14 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m2, m7 + pmaddwd m5, [r5] +%ifidn %1,sp + paddd m8, m9 + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + psrad m8, 12 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 +%else + psrad m8, 6 + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 +%endif + packssdw m8, m0 + packssdw m1, m2 + lea r8, [r8 + r3 * 4] +%ifidn %1,sp + packuswb m8, m1 + vpermd m8, m3, m8 + vextracti128 xm1, m8, 1 + movq 
[r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm1 +%else + vpermq m8, m8, 11011000b + vpermq m1, m1, 11011000b + vextracti128 xm0, m8, 1 + vextracti128 xm2, m1, 1 + movu [r8], xm8 + movu [r8 + r3], xm0 + movu [r8 + r3 * 2], xm1 + movu [r8 + r6], xm2 +%endif + lea r8, [r8 + r3 * 4] + + movu xm7, [r7 + r4] ; m7 = row 15 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 1 * mmsize] + paddd m4, m2 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm1, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm1, 1 + pmaddwd m1, m7, [r5 + 1 * mmsize] + paddd m5, m1 + pmaddwd m7, [r5] + movu xm1, [r7 + r1] ; m1 = row 17 + punpckhwd xm0, xm2, xm1 + punpcklwd xm2, xm1 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m6, m2 + movu xm0, [r7 + r1 * 2] ; m0 = row 18 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m7, m1 + +%ifidn %1,sp + paddd m4, m9 + paddd m5, m9 + paddd m6, m9 + paddd m7, m9 + psrad m4, 12 + psrad m5, 12 + psrad m6, 12 + psrad m7, 12 +%else + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m4, m5 + packssdw m6, m7 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [v4_pd_526336] 
+%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64 + +%macro FILTER_VER_CHROMA_S_AVX2_NxN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %3,sp + mova m9, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %3 +%ifidn %3,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r10d + jnz .loopW + lea r0, [r7 - 2 * %1 + 16] +%ifidn %3,sp + lea r2, [r8 + r3 * 4 - %1 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] +%endif + dec r9d + jnz .loopH + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, sp + FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, ss + FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp + FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp + FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp + FILTER_VER_CHROMA_S_AVX2_NxN 
64, 64, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss + FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss + FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss + +%macro PROCESS_CHROMA_S_AVX2_W8_4R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d 
+%endif + + PROCESS_CHROMA_S_AVX2_W8_4R %1 + lea r4, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x4 sp + FILTER_VER_CHROMA_S_AVX2_8x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + mova m7, m9 + PROCESS_CHROMA_AVX2_W4_16R %1 + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_12x16 sp + FILTER_VER_CHROMA_S_AVX2_12x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_12x32 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_12x32, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1, sp + mova m9, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep 2 + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1, sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + mova m7, m9 + PROCESS_CHROMA_AVX2_W4_16R %1 + sub r0, 16 +%ifidn %1, sp + lea r2, [r2 + r3 * 4 - 8] +%else + lea r2, [r2 + r3 * 4 - 16] +%endif +%endrep + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_12x32 sp + FILTER_VER_CHROMA_S_AVX2_12x32 ss + +%macro FILTER_VER_CHROMA_S_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, 
[pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m8, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep 2 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r8, [r2 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + 
vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r7 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm1 +%endif + lea r8, [r8 + r3 * 4] + + movu xm7, [r7 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r7, [r7 + r1 * 4] + movu xm1, [r7] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r7 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * 
mmsize] + paddd m5, m1 + movu xm2, [r7 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m7, m4 +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r8], xm0 + movhps [r8 + r3], xm0 + movq [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm5 + add r2, 8 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r8], xm0 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm5 + movu [r8 + r6], xm6 + add r2, 16 +%endif + add r0, 16 +%endrep + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_16x12 sp + FILTER_VER_CHROMA_S_AVX2_16x12 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8x12 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m8, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad 
m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw 
m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm1 +%endif + lea r2, [r2 + r3 * 4] + + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 + movu xm2, [r0 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m7, m4 +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r2], xm0 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm5 + movu [r2 + r6], xm6 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x12 sp + FILTER_VER_CHROMA_S_AVX2_8x12 ss + +%macro FILTER_VER_CHROMA_S_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, 
r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif +%rep 2 + PROCESS_CHROMA_S_AVX2_W8_4R %1 + lea r6, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + add r2, 8 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + add r2, 16 +%endif + lea r6, [4 * r1 - 16] + sub r0, r6 +%endrep + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_16x4 sp + FILTER_VER_CHROMA_S_AVX2_16x4 ss + +%macro PROCESS_CHROMA_S_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn 
%1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r8, [r2 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r7 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm1 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + 
add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep %2 / 8 + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 +%endrep + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m5, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r0 + r1 * 4] ; m4 = row 4 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m1, m3 +%ifidn %1,sp + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 +%ifidn %1,sp + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 +%else + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x2 sp + FILTER_VER_CHROMA_S_AVX2_8x2 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 
6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m4, m6 + movu xm6, [r0 + r1 * 4] ; m6 = row 8 + punpckhwd xm3, xm1, xm6 + punpcklwd xm1, xm6 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 
6 +%endif + packssdw m4, m5 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8x6 sp + FILTER_VER_CHROMA_S_AVX2_8x6 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m8, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + 
packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [v4_interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + 
packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm1 +%endif + lea r2, [r2 + r3 * 4] + + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m1, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m7, m6 + pmaddwd m4, [r5] +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r2], xm0 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm5 + movu [r2 + r6], xm6 +%endif + lea r2, [r2 + r3 * 4] + + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m2, [r5] + 
lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 16 + punpckhwd xm5, xm6, xm0 + punpcklwd xm6, xm0 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m6, [r5] +%ifidn %1,sp + paddd m1, m8 + paddd m4, m8 + psrad m1, 12 + psrad m4, 12 +%else + psrad m1, 6 + psrad m4, 6 +%endif + packssdw m1, m4 + + movu xm5, [r0 + r1] ; m5 = row 17 + punpckhwd xm4, xm0, xm5 + punpcklwd xm0, xm5 + vinserti128 m0, m0, xm4, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m2, m0 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm0, xm5, xm4 + punpcklwd xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m6, m5 +%ifidn %1,sp + paddd m2, m8 + paddd m6, m8 + psrad m2, 12 + psrad m6, 12 +%else + psrad m2, 6 + psrad m6, 6 +%endif + packssdw m2, m6 +%ifidn %1,sp + packuswb m1, m2 + vpermd m1, m3, m1 + vextracti128 xm2, m1, 1 + movq [r2], xm1 + movhps [r2 + r3], xm1 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m1, m1, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm6, m1, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm1 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm4 +%endif + lea r2, [r2 + r3 * 4] +%endrep + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 + FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 + FILTER_VER_CHROMA_S_AVX2_8xN sp, 64 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 + FILTER_VER_CHROMA_S_AVX2_8xN ss, 64 + +%macro FILTER_VER_CHROMA_S_AVX2_Nx24 2 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz 
.loopW +%ifidn %1,sp + lea r2, [r8 + r3 * 4 - %2 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %2 + 16] +%endif + lea r0, [r7 - 2 * %2 + 16] + mova m7, m9 + mov r9d, %2 / 8 +.loop: + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loop + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32 + FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16 + FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32 + FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_2x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m6, [v4_pd_526336] +%else + add r3d, r3d +%endif + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movd xm1, [r0 + r4] + punpcklwd xm3, xm1 + lea r0, [r0 + 4 * r1] + movd xm2, [r0] + punpcklwd xm1, xm2 + punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] + vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] + movd xm1, [r0 + r1] + punpcklwd xm2, xm1 + movd xm5, [r0 + r1 * 2] + punpcklwd xm1, xm5 + punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] + vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] + pmaddwd m4, [r5] + pmaddwd m3, [r5 + 1 * mmsize] + paddd m4, m3 +%ifidn %1,sp + paddd m0, m6 + paddd m4, m6 + psrad m0, 12 + psrad m4, 12 +%else + psrad m0, 6 + psrad m4, 6 +%endif + packssdw 
m0, m4 + vextracti128 xm4, m0, 1 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb xm0, xm4 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 4 + pextrw [r2 + r4], xm0, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 2 + pextrw [r2 + r3], xm0, 3 + pextrw [r2 + 2 * r3], xm0, 6 + pextrw [r2 + r4], xm0, 7 +%else + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + 2 * r3], xm4 + pextrd [r2 + r4], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + 2 * r3], xm4, 2 + pextrd [r2 + r4], xm4, 3 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_2x8 sp + FILTER_VER_CHROMA_S_AVX2_2x8 ss + +%macro FILTER_VER_CHROMA_S_AVX2_2x16 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m6, [v4_pd_526336] +%else + add r3d, r3d +%endif + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movd xm1, [r0 + r4] + punpcklwd xm3, xm1 + lea r0, [r0 + 4 * r1] + movd xm2, [r0] + punpcklwd xm1, xm2 + punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] + vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] + movd xm1, [r0 + r1] + punpcklwd xm2, xm1 + movd xm5, [r0 + r1 * 2] + punpcklwd xm1, xm5 + punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] + vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 
6] + pmaddwd m4, [r5] + pmaddwd m3, [r5 + 1 * mmsize] + paddd m4, m3 + movd xm1, [r0 + r4] + punpcklwd xm5, xm1 + lea r0, [r0 + 4 * r1] + movd xm3, [r0] + punpcklwd xm1, xm3 + punpcklqdq xm5, xm1 ; m5 = [12 11 11 10] + vinserti128 m2, m2, xm5, 1 ; m2 = [12 11 11 10 10 9 9 8] + movd xm1, [r0 + r1] + punpcklwd xm3, xm1 + movd xm7, [r0 + r1 * 2] + punpcklwd xm1, xm7 + punpcklqdq xm3, xm1 ; m3 = [14 13 13 12] + vinserti128 m5, m5, xm3, 1 ; m5 = [14 13 13 12 12 11 11 10] + pmaddwd m2, [r5] + pmaddwd m5, [r5 + 1 * mmsize] + paddd m2, m5 + movd xm5, [r0 + r4] + punpcklwd xm7, xm5 + lea r0, [r0 + 4 * r1] + movd xm1, [r0] + punpcklwd xm5, xm1 + punpcklqdq xm7, xm5 ; m7 = [16 15 15 14] + vinserti128 m3, m3, xm7, 1 ; m3 = [16 15 15 14 14 13 13 12] + movd xm5, [r0 + r1] + punpcklwd xm1, xm5 + movd xm8, [r0 + r1 * 2] + punpcklwd xm5, xm8 + punpcklqdq xm1, xm5 ; m1 = [18 17 17 16] + vinserti128 m7, m7, xm1, 1 ; m7 = [18 17 17 16 16 15 15 14] + pmaddwd m3, [r5] + pmaddwd m7, [r5 + 1 * mmsize] + paddd m3, m7 +%ifidn %1,sp + paddd m0, m6 + paddd m4, m6 + paddd m2, m6 + paddd m3, m6 + psrad m0, 12 + psrad m4, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m4, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m4 + packssdw m2, m3 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + vextracti128 xm2, m0, 1 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm2, 0 + pextrw [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 2 + pextrw [r2 + r3], xm0, 3 + pextrw [r2 + 2 * r3], xm2, 2 + pextrw [r2 + r4], xm2, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + 2 * r3], xm2, 4 + pextrw [r2 + r4], xm2, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 6 + pextrw [r2 + r3], xm0, 7 + pextrw [r2 + 2 * r3], xm2, 6 + pextrw [r2 + r4], xm2, 7 +%else + vextracti128 xm4, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + 2 * r3], xm4 + pextrd [r2 + r4], xm4, 1 + lea r2, [r2 + 
r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + 2 * r3], xm4, 2 + pextrd [r2 + r4], xm4, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm2 + pextrd [r2 + r3], xm2, 1 + movd [r2 + 2 * r3], xm3 + pextrd [r2 + r4], xm3, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm2, 2 + pextrd [r2 + r3], xm2, 3 + pextrd [r2 + 2 * r3], xm3, 2 + pextrd [r2 + r4], xm3, 3 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_2x16 sp + FILTER_VER_CHROMA_S_AVX2_2x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_6x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [v4_pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 
+%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 + + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r4], xm0, 3 + pextrw [r2 + r4 + 4], xm2, 6 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm3 + pextrd [r2 + r4 + 8], xm3, 2 +%endif + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vextracti128 xm6, m4, 1 + movd [r2], xm4 + pextrw [r2 + 4], xm6, 0 + pextrd [r2 + r3], xm4, 1 + pextrw [r2 + r3 + 4], xm6, 2 + pextrd [r2 + r3 * 2], xm4, 2 + pextrw [r2 + r3 * 2 + 4], xm6, 4 + pextrd [r2 + r4], xm4, 3 + pextrw [r2 + r4 + 4], xm6, 6 
+%else + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movd [r2 + 8], xm5 + pextrd [r2 + r3 + 8], xm5, 2 + movd [r2 + r3 * 2 + 8], xm1 + pextrd [r2 + r4 + 8], xm1, 2 +%endif + RET +%endmacro + + FILTER_VER_CHROMA_S_AVX2_6x8 sp + FILTER_VER_CHROMA_S_AVX2_6x8 ss + +%macro FILTER_VER_CHROMA_S_AVX2_6x16 1 +%if ARCH_X86_64 == 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m8, [v4_pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 
12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r6], xm0, 3 + pextrw [r2 + r6 + 4], xm2, 6 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm3 + pextrd [r2 + r6 + 8], xm3, 2 +%endif + lea r2, [r2 + r3 * 4] + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vextracti128 xm6, m4, 1 + movd [r2], xm4 + pextrw [r2 + 4], xm6, 0 + pextrd [r2 + r3], xm4, 1 + pextrw [r2 + r3 + 4], xm6, 2 + pextrd [r2 + r3 * 2], xm4, 2 + pextrw [r2 + r3 * 2 + 4], xm6, 4 + pextrd [r2 + r6], xm4, 3 + pextrw [r2 + r6 + 4], xm6, 6 +%else + movq [r2], xm4 + movhps [r2 + r3], xm4 + 
movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 + vextracti128 xm4, m4, 1 + vextracti128 xm1, m6, 1 + movd [r2 + 8], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + movd [r2 + r3 * 2 + 8], xm1 + pextrd [r2 + r6 + 8], xm1, 2 +%endif + lea r2, [r2 + r3 * 4] + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m1, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m7, m6 + pmaddwd m4, [r5] +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vextracti128 xm5, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm5, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm5, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm5, 4 + pextrd [r2 + r6], xm0, 3 + pextrw [r2 + r6 + 4], xm5, 6 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 + vextracti128 xm0, m0, 1 + vextracti128 xm7, m5, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm7 + pextrd [r2 + r6 + 8], xm7, 2 +%endif + lea r2, [r2 + r3 * 4] + + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + 
paddd m1, m5 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 16 + punpckhwd xm5, xm6, xm0 + punpcklwd xm6, xm0 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m6, [r5] +%ifidn %1,sp + paddd m1, m8 + paddd m4, m8 + psrad m1, 12 + psrad m4, 12 +%else + psrad m1, 6 + psrad m4, 6 +%endif + packssdw m1, m4 + + movu xm5, [r0 + r1] ; m5 = row 17 + punpckhwd xm4, xm0, xm5 + punpcklwd xm0, xm5 + vinserti128 m0, m0, xm4, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m2, m0 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm0, xm5, xm4 + punpcklwd xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m6, m5 +%ifidn %1,sp + paddd m2, m8 + paddd m6, m8 + psrad m2, 12 + psrad m6, 12 +%else + psrad m2, 6 + psrad m6, 6 +%endif + packssdw m2, m6 +%ifidn %1,sp + packuswb m1, m2 + vextracti128 xm2, m1, 1 + movd [r2], xm1 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm1, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r6], xm1, 3 + pextrw [r2 + r6 + 4], xm2, 6 +%else + movq [r2], xm1 + movhps [r2 + r3], xm1 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m2, 1 + movd [r2 + 8], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + movd [r2 + r3 * 2 + 8], xm6 + pextrd [r2 + r6 + 8], xm6, 2 +%endif + RET +%endif +%endmacro + + FILTER_VER_CHROMA_S_AVX2_6x16 sp + FILTER_VER_CHROMA_S_AVX2_6x16 ss + +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + 
+%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS_W2_4R 2, 4 + FILTER_VER_CHROMA_SS_W2_4R 2, 8 + + FILTER_VER_CHROMA_SS_W2_4R 2, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + psrad m0, 6 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + psrad m1, 6 + + packssdw m0, m1 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) 
+;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS_W6_H4 6, 8 + + FILTER_VER_CHROMA_SS_W6_H4 6, 16 + + +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movu [r2], m0 + movu [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS_W8_H2 8, 2 + 
FILTER_VER_CHROMA_SS_W8_H2 8, 4 + FILTER_VER_CHROMA_SS_W8_H2 8, 6 + FILTER_VER_CHROMA_SS_W8_H2 8, 8 + FILTER_VER_CHROMA_SS_W8_H2 8, 16 + FILTER_VER_CHROMA_SS_W8_H2 8, 32 + + FILTER_VER_CHROMA_SS_W8_H2 8, 12 + FILTER_VER_CHROMA_SS_W8_H2 8, 64 + From 1a6e8f9c054ee34f4cd5e98db7fa9055a710fd66 Mon Sep 17 00:00:00 2001 From: Mythreyi P Date: Thu, 15 Feb 2018 00:56:16 -0800 Subject: [PATCH 48/51] x86: Split ipfilter16 kernels into two different source files. Port horizontal kernels from ipfilter16.asm to a new source file, h-ipfilter16.asm to improve build time. --- source/common/CMakeLists.txt | 2 +- source/common/x86/h-ipfilter16.asm | 5000 +++++++++++++++++++++++++++ source/common/x86/ipfilter16.asm | 5104 +--------------------------- 3 files changed, 5090 insertions(+), 5016 deletions(-) create mode 100644 source/common/x86/h-ipfilter16.asm diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 8261fac0c3..079b64982c 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -61,7 +61,7 @@ if(ENABLE_ASSEMBLY AND X86) mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) - set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm) + set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm) else() set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm v4-ipfilter8.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() diff --git a/source/common/x86/h-ipfilter16.asm b/source/common/x86/h-ipfilter16.asm new file mode 100644 index 0000000000..92c329b036 --- /dev/null +++ b/source/common/x86/h-ipfilter16.asm @@ -0,0 +1,5000 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Nabajit Deka +;* Murugan Vairavel +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the
terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ +%include "x86inc.asm" +%include "x86util.asm" + + +%define INTERP_OFFSET_PP pd_32 +%define INTERP_SHIFT_PP 6 + +%if BIT_DEPTH == 10 + %define INTERP_SHIFT_PS 2 + %define INTERP_OFFSET_PS pd_n32768 + %define INTERP_SHIFT_SP 10 + %define INTERP_OFFSET_SP h_pd_524800 +%elif BIT_DEPTH == 12 + %define INTERP_SHIFT_PS 4 + %define INTERP_OFFSET_PS pd_n131072 + %define INTERP_SHIFT_SP 8 + %define INTERP_OFFSET_SP pd_524416 +%else + %error Unsupported bit depth!
+%endif + +SECTION_RODATA 32 + +tab_c_32: times 8 dd 32 +h_pd_524800: times 8 dd 524800 + +tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + + +tab_ChromaCoeff: dw 0, 64, 0, 0 + dw -2, 58, 10, -2 + dw -4, 54, 16, -2 + dw -6, 46, 28, -4 + dw -4, 36, 36, -4 + dw -4, 28, 46, -6 + dw -2, 16, 54, -4 + dw -2, 10, 58, -2 + +tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0 + dw -1, 4, -10, 58, 17, -5, 1, 0 + dw -1, 4, -11, 40, 40, -11, 4, -1 + dw 0, 1, -5, 17, 58, -10, 4, -1 + +ALIGN 32 +h_tab_LumaCoeffV: times 4 dw 0, 0 + times 4 dw 0, 64 + times 4 dw 0, 0 + times 4 dw 0, 0 + + times 4 dw -1, 4 + times 4 dw -10, 58 + times 4 dw 17, -5 + times 4 dw 1, 0 + + times 4 dw -1, 4 + times 4 dw -11, 40 + times 4 dw 40, -11 + times 4 dw 4, -1 + + times 4 dw 0, 1 + times 4 dw -5, 17 + times 4 dw 58, -10 + times 4 dw 4, -1 + +const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 + +const interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + +const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 + db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 + +SECTION .text +cextern pd_8 +cextern pd_32 +cextern pw_pixel_max +cextern pd_524416 +cextern pd_n32768 +cextern pd_n131072 +cextern pw_2000 +cextern idct8_shuf2 + +%macro FILTER_LUMA_HOR_4_sse2 1 + movu m4, [r0 + %1] ; m4 = src[0-7] + movu m5, [r0 + %1 + 2] ; m5 = src[1-8] + pmaddwd m4, m0 + pmaddwd m5, m0 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m4, m4, q3120 + pshufd m5, m5, q3120 + punpcklqdq m4, m5 + + movu m5, [r0 + %1 + 4] ; m5 = src[2-9] + movu m3, [r0 + %1 + 6] ; m3 = src[3-10] + pmaddwd m5, m0 + pmaddwd m3, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m5, m5, q3120 + pshufd m3, m3, q3120 + punpcklqdq m5, m3 + + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m4, m4, q3120 + pshufd m5, m5, q3120 + punpcklqdq 
m4, m5 + paddd m4, m1 +%endmacro + +%macro FILTER_LUMA_HOR_8_sse2 1 + movu m4, [r0 + %1] ; m4 = src[0-7] + movu m5, [r0 + %1 + 2] ; m5 = src[1-8] + pmaddwd m4, m0 + pmaddwd m5, m0 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m4, m4, q3120 + pshufd m5, m5, q3120 + punpcklqdq m4, m5 + + movu m5, [r0 + %1 + 4] ; m5 = src[2-9] + movu m3, [r0 + %1 + 6] ; m3 = src[3-10] + pmaddwd m5, m0 + pmaddwd m3, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m5, m5, q3120 + pshufd m3, m3, q3120 + punpcklqdq m5, m3 + + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m4, m4, q3120 + pshufd m5, m5, q3120 + punpcklqdq m4, m5 + paddd m4, m1 + + movu m5, [r0 + %1 + 8] ; m5 = src[4-11] + movu m6, [r0 + %1 + 10] ; m6 = src[5-12] + pmaddwd m5, m0 + pmaddwd m6, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m6, q2301 + paddd m6, m2 + pshufd m5, m5, q3120 + pshufd m6, m6, q3120 + punpcklqdq m5, m6 + + movu m6, [r0 + %1 + 12] ; m6 = src[6-13] + movu m3, [r0 + %1 + 14] ; m3 = src[7-14] + pmaddwd m6, m0 + pmaddwd m3, m0 + pshufd m2, m6, q2301 + paddd m6, m2 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m6, m6, q3120 + pshufd m3, m3, q3120 + punpcklqdq m6, m3 + + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m6, q2301 + paddd m6, m2 + pshufd m5, m5, q3120 + pshufd m6, m6, q3120 + punpcklqdq m5, m6 + paddd m5, m1 +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_p%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_HOR_LUMA_sse2 3 +INIT_XMM sse2 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] 
+%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] + pxor m7, m7 +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: +%assign x 0 +%rep %1/8 + FILTER_LUMA_HOR_8_sse2 x + +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + CLIPW m4, m7, [pw_pixel_max] +%else + %if BIT_DEPTH == 10 + psrad m4, 2 + psrad m5, 2 + %elif BIT_DEPTH == 12 + psrad m4, 4 + psrad m5, 4 + %endif + packssdw m4, m5 +%endif + + movu [r2 + x], m4 +%assign x x+16 +%endrep + +%rep (%1 % 8)/4 + FILTER_LUMA_HOR_4_sse2 x + +%ifidn %3, pp + psrad m4, 6 + packssdw m4, m4 + CLIPW m4, m7, [pw_pixel_max] +%else + %if BIT_DEPTH == 10 + psrad m4, 2 + %elif BIT_DEPTH == 12 + psrad m4, 4 + %endif + packssdw m4, m4 +%endif + + movh [r2 + x], m4 +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET + +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------ + FILTER_HOR_LUMA_sse2 4, 4, pp + FILTER_HOR_LUMA_sse2 4, 8, pp + FILTER_HOR_LUMA_sse2 4, 16, pp + FILTER_HOR_LUMA_sse2 8, 4, pp + FILTER_HOR_LUMA_sse2 8, 8, pp + FILTER_HOR_LUMA_sse2 8, 16, pp + FILTER_HOR_LUMA_sse2 8, 32, pp + FILTER_HOR_LUMA_sse2 12, 16, pp + FILTER_HOR_LUMA_sse2 16, 4, pp + FILTER_HOR_LUMA_sse2 16, 8, pp + FILTER_HOR_LUMA_sse2 16, 12, pp + FILTER_HOR_LUMA_sse2 16, 16, pp + FILTER_HOR_LUMA_sse2 16, 32, pp + FILTER_HOR_LUMA_sse2 16, 64, pp + FILTER_HOR_LUMA_sse2 24, 32, pp + FILTER_HOR_LUMA_sse2 32, 8, pp + FILTER_HOR_LUMA_sse2 32, 16, pp + FILTER_HOR_LUMA_sse2 32, 24, pp + FILTER_HOR_LUMA_sse2 32, 32, pp + FILTER_HOR_LUMA_sse2 32, 64, pp + FILTER_HOR_LUMA_sse2 48, 64, pp + 
FILTER_HOR_LUMA_sse2 64, 16, pp + FILTER_HOR_LUMA_sse2 64, 32, pp + FILTER_HOR_LUMA_sse2 64, 48, pp + FILTER_HOR_LUMA_sse2 64, 64, pp + +;--------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;--------------------------------------------------------------------------------------------------------------------------- + FILTER_HOR_LUMA_sse2 4, 4, ps + FILTER_HOR_LUMA_sse2 4, 8, ps + FILTER_HOR_LUMA_sse2 4, 16, ps + FILTER_HOR_LUMA_sse2 8, 4, ps + FILTER_HOR_LUMA_sse2 8, 8, ps + FILTER_HOR_LUMA_sse2 8, 16, ps + FILTER_HOR_LUMA_sse2 8, 32, ps + FILTER_HOR_LUMA_sse2 12, 16, ps + FILTER_HOR_LUMA_sse2 16, 4, ps + FILTER_HOR_LUMA_sse2 16, 8, ps + FILTER_HOR_LUMA_sse2 16, 12, ps + FILTER_HOR_LUMA_sse2 16, 16, ps + FILTER_HOR_LUMA_sse2 16, 32, ps + FILTER_HOR_LUMA_sse2 16, 64, ps + FILTER_HOR_LUMA_sse2 24, 32, ps + FILTER_HOR_LUMA_sse2 32, 8, ps + FILTER_HOR_LUMA_sse2 32, 16, ps + FILTER_HOR_LUMA_sse2 32, 24, ps + FILTER_HOR_LUMA_sse2 32, 32, ps + FILTER_HOR_LUMA_sse2 32, 64, ps + FILTER_HOR_LUMA_sse2 48, 64, ps + FILTER_HOR_LUMA_sse2 64, 16, ps + FILTER_HOR_LUMA_sse2 64, 32, ps + FILTER_HOR_LUMA_sse2 64, 48, ps + FILTER_HOR_LUMA_sse2 64, 64, ps + +%macro FILTERH_W2_4_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + r1 + %1] + movhps m4, [r0 + r1 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + movh m5, [r0 + 2 * r1 + %1] + movhps m5, [r0 + 2 * r1 + %1 + 2] + pmaddwd m5, m0 + movh m4, [r0 + r4 + %1] + movhps m4, [r0 + r4 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 
+%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movd [r2 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 * 2 + %1], m3 + psrldq m3, 4 + movd [r2 + r5 + %1], m3 +%endmacro + +%macro FILTERH_W2_3_sse3 1 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + r1 + %1] + movhps m4, [r0 + r1 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + 2 * r1 + %1] + movhps m5, [r0 + 2 * r1 + %1 + 2] + pmaddwd m5, m0 + + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m5, m5, q3120 + paddd m5, m1 + + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 + + movd [r2 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 * 2 + %1], m3 +%endmacro + +%macro FILTERH_W4_2_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + %1 + 4] + movhps m4, [r0 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + r1 + %1] + movhps m5, [r0 + r1 + %1 + 2] + pmaddwd m5, m0 + movh m4, [r0 + r1 + %1 + 4] + movhps m4, [r0 + r1 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 +%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + %1], m3 + movhps [r2 + r3 + %1], m3 +%endmacro + +%macro FILTERH_W4_1_sse3 1 + movh m3, [r0 + 2 * r1 + %1] + movhps m3, [r0 + 2 * r1 + %1 + 2] + pmaddwd m3, m0 + movh 
m4, [r0 + 2 * r1 + %1 + 4] + movhps m4, [r0 + 2 * r1 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 + movh [r2 + r3 * 2 + %1], m3 +%endmacro + +%macro FILTERH_W8_1_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + %1 + 4] + movhps m4, [r0 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + %1 + 8] + movhps m5, [r0 + %1 + 10] + pmaddwd m5, m0 + movh m4, [r0 + %1 + 12] + movhps m4, [r0 + %1 + 14] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 +%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movdqu [r2 + %1], m3 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_HOR_CHROMA_sse3 3 +INIT_XMM sse3 +cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8 + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movddup m0, [r6 + r4 * 4] +%else + movddup m0, [tab_ChromaCoeff + r4 * 4] +%endif + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 +%if %1 <= 6 + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%endif + je .skip + sub r0, r1 +%if %1 <= 6 +%assign y 1 +%else +%assign y 3 +%endif +%assign z 0 +%rep y +%assign x 0 +%rep %1/8 + FILTERH_W8_1_sse3 x, %3 +%assign x 
x+16 +%endrep +%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0) + FILTERH_W4_2_sse3 x, %3 + FILTERH_W4_1_sse3 x +%assign x x+8 +%endif +%if %1 == 2 || (%1 == 6 && z == 0) + FILTERH_W2_3_sse3 x +%endif +%if %1 <= 6 + lea r0, [r0 + r4] + lea r2, [r2 + r5] +%else + lea r0, [r0 + r1] + lea r2, [r2 + r3] +%endif +%assign z z+1 +%endrep +.skip: +%elifidn %3, pp + pxor m7, m7 + mova m6, [pw_pixel_max] + mova m1, [tab_c_32] +%if %1 == 2 || %1 == 6 + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%endif +%endif + +%if %1 == 2 +%assign y %2/4 +%elif %1 <= 6 +%assign y %2/2 +%else +%assign y %2 +%endif +%assign z 0 +%rep y +%assign x 0 +%rep %1/8 + FILTERH_W8_1_sse3 x, %3 +%assign x x+16 +%endrep +%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0) + FILTERH_W4_2_sse3 x, %3 +%assign x x+8 +%endif +%if %1 == 2 || (%1 == 6 && (z % 2) == 0) + FILTERH_W2_4_sse3 x, %3 +%endif +%assign z z+1 +%if z < y +%if %1 == 2 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%elif %1 <= 6 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] +%else + lea r0, [r0 + r1] + lea r2, [r2 + r3] +%endif +%endif ;z < y +%endrep + + RET +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +FILTER_HOR_CHROMA_sse3 2, 4, pp +FILTER_HOR_CHROMA_sse3 2, 8, pp +FILTER_HOR_CHROMA_sse3 2, 16, pp +FILTER_HOR_CHROMA_sse3 4, 2, pp +FILTER_HOR_CHROMA_sse3 4, 4, pp +FILTER_HOR_CHROMA_sse3 4, 8, pp +FILTER_HOR_CHROMA_sse3 4, 16, pp +FILTER_HOR_CHROMA_sse3 4, 32, pp +FILTER_HOR_CHROMA_sse3 6, 8, pp +FILTER_HOR_CHROMA_sse3 6, 16, pp +FILTER_HOR_CHROMA_sse3 8, 2, pp +FILTER_HOR_CHROMA_sse3 8, 4, pp +FILTER_HOR_CHROMA_sse3 8, 6, pp +FILTER_HOR_CHROMA_sse3 8, 8, pp +FILTER_HOR_CHROMA_sse3 8, 12, pp +FILTER_HOR_CHROMA_sse3 8, 16, pp +FILTER_HOR_CHROMA_sse3 8, 32, pp +FILTER_HOR_CHROMA_sse3 8, 64, 
pp +FILTER_HOR_CHROMA_sse3 12, 16, pp +FILTER_HOR_CHROMA_sse3 12, 32, pp +FILTER_HOR_CHROMA_sse3 16, 4, pp +FILTER_HOR_CHROMA_sse3 16, 8, pp +FILTER_HOR_CHROMA_sse3 16, 12, pp +FILTER_HOR_CHROMA_sse3 16, 16, pp +FILTER_HOR_CHROMA_sse3 16, 24, pp +FILTER_HOR_CHROMA_sse3 16, 32, pp +FILTER_HOR_CHROMA_sse3 16, 64, pp +FILTER_HOR_CHROMA_sse3 24, 32, pp +FILTER_HOR_CHROMA_sse3 24, 64, pp +FILTER_HOR_CHROMA_sse3 32, 8, pp +FILTER_HOR_CHROMA_sse3 32, 16, pp +FILTER_HOR_CHROMA_sse3 32, 24, pp +FILTER_HOR_CHROMA_sse3 32, 32, pp +FILTER_HOR_CHROMA_sse3 32, 48, pp +FILTER_HOR_CHROMA_sse3 32, 64, pp +FILTER_HOR_CHROMA_sse3 48, 64, pp +FILTER_HOR_CHROMA_sse3 64, 16, pp +FILTER_HOR_CHROMA_sse3 64, 32, pp +FILTER_HOR_CHROMA_sse3 64, 48, pp +FILTER_HOR_CHROMA_sse3 64, 64, pp + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +FILTER_HOR_CHROMA_sse3 2, 4, ps +FILTER_HOR_CHROMA_sse3 2, 8, ps +FILTER_HOR_CHROMA_sse3 2, 16, ps +FILTER_HOR_CHROMA_sse3 4, 2, ps +FILTER_HOR_CHROMA_sse3 4, 4, ps +FILTER_HOR_CHROMA_sse3 4, 8, ps +FILTER_HOR_CHROMA_sse3 4, 16, ps +FILTER_HOR_CHROMA_sse3 4, 32, ps +FILTER_HOR_CHROMA_sse3 6, 8, ps +FILTER_HOR_CHROMA_sse3 6, 16, ps +FILTER_HOR_CHROMA_sse3 8, 2, ps +FILTER_HOR_CHROMA_sse3 8, 4, ps +FILTER_HOR_CHROMA_sse3 8, 6, ps +FILTER_HOR_CHROMA_sse3 8, 8, ps +FILTER_HOR_CHROMA_sse3 8, 12, ps +FILTER_HOR_CHROMA_sse3 8, 16, ps +FILTER_HOR_CHROMA_sse3 8, 32, ps +FILTER_HOR_CHROMA_sse3 8, 64, ps +FILTER_HOR_CHROMA_sse3 12, 16, ps +FILTER_HOR_CHROMA_sse3 12, 32, ps +FILTER_HOR_CHROMA_sse3 16, 4, ps +FILTER_HOR_CHROMA_sse3 16, 8, ps +FILTER_HOR_CHROMA_sse3 16, 12, ps +FILTER_HOR_CHROMA_sse3 16, 16, ps +FILTER_HOR_CHROMA_sse3 16, 24, ps +FILTER_HOR_CHROMA_sse3 16, 32, ps +FILTER_HOR_CHROMA_sse3 16, 64, ps +FILTER_HOR_CHROMA_sse3 24, 32, ps 
+FILTER_HOR_CHROMA_sse3 24, 64, ps +FILTER_HOR_CHROMA_sse3 32, 8, ps +FILTER_HOR_CHROMA_sse3 32, 16, ps +FILTER_HOR_CHROMA_sse3 32, 24, ps +FILTER_HOR_CHROMA_sse3 32, 32, ps +FILTER_HOR_CHROMA_sse3 32, 48, ps +FILTER_HOR_CHROMA_sse3 32, 64, ps +FILTER_HOR_CHROMA_sse3 48, 64, ps +FILTER_HOR_CHROMA_sse3 64, 16, ps +FILTER_HOR_CHROMA_sse3 64, 32, ps +FILTER_HOR_CHROMA_sse3 64, 48, ps +FILTER_HOR_CHROMA_sse3 64, 64, ps + +%macro FILTER_P2S_2_4_sse2 1 + movd m0, [r0 + %1] + movd m2, [r0 + r1 * 2 + %1] + movhps m0, [r0 + r1 + %1] + movhps m2, [r0 + r4 + %1] + psllw m0, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m1 + psubw m2, m1 + + movd [r2 + r3 * 0 + %1], m0 + movd [r2 + r3 * 2 + %1], m2 + movhlps m0, m0 + movhlps m2, m2 + movd [r2 + r3 * 1 + %1], m0 + movd [r2 + r5 + %1], m2 +%endmacro + +%macro FILTER_P2S_4_4_sse2 1 + movh m0, [r0 + %1] + movhps m0, [r0 + r1 + %1] + psllw m0, (14 - BIT_DEPTH) + psubw m0, m1 + movh [r2 + r3 * 0 + %1], m0 + movhps [r2 + r3 * 1 + %1], m0 + + movh m2, [r0 + r1 * 2 + %1] + movhps m2, [r0 + r4 + %1] + psllw m2, (14 - BIT_DEPTH) + psubw m2, m1 + movh [r2 + r3 * 2 + %1], m2 + movhps [r2 + r5 + %1], m2 +%endmacro + +%macro FILTER_P2S_4_2_sse2 0 + movh m0, [r0] + movhps m0, [r0 + r1 * 2] + psllw m0, (14 - BIT_DEPTH) + psubw m0, [pw_2000] + movh [r2 + r3 * 0], m0 + movhps [r2 + r3 * 2], m0 +%endmacro + +%macro FILTER_P2S_8_4_sse2 1 + movu m0, [r0 + %1] + movu m2, [r0 + r1 + %1] + psllw m0, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m1 + psubw m2, m1 + movu [r2 + r3 * 0 + %1], m0 + movu [r2 + r3 * 1 + %1], m2 + + movu m3, [r0 + r1 * 2 + %1] + movu m4, [r0 + r4 + %1] + psllw m3, (14 - BIT_DEPTH) + psllw m4, (14 - BIT_DEPTH) + psubw m3, m1 + psubw m4, m1 + movu [r2 + r3 * 2 + %1], m3 + movu [r2 + r5 + %1], m4 +%endmacro + +%macro FILTER_P2S_8_2_sse2 1 + movu m0, [r0 + %1] + movu m2, [r0 + r1 + %1] + psllw m0, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m1 + psubw m2, m1 + movu [r2 + r3 * 0 + %1], m0 + 
movu [r2 + r3 * 1 + %1], m2 +%endmacro + +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) +;----------------------------------------------------------------------------- +%macro FILTER_PIX_TO_SHORT_sse2 2 +INIT_XMM sse2 +cglobal filterPixelToShort_%1x%2, 4, 6, 3 +%if %2 == 2 +%if %1 == 4 + FILTER_P2S_4_2_sse2 +%elif %1 == 8 + add r1d, r1d + add r3d, r3d + mova m1, [pw_2000] + FILTER_P2S_8_2_sse2 0 +%endif +%else + add r1d, r1d + add r3d, r3d + mova m1, [pw_2000] + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%assign y 1 +%rep %2/4 +%assign x 0 +%rep %1/8 + FILTER_P2S_8_4_sse2 x +%if %2 == 6 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + FILTER_P2S_8_2_sse2 x +%endif +%assign x x+16 +%endrep +%rep (%1 % 8)/4 + FILTER_P2S_4_4_sse2 x +%assign x x+8 +%endrep +%rep (%1 % 4)/2 + FILTER_P2S_2_4_sse2 x +%endrep +%if y < %2/4 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%assign y y+1 +%endif +%endrep +%endif +RET +%endmacro + + FILTER_PIX_TO_SHORT_sse2 2, 4 + FILTER_PIX_TO_SHORT_sse2 2, 8 + FILTER_PIX_TO_SHORT_sse2 2, 16 + FILTER_PIX_TO_SHORT_sse2 4, 2 + FILTER_PIX_TO_SHORT_sse2 4, 4 + FILTER_PIX_TO_SHORT_sse2 4, 8 + FILTER_PIX_TO_SHORT_sse2 4, 16 + FILTER_PIX_TO_SHORT_sse2 4, 32 + FILTER_PIX_TO_SHORT_sse2 6, 8 + FILTER_PIX_TO_SHORT_sse2 6, 16 + FILTER_PIX_TO_SHORT_sse2 8, 2 + FILTER_PIX_TO_SHORT_sse2 8, 4 + FILTER_PIX_TO_SHORT_sse2 8, 6 + FILTER_PIX_TO_SHORT_sse2 8, 8 + FILTER_PIX_TO_SHORT_sse2 8, 12 + FILTER_PIX_TO_SHORT_sse2 8, 16 + FILTER_PIX_TO_SHORT_sse2 8, 32 + FILTER_PIX_TO_SHORT_sse2 8, 64 + FILTER_PIX_TO_SHORT_sse2 12, 16 + FILTER_PIX_TO_SHORT_sse2 12, 32 + FILTER_PIX_TO_SHORT_sse2 16, 4 + FILTER_PIX_TO_SHORT_sse2 16, 8 + FILTER_PIX_TO_SHORT_sse2 16, 12 + FILTER_PIX_TO_SHORT_sse2 16, 16 + FILTER_PIX_TO_SHORT_sse2 16, 24 + FILTER_PIX_TO_SHORT_sse2 16, 32 + FILTER_PIX_TO_SHORT_sse2 16, 64 + FILTER_PIX_TO_SHORT_sse2 24, 32 + 
FILTER_PIX_TO_SHORT_sse2 24, 64 + FILTER_PIX_TO_SHORT_sse2 32, 8 + FILTER_PIX_TO_SHORT_sse2 32, 16 + FILTER_PIX_TO_SHORT_sse2 32, 24 + FILTER_PIX_TO_SHORT_sse2 32, 32 + FILTER_PIX_TO_SHORT_sse2 32, 48 + FILTER_PIX_TO_SHORT_sse2 32, 64 + FILTER_PIX_TO_SHORT_sse2 48, 64 + FILTER_PIX_TO_SHORT_sse2 64, 16 + FILTER_PIX_TO_SHORT_sse2 64, 32 + FILTER_PIX_TO_SHORT_sse2 64, 48 + FILTER_PIX_TO_SHORT_sse2 64, 64 + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_HOR_LUMA_W4 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + add r1, r1 + add r3, r3 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] + pxor m6, m6 + mova m7, [pw_pixel_max] +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m3, m2, 6 ; m3 = src[3-10] + pmaddwd m3, m0 + phaddd m5, m3 + + phaddd m4, m5 + paddd m4, m1 +%ifidn %3, pp + psrad m4, 6 + packusdw m4, m4 + CLIPW m4, m6, m7 +%else + psrad m4, INTERP_SHIFT_PS + packssdw m4, m4 +%endif + + movh [r2], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t 
dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------ +FILTER_HOR_LUMA_W4 4, 4, pp +FILTER_HOR_LUMA_W4 4, 8, pp +FILTER_HOR_LUMA_W4 4, 16, pp + +;--------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;--------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W4 4, 4, ps +FILTER_HOR_LUMA_W4 4, 8, ps +FILTER_HOR_LUMA_W4 4, 16, ps + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +%macro FILTER_HOR_LUMA_W8 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] + pxor m7, m7 +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, 
m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m3, m2, 14 ; m3 = src[7-14] + pmaddwd m3, m0 + phaddd m6, m3 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, 6 + psrad m5, 6 + packusdw m4, m5 + CLIPW m4, m7, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + + movu [r2], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------ +; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------ +FILTER_HOR_LUMA_W8 8, 4, pp +FILTER_HOR_LUMA_W8 8, 8, pp +FILTER_HOR_LUMA_W8 8, 16, pp +FILTER_HOR_LUMA_W8 8, 32, pp + +;--------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;--------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W8 8, 4, ps +FILTER_HOR_LUMA_W8 8, 8, ps +FILTER_HOR_LUMA_W8 8, 16, ps +FILTER_HOR_LUMA_W8 8, 32, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W12 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif +%ifidn %3, pp + mova m1, 
[INTERP_OFFSET_PP] +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 + 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m2 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + + movu [r2], m4 + + movu m2, [r0 + 32] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[9-16] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[10-17] + pmaddwd m5, m0 + palignr m2, m3, 6 ; m2 = src[11-18] + pmaddwd m2, m0 + phaddd m5, m2 + phaddd m4, m5 + paddd m4, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + packusdw m4, m4 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + packssdw m4, m4 +%endif + + movh [r2 + 16], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- 
+FILTER_HOR_LUMA_W12 12, 16, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W12 12, 16, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W16 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif + +%ifidn %3, pp + mova m1, [pd_32] +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: +%assign x 0 +%rep %1 / 16 + movu m2, [r0 + x] ; m2 = src[0-7] + movu m3, [r0 + 16 + x] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m2 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, 
INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + movu [r2 + x], m4 + + movu m2, [r0 + 32 + x] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[9-16] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[10-17] + pmaddwd m5, m0 + palignr m6, m2, m3, 6 ; m6 = src[11-18] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m2, m3, 8 ; m5 = src[12-19] + pmaddwd m5, m0 + palignr m6, m2, m3, 10 ; m6 = src[13-20] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m2, m3, 12 ; m6 = src[14-21] + pmaddwd m6, m0 + palignr m2, m3, 14 ; m3 = src[15-22] + pmaddwd m2, m0 + phaddd m6, m2 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + movu [r2 + 16 + x], m4 + +%assign x x+32 +%endrep + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 16, 4, pp +FILTER_HOR_LUMA_W16 16, 8, pp +FILTER_HOR_LUMA_W16 16, 12, pp +FILTER_HOR_LUMA_W16 16, 16, pp +FILTER_HOR_LUMA_W16 16, 32, pp +FILTER_HOR_LUMA_W16 16, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
+;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 16, 4, ps +FILTER_HOR_LUMA_W16 16, 8, ps +FILTER_HOR_LUMA_W16 16, 12, ps +FILTER_HOR_LUMA_W16 16, 16, ps +FILTER_HOR_LUMA_W16 16, 32, ps +FILTER_HOR_LUMA_W16 16, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 32, 8, pp +FILTER_HOR_LUMA_W16 32, 16, pp +FILTER_HOR_LUMA_W16 32, 24, pp +FILTER_HOR_LUMA_W16 32, 32, pp +FILTER_HOR_LUMA_W16 32, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 32, 8, ps +FILTER_HOR_LUMA_W16 32, 16, ps +FILTER_HOR_LUMA_W16 32, 24, ps +FILTER_HOR_LUMA_W16 32, 32, ps +FILTER_HOR_LUMA_W16 32, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 48, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
+;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 48, 64, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 64, 16, pp +FILTER_HOR_LUMA_W16 64, 32, pp +FILTER_HOR_LUMA_W16 64, 48, pp +FILTER_HOR_LUMA_W16 64, 64, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W16 64, 16, ps +FILTER_HOR_LUMA_W16 64, 32, ps +FILTER_HOR_LUMA_W16 64, 48, ps +FILTER_HOR_LUMA_W16 64, 64, ps + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W24 3 +INIT_XMM sse4 +cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 + + add r1, r1 + add r3, r3 + mov r4d, r4m + sub r0, 6 + shl r4d, 4 + +%ifdef PIC + lea r6, [tab_LumaCoeff] + mova m0, [r6 + r4] +%else + mova m0, [tab_LumaCoeff + r4] +%endif +%ifidn %3, pp + mova m1, [pd_32] +%else + mova m1, [INTERP_OFFSET_PS] +%endif + + mov r4d, %2 +%ifidn %3, ps + cmp r5m, byte 0 + je .loopH + lea r6, [r1 + 2 * r1] + sub r0, r6 + add r4d, 7 +%endif + +.loopH: + movu m2, [r0] ; m2 = src[0-7] + movu m3, [r0 
+ 16] ; m3 = src[8-15] + + pmaddwd m4, m2, m0 + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + movu [r2], m4 + + movu m2, [r0 + 32] ; m2 = src[16-23] + + pmaddwd m4, m3, m0 ; m3 = src[8-15] + palignr m5, m2, m3, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m2, m3, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m2, m3, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd m4, m1 + + palignr m5, m2, m3, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m2, m3, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m2, m3, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m2, m3, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + movu [r2 + 16], m4 + + movu m3, [r0 + 48] ; m3 = src[24-31] + + pmaddwd m4, m2, m0 ; m2 = src[16-23] + palignr m5, m3, m2, 2 ; m5 = src[1-8] + pmaddwd m5, m0 + phaddd m4, m5 + + palignr m5, m3, m2, 4 ; m5 = src[2-9] + pmaddwd m5, m0 + palignr m6, m3, m2, 6 ; m6 = src[3-10] + pmaddwd m6, m0 + phaddd m5, m6 + phaddd m4, m5 + paddd 
m4, m1 + + palignr m5, m3, m2, 8 ; m5 = src[4-11] + pmaddwd m5, m0 + palignr m6, m3, m2, 10 ; m6 = src[5-12] + pmaddwd m6, m0 + phaddd m5, m6 + + palignr m6, m3, m2, 12 ; m6 = src[6-13] + pmaddwd m6, m0 + palignr m7, m3, m2, 14 ; m7 = src[7-14] + pmaddwd m7, m0 + phaddd m6, m7 + phaddd m5, m6 + paddd m5, m1 +%ifidn %3, pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m4, m5 + pxor m5, m5 + CLIPW m4, m5, [pw_pixel_max] +%else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m4, m5 +%endif + movu [r2 + 32], m4 + + add r0, r1 + add r2, r3 + + dec r4d + jnz .loopH + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W24 24, 32, pp + +;---------------------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;---------------------------------------------------------------------------------------------------------------------------- +FILTER_HOR_LUMA_W24 24, 32, ps + +%macro FILTER_W2_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + r1] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + packusdw m3, m3 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 +%endif + movd [r2], m3 + pextrd [r2 + r3], m3, 1 +%endmacro + +%macro FILTER_W4_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + r1] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu 
m4, [r0 + r1 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + r3], m3 +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W4_avx2 1 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_4x%1, 4,7,7 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4] + vpbroadcastq m1, [r5 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + lea r6, [pw_pixel_max] + mova m3, [interp8_hpp_shuf] + mova m6, [pd_32] + pxor m2, m2 + + ; register map + ; m0 , m1 interpolate coeff + + mov r4d, %1/2 + +.loop: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + phaddd m4, m4 + vpermq m4, m4, q3120 + paddd m4, m6 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [r6] + movq [r2], xm4 + + vbroadcasti128 m4, [r0 + r1] + vbroadcasti128 m5, [r0 + r1 + 8] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + phaddd m4, m4 + vpermq m4, m4, q3120 + paddd m4, m6 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [r6] + movq [r2 + r3], xm4 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + dec r4d + jnz .loop + RET +%endmacro +FILTER_HOR_LUMA_W4_avx2 4 +FILTER_HOR_LUMA_W4_avx2 8 +FILTER_HOR_LUMA_W4_avx2 16 + 
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
+;-------------------------------------------------------------------------------------------------------------
+; AVX2 8-tap luma horizontal pp filter, width 8, height %1 (high bit depth).
+; Processes two rows per loop iteration; strides are doubled up front because
+; pixels are 16-bit here.
+%macro FILTER_HOR_LUMA_W8 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
+    add             r1d, r1d
+    add             r3d, r3d
+    sub             r0, 6                       ; step back 3 pixels (8-tap window)
+    mov             r4d, r4m
+    shl             r4d, 4                      ; coeffIdx * 16 = byte offset into coeff table
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastq    m0, [r5 + r4]
+    vpbroadcastq    m1, [r5 + r4 + 8]
+%else
+    vpbroadcastq    m0, [tab_LumaCoeff + r4]
+    vpbroadcastq    m1, [tab_LumaCoeff + r4 + 8] ; fixed: was the mistyped symbol h_ab_LumaCoeff,
+                                                 ; which does not exist; PIC branch and all sibling
+                                                 ; macros load both halves from tab_LumaCoeff
+%endif
+    mova            m3, [interp8_hpp_shuf]
+    mova            m7, [pd_32]                  ; rounding offset
+    pxor            m2, m2                       ; zero, lower clip bound
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+
+    mov             r4d, %1/2
+
+.loop:
+    ; ---- row 0 ----
+    vbroadcasti128  m4, [r0]
+    vbroadcasti128  m5, [r0 + 8]
+    pshufb          m4, m3
+    pshufb          m5, m3
+
+    pmaddwd         m4, m0
+    pmaddwd         m5, m1
+    paddd           m4, m5
+
+    vbroadcasti128  m5, [r0 + 8]
+    vbroadcasti128  m6, [r0 + 16]
+    pshufb          m5, m3
+    pshufb          m6, m3
+
+    pmaddwd         m5, m0
+    pmaddwd         m6, m1
+    paddd           m5, m6
+
+    phaddd          m4, m5
+    vpermq          m4, m4, q3120
+    paddd           m4, m7
+    psrad           m4, INTERP_SHIFT_PP
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    CLIPW           m4, m2, [pw_pixel_max]
+    movu            [r2], xm4
+
+    ; ---- row 1 ----
+    vbroadcasti128  m4, [r0 + r1]
+    vbroadcasti128  m5, [r0 + r1 + 8]
+    pshufb          m4, m3
+    pshufb          m5, m3
+
+    pmaddwd         m4, m0
+    pmaddwd         m5, m1
+    paddd           m4, m5
+
+    vbroadcasti128  m5, [r0 + r1 + 8]
+    vbroadcasti128  m6, [r0 + r1 + 16]
+    pshufb          m5, m3
+    pshufb          m6, m3
+
+    pmaddwd         m5, m0
+    pmaddwd         m6, m1
+    paddd           m5, m6
+
+    phaddd          m4, m5
+    vpermq          m4, m4, q3120
+    paddd           m4, m7
+    psrad           m4, INTERP_SHIFT_PP
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    CLIPW           m4, m2, [pw_pixel_max]
+    movu            [r2 + r3], xm4
+
+    lea             r2, [r2 + 2 * r3]
+    lea             r0, [r0 + 2 * r1]
+    dec             r4d
+    jnz             .loop
+    RET
+%endmacro
+FILTER_HOR_LUMA_W8 4
+FILTER_HOR_LUMA_W8 8
+FILTER_HOR_LUMA_W8 16
+FILTER_HOR_LUMA_W8 32
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
+;-------------------------------------------------------------------------------------------------------------
+; AVX2 8-tap luma horizontal pp filter, width 16, height %1 (high bit depth).
+; One row per loop iteration, produced as two 8-pixel halves.
+%macro FILTER_HOR_LUMA_W16 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_16x%1, 4,6,8
+    add             r1d, r1d                     ; strides in bytes: pixels are 16-bit
+    add             r3d, r3d
+    sub             r0, 6                        ; step back 3 pixels for the 8-tap window
+    mov             r4d, r4m
+    shl             r4d, 4                       ; coeffIdx * 16 = byte offset into coeff table
+%ifdef PIC
+    lea             r5, [tab_LumaCoeff]
+    vpbroadcastq    m0, [r5 + r4]
+    vpbroadcastq    m1, [r5 + r4 + 8]
+%else
+    vpbroadcastq    m0, [tab_LumaCoeff + r4]
+    vpbroadcastq    m1, [tab_LumaCoeff + r4 + 8]
+%endif
+    mova            m3, [interp8_hpp_shuf]
+    mova            m7, [pd_32]                  ; NOTE(review): pd_32 pairs with a shift of 6;
+                                                 ; confirm INTERP_SHIFT_PP is 6 for every bit
+                                                 ; depth this kernel is built for
+    pxor            m2, m2                       ; zero, lower clip bound
+
+    ; register map
+    ; m0 , m1 interpolate coeff
+
+    mov             r4d, %1
+
+.loop:
+    ; ---- pixels 0-7 of the row ----
+    vbroadcasti128  m4, [r0]
+    vbroadcasti128  m5, [r0 + 8]
+    pshufb          m4, m3
+    pshufb          m5, m3
+
+    pmaddwd         m4, m0
+    pmaddwd         m5, m1
+    paddd           m4, m5
+
+    vbroadcasti128  m5, [r0 + 8]
+    vbroadcasti128  m6, [r0 + 16]
+    pshufb          m5, m3
+    pshufb          m6, m3
+
+    pmaddwd         m5, m0
+    pmaddwd         m6, m1
+    paddd           m5, m6
+
+    phaddd          m4, m5
+    vpermq          m4, m4, q3120
+    paddd           m4, m7
+    psrad           m4, INTERP_SHIFT_PP
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    CLIPW           m4, m2, [pw_pixel_max]
+    movu            [r2], xm4
+
+    ; ---- pixels 8-15 of the row ----
+    vbroadcasti128  m4, [r0 + 16]
+    vbroadcasti128  m5, [r0 + 24]
+    pshufb          m4, m3
+    pshufb          m5, m3
+
+    pmaddwd         m4, m0
+    pmaddwd         m5, m1
+    paddd           m4, m5
+
+    vbroadcasti128  m5, [r0 + 24]
+    vbroadcasti128  m6, [r0 + 32]
+    pshufb          m5, m3
+    pshufb          m6, m3
+
+    pmaddwd         m5, m0
+    pmaddwd         m6, m1
+    paddd           m5, m6
+
+    phaddd          m4, m5
+    vpermq          m4, m4, q3120
+    paddd           m4, m7
+    psrad           m4, INTERP_SHIFT_PP
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    CLIPW           m4, m2, [pw_pixel_max]
+    movu            [r2 + 16], xm4
+
+    add             r2, r3
+    add             r0, r1
+    dec             r4d
+    jnz             .loop
+    RET
+%endmacro
+FILTER_HOR_LUMA_W16 4
+FILTER_HOR_LUMA_W16 8
+FILTER_HOR_LUMA_W16 12
+FILTER_HOR_LUMA_W16 16
+FILTER_HOR_LUMA_W16 32
+FILTER_HOR_LUMA_W16 64
+
+;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_HOR_LUMA_W32 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4] + vpbroadcastq m1, [r5 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + mova m7, [pd_32] + pxor m2, m2 + + ; register map + ; m0 , m1 interpolate coeff + + mov r4d, %2 + +.loop: +%assign x 0 +%rep %1/16 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 8 + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 8 + x] + vbroadcasti128 m6, [r0 + 16 + x] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + x], xm4 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m5, [r0 + 24 + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 24 + x] + vbroadcasti128 m6, [r0 + 32 + x] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + 16 + x], xm4 + +%assign x x+32 +%endrep + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET +%endmacro +FILTER_HOR_LUMA_W32 32, 8 +FILTER_HOR_LUMA_W32 32, 16 +FILTER_HOR_LUMA_W32 32, 
24 +FILTER_HOR_LUMA_W32 32, 32 +FILTER_HOR_LUMA_W32 32, 64 +FILTER_HOR_LUMA_W32 64, 16 +FILTER_HOR_LUMA_W32 64, 32 +FILTER_HOR_LUMA_W32 64, 48 +FILTER_HOR_LUMA_W32 64, 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_12x16, 4,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4] + vpbroadcastq m1, [r5 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + mova m7, [pd_32] + pxor m2, m2 + + ; register map + ; m0 , m1 interpolate coeff + + mov r4d, 16 + +.loop: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 8] + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 24] + vbroadcasti128 m6, [r0 + 32] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movq [r2 + 16], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + 
+;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_24x32, 4,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4] + vpbroadcastq m1, [r5 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + mova m7, [pd_32] + pxor m2, m2 + + ; register map + ; m0 , m1 interpolate coeff + + mov r4d, 32 + +.loop: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 8] + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 24] + vbroadcasti128 m6, [r0 + 32] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 40] + vbroadcasti128 m6, [r0 + 48] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + 
pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + 32], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_48x64, 4,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 6 + mov r4d, r4m + shl r4d, 4 +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4] + vpbroadcastq m1, [r5 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + mova m7, [pd_32] + pxor m2, m2 + + ; register map + ; m0 , m1 interpolate coeff + + mov r4d, 64 + +.loop: +%assign x 0 +%rep 2 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 8 + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 8 + x] + vbroadcasti128 m6, [r0 + 16 + x] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + x], xm4 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m5, [r0 + 24 + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 24 + x] + vbroadcasti128 m6, [r0 + 32 + x] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, 
q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + 16 + x], xm4 + + vbroadcasti128 m4, [r0 + 32 + x] + vbroadcasti128 m5, [r0 + 40 + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m5, m1 + paddd m4, m5 + + vbroadcasti128 m5, [r0 + 40 + x] + vbroadcasti128 m6, [r0 + 48 + x] + pshufb m5, m3 + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m7 + psrad m4, INTERP_SHIFT_PP + + packusdw m4, m4 + vpermq m4, m4, q2020 + CLIPW m4, m2, [pw_pixel_max] + movu [r2 + 32 + x], xm4 + +%assign x x+48 +%endrep + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_CHROMA_H 6 +INIT_XMM sse4 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%6, [tab_ChromaCoeff] + movh m0, [r%6 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 + je .skip + sub r0, r1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + + %if %1 == 4 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + %else + phaddd m3, m3 + %endif + + paddd m3, m1 + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 + + %if %1 == 2 + movd [r2], m3 + %else + movh [r2], m3 + %endif + + add r0, r1 + add r2, r3 + FILTER_W%1_2 %3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + +.skip: + +%else ;%ifidn %3, ps + pxor m7, m7 + mova m6, [pw_pixel_max] + mova m1, [tab_c_32] +%endif ;%ifidn %3, ps + + FILTER_W%1_2 %3 + +%rep (%2/2) - 1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + FILTER_W%1_2 %3 +%endrep + RET +%endmacro + +FILTER_CHROMA_H 2, 4, 
pp, 6, 8, 5 +FILTER_CHROMA_H 2, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 2, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 4, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 16, pp, 6, 8, 5 + +FILTER_CHROMA_H 2, 4, ps, 7, 5, 6 +FILTER_CHROMA_H 2, 8, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 2, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 4, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 8, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 16, ps, 7, 6, 6 + +FILTER_CHROMA_H 2, 16, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 32, pp, 6, 8, 5 +FILTER_CHROMA_H 2, 16, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 32, ps, 7, 6, 6 + + +%macro FILTER_W6_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m4, [r0 + 8] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m4, m4 + paddd m4, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m4, INTERP_SHIFT_PP + packusdw m3, m4 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS + packssdw m3, m4 +%endif + movh [r2], m3 + pextrd [r2 + 8], m3, 2 +%endmacro + +cglobal chroma_filter_pp_6x1_internal + FILTER_W6_1 pp + ret + +cglobal chroma_filter_ps_6x1_internal + FILTER_W6_1 ps + ret + +%macro FILTER_W8_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 +%endmacro + +cglobal chroma_filter_pp_8x1_internal + FILTER_W8_1 pp + ret + +cglobal chroma_filter_ps_8x1_internal + FILTER_W8_1 ps + ret + +%macro FILTER_W12_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, 
m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + packusdw m3, m3 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 +%endif + movh [r2 + 16], m3 +%endmacro + +cglobal chroma_filter_pp_12x1_internal + FILTER_W12_1 pp + ret + +cglobal chroma_filter_ps_12x1_internal + FILTER_W12_1 ps + ret + +%macro FILTER_W16_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 +%endmacro + +cglobal 
chroma_filter_pp_16x1_internal + FILTER_W16_1 pp + ret + +cglobal chroma_filter_ps_16x1_internal + FILTER_W16_1 ps + ret + +%macro FILTER_W24_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 +%endmacro + +cglobal chroma_filter_pp_24x1_internal + FILTER_W24_1 pp + ret + +cglobal chroma_filter_ps_24x1_internal + FILTER_W24_1 ps + ret + +%macro FILTER_W32_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb 
m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 + + movu m3, [r0 + 48] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 52] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 56] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 60] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw 
m3, m5 +%endif + movh [r2 + 48], m3 + movhps [r2 + 56], m3 +%endmacro + +cglobal chroma_filter_pp_32x1_internal + FILTER_W32_1 pp + ret + +cglobal chroma_filter_ps_32x1_internal + FILTER_W32_1 ps + ret + +%macro FILTER_W8o_1 2 + movu m3, [r0 + %2] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + %2 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + %2 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + %2 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + %2], m3 + movhps [r2 + %2 + 8], m3 +%endmacro + +%macro FILTER_W48_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 +%endmacro + +cglobal chroma_filter_pp_48x1_internal + FILTER_W48_1 pp + ret + +cglobal chroma_filter_ps_48x1_internal + FILTER_W48_1 ps + ret + +%macro FILTER_W64_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 + FILTER_W8o_1 %1, 96 + FILTER_W8o_1 %1, 112 +%endmacro + +cglobal chroma_filter_pp_64x1_internal + FILTER_W64_1 pp + ret + +cglobal chroma_filter_ps_64x1_internal + FILTER_W64_1 ps + ret + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +INIT_XMM sse4 +%macro IPFILTER_CHROMA 6 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%4, [tab_ChromaCoeff] + movh m0, [r%4 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + 
punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 + je .skip + sub r0, r1 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 +.skip: +%else + mova m1, [tab_c_32] + pxor m6, m6 + mova m7, [pw_pixel_max] +%endif + + call chroma_filter_%3_%1x1_internal +%rep %2 - 1 + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal +%endrep +RET +%endmacro +IPFILTER_CHROMA 6, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 2, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 6, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 32, pp, 5, 6, 8 + +IPFILTER_CHROMA 6, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 2, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 4, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 6, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 4, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 24, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 32, ps, 6, 7, 6 + +IPFILTER_CHROMA 6, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 32, pp, 5, 6, 8 
+IPFILTER_CHROMA 16, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 6, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 24, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 64, ps, 6, 7, 6 + +IPFILTER_CHROMA 48, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 48, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 16, ps, 6, 7, 6 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_6xN 1 +cglobal interp_4tap_horiz_pp_6x%1, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1/2 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2], xm3 + 
pextrd [r2 + 8], xm3, 2 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2 + r3], xm3 + pextrd [r2 + r3 + 8], xm3, 2 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro +IPFILTER_CHROMA_avx2_6xN 8 +IPFILTER_CHROMA_avx2_6xN 16 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x2, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + RET + 
+;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x4, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + +%rep 2 + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] +%endrep + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_8xN 1 +cglobal interp_4tap_horiz_pp_8x%1, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 
8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1/2 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro +IPFILTER_CHROMA_avx2_8xN 6 +IPFILTER_CHROMA_avx2_8xN 8 +IPFILTER_CHROMA_avx2_8xN 12 +IPFILTER_CHROMA_avx2_8xN 16 +IPFILTER_CHROMA_avx2_8xN 32 +IPFILTER_CHROMA_avx2_8xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_16xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_16x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + + pshufb m3, 
m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m8, [r0 + 24] + + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m4, m4 + vpermq m4, m4, q2020 + pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + CLIPW m3, m5, m7 + movu [r2], m3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_16xN 4 +IPFILTER_CHROMA_avx2_16xN 8 +IPFILTER_CHROMA_avx2_16xN 12 +IPFILTER_CHROMA_avx2_16xN 16 +IPFILTER_CHROMA_avx2_16xN 24 +IPFILTER_CHROMA_avx2_16xN 32 +IPFILTER_CHROMA_avx2_16xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_32xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_32x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r6d, %1 +.loop: +%assign x 0 +%rep 2 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + + vbroadcasti128 m4, [r0 + 16 + 
x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m4, m4 + vpermq m4, m4, q2020 + pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + CLIPW m3, m5, m7 + movu [r2 + x], m3 + %assign x x+32 + %endrep + + add r0, r1 + add r2, r3 + dec r6d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_32xN 8 +IPFILTER_CHROMA_avx2_32xN 16 +IPFILTER_CHROMA_avx2_32xN 24 +IPFILTER_CHROMA_avx2_32xN 32 +IPFILTER_CHROMA_avx2_32xN 48 +IPFILTER_CHROMA_avx2_32xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_12xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_12x%1, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + 16] + vbroadcasti128 m4, [r0 + 24] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; 
m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2 + 16], xm3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_12xN 16 +IPFILTER_CHROMA_avx2_12xN 32 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_24xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_24x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m8, [r0 + 24] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 32] + vbroadcasti128 m4, [r0 + 40] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 + CLIPW xm3, xm5, xm7 + movu [r2 + 32], xm3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_24xN 32 +IPFILTER_CHROMA_avx2_24xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, 
intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_64xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_64x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r6d, %1 +.loop: +%assign x 0 +%rep 4 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2 + x], m3 + %assign x x+32 + %endrep + + add r0, r1 + add r2, r3 + dec r6d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_64xN 16 +IPFILTER_CHROMA_avx2_64xN 32 +IPFILTER_CHROMA_avx2_64xN 48 +IPFILTER_CHROMA_avx2_64xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_48x64, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, 
m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, 64 +.loop: +%assign x 0 +%rep 3 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2 + x], m3 +%assign x x+32 +%endrep + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif + +;----------------------------------------------------------------------------------------------------------------------------- +;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- + +%macro IPFILTER_LUMA_PS_4xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 + mov r5d, r5m + mov r4d, r4m + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r6, [tab_LumaCoeff] + lea r4, [r4 * 8] + vbroadcasti128 m0, [r6 + r4 * 2] +%else + lea r4, [r4 * 8] + vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2] +%endif + + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - pw_2000 + + sub r0, 6 + test r5d, r5d + mov r7d, %1 ; loop count variable - height + jz .preloop + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src) - 3 * srcStride + add r7d, 6 ;7 - 1(since last row not in loop) ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) + +.preloop: + lea r6, [r3 * 3] +.loop: + ; Row 0 + movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + movu xm4, [r0 + 2] ; [x x x 
x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + movu xm4, [r0 + 4] + movu xm5, [r0 + 6] + vinserti128 m4, m4, xm5, 1 + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 1 + movu xm4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + movu xm5, [r0 + r1 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, xm5, 1 + movu xm5, [r0 + r1 + 4] + movu xm6, [r0 + r1 + 6] + vinserti128 m5, m5, xm6, 1 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddd m3, m4 ; all rows and col completed. + + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + paddd m3, m2 + vextracti128 xm4, m3, 1 + psrad xm3, INTERP_SHIFT_PS + psrad xm4, INTERP_SHIFT_PS + packssdw xm3, xm3 + packssdw xm4, xm4 + + movq [r2], xm3 ;row 0 + movq [r2 + r3], xm4 ;row 1 + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) + + sub r7d, 2 + jg .loop + test r5d, r5d + jz .end + + ; Row 10 + movu xm3, [r0] + movu xm4, [r0 + 2] + vinserti128 m3, m3, xm4, 1 + movu xm4, [r0 + 4] + movu xm5, [r0 + 6] + vinserti128 m4, m4, xm5, 1 + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + + ; Row11 + phaddd m3, m4 ; all rows and col completed. 
+ + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + paddd m3, m2 + vextracti128 xm4, m3, 1 + psrad xm3, INTERP_SHIFT_PS + psrad xm4, INTERP_SHIFT_PS + packssdw xm3, xm3 + packssdw xm4, xm4 + + movq [r2], xm3 ;row 0 +.end: + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_4xN_AVX2 4 + IPFILTER_LUMA_PS_4xN_AVX2 8 + IPFILTER_LUMA_PS_4xN_AVX2 16 + +%macro IPFILTER_LUMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 4 +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 6 + test r5d, r5d + mov r4d, %1 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m7, m5, m3 + pmaddwd m4, m0 + pmaddwd m7, m1 + paddd m4, m7 + + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m6, m3 + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m2 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm4, xm5 + + movu [r2], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_8xN_AVX2 4 + IPFILTER_LUMA_PS_8xN_AVX2 8 + IPFILTER_LUMA_PS_8xN_AVX2 16 + IPFILTER_LUMA_PS_8xN_AVX2 32 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_24x32, 4, 6, 8 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 4 +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, 
[INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 6 + test r5d, r5d + mov r4d, 32 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 + +.loop0: +%assign x 0 +%rep 24/8 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 8 + x] + pshufb m4, m3 + pshufb m7, m5, m3 + pmaddwd m4, m0 + pmaddwd m7, m1 + paddd m4, m7 + + vbroadcasti128 m6, [r0 + 16 + x] + pshufb m5, m3 + pshufb m6, m3 + pmaddwd m5, m0 + pmaddwd m6, m1 + paddd m5, m6 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m2 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm4, xm5 + + movu [r2 + x], xm4 + %assign x x+16 + %endrep + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%macro IPFILTER_LUMA_PS_32_64_AVX2 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8 + + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 6 +%ifdef PIC + lea r6, [h_tab_LumaCoeffV] + movu m0, [r6 + r4] + movu m1, [r6 + r4 + mmsize] +%else + movu m0, [h_tab_LumaCoeffV + r4] + movu m1, [h_tab_LumaCoeffV + r4 + mmsize] +%endif + mova m3, [interp8_hpp_shuf_new] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 6 + test r5d, r5d + mov r4d, %2 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 + +.loop0: +%assign x 0 +%rep %1/16 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x] + pshufb m4, m3 + pshufb m5, m3 + + pmaddwd m4, m0 + pmaddwd m7, m5, m1 + paddd m4, m7 + vextracti128 xm7, m4, 1 + paddd xm4, xm7 + paddd xm4, xm2 + psrad xm4, INTERP_SHIFT_PS + + vbroadcasti128 m6, [r0 + 16 + x] + pshufb m6, m3 + + pmaddwd m5, m0 + pmaddwd m7, m6, m1 + paddd m5, m7 + vextracti128 xm7, m5, 1 + paddd xm5, xm7 + paddd xm5, xm2 + psrad xm5, INTERP_SHIFT_PS + + packssdw xm4, xm5 + movu [r2 + x], xm4 + + vbroadcasti128 m5, [r0 + 24 + x] + pshufb m5, m3 + + pmaddwd m6, m0 + pmaddwd m7, m5, m1 + paddd m6, m7 + vextracti128 
xm7, m6, 1 + paddd xm6, xm7 + paddd xm6, xm2 + psrad xm6, INTERP_SHIFT_PS + + vbroadcasti128 m7, [r0 + 32 + x] + pshufb m7, m3 + + pmaddwd m5, m0 + pmaddwd m7, m1 + paddd m5, m7 + vextracti128 xm7, m5, 1 + paddd xm5, xm7 + paddd xm5, xm2 + psrad xm5, INTERP_SHIFT_PS + + packssdw xm6, xm5 + movu [r2 + 16 + x], xm6 + +%assign x x+32 +%endrep + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_32_64_AVX2 32, 8 + IPFILTER_LUMA_PS_32_64_AVX2 32, 16 + IPFILTER_LUMA_PS_32_64_AVX2 32, 24 + IPFILTER_LUMA_PS_32_64_AVX2 32, 32 + IPFILTER_LUMA_PS_32_64_AVX2 32, 64 + + IPFILTER_LUMA_PS_32_64_AVX2 64, 16 + IPFILTER_LUMA_PS_32_64_AVX2 64, 32 + IPFILTER_LUMA_PS_32_64_AVX2 64, 48 + IPFILTER_LUMA_PS_32_64_AVX2 64, 64 + + IPFILTER_LUMA_PS_32_64_AVX2 48, 64 + +%macro IPFILTER_LUMA_PS_16xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 + + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 4 +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 6 + test r5d, r5d + mov r4d, %1 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m7, m5, m3 + pmaddwd m4, m0 + pmaddwd m7, m1 + paddd m4, m7 + + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m7, m6, m3 + pmaddwd m5, m0 + pmaddwd m7, m1 + paddd m5, m7 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m2 + vextracti128 xm5, m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m3 + pshufb m7, m5, m3 + pmaddwd m6, m0 + pmaddwd m7, m1 + paddd m6, m7 + + vbroadcasti128 m7, [r0 + 
32] + pshufb m5, m3 + pshufb m7, m3 + pmaddwd m5, m0 + pmaddwd m7, m1 + paddd m5, m7 + + phaddd m6, m5 + vpermq m6, m6, q3120 + paddd m6, m2 + vextracti128 xm5,m6, 1 + psrad xm6, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm6, xm5 + movu [r2 + 16], xm6 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_16xN_AVX2 4 + IPFILTER_LUMA_PS_16xN_AVX2 8 + IPFILTER_LUMA_PS_16xN_AVX2 12 + IPFILTER_LUMA_PS_16xN_AVX2 16 + IPFILTER_LUMA_PS_16xN_AVX2 32 + IPFILTER_LUMA_PS_16xN_AVX2 64 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 4 +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 6 + test r5d, r5d + mov r4d, 16 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m7, m5, m3 + pmaddwd m4, m0 + pmaddwd m7, m1 + paddd m4, m7 + + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m7, m6, m3 + pmaddwd m5, m0 + pmaddwd m7, m1 + paddd m5, m7 + + phaddd m4, m5 + vpermq m4, m4, q3120 + paddd m4, m2 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m3 + pshufb m5, m3 + pmaddwd m6, m0 + pmaddwd m5, m1 + paddd m6, m5 + + phaddd m6, m6 + vpermq m6, m6, q3120 + paddd xm6, xm2 + psrad xm6, INTERP_SHIFT_PS + packssdw xm6, xm6 + movq [r2 + 16], xm6 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6 + add r1d, r1d + 
add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_CHROMA_PS_8xN_AVX2 4 + IPFILTER_CHROMA_PS_8xN_AVX2 8 + IPFILTER_CHROMA_PS_8xN_AVX2 16 + IPFILTER_CHROMA_PS_8xN_AVX2 32 + IPFILTER_CHROMA_PS_8xN_AVX2 6 + IPFILTER_CHROMA_PS_8xN_AVX2 2 + IPFILTER_CHROMA_PS_8xN_AVX2 12 + IPFILTER_CHROMA_PS_8xN_AVX2 64 + +%macro IPFILTER_CHROMA_PS_16xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, 
INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_16xN_AVX2 16 +IPFILTER_CHROMA_PS_16xN_AVX2 8 +IPFILTER_CHROMA_PS_16xN_AVX2 32 +IPFILTER_CHROMA_PS_16xN_AVX2 12 +IPFILTER_CHROMA_PS_16xN_AVX2 4 +IPFILTER_CHROMA_PS_16xN_AVX2 64 +IPFILTER_CHROMA_PS_16xN_AVX2 24 + +%macro IPFILTER_CHROMA_PS_24xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_24x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_24xN_AVX2 32 +IPFILTER_CHROMA_PS_24xN_AVX2 64 + +%macro IPFILTER_CHROMA_PS_12xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal 
interp_4tap_horiz_ps_12x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m3 + pmaddwd m4, m0 + phaddd m4, m4 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movq [r2 + 16], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_12xN_AVX2 16 +IPFILTER_CHROMA_PS_12xN_AVX2 32 + +%macro IPFILTER_CHROMA_PS_32xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 
+ phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 + 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_32xN_AVX2 32 +IPFILTER_CHROMA_PS_32xN_AVX2 16 +IPFILTER_CHROMA_PS_32xN_AVX2 24 +IPFILTER_CHROMA_PS_32xN_AVX2 8 +IPFILTER_CHROMA_PS_32xN_AVX2 64 +IPFILTER_CHROMA_PS_32xN_AVX2 48 + + +%macro IPFILTER_CHROMA_PS_64xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + 
vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 + 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + vbroadcasti128 m4, [r0 + 64] + vbroadcasti128 m5, [r0 + 72] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 64], xm4 + + vbroadcasti128 m4, [r0 + 80] + vbroadcasti128 m5, [r0 + 88] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 80], xm4 + + vbroadcasti128 m4, [r0 + 96] + vbroadcasti128 m5, [r0 + 104] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 96], xm4 + + vbroadcasti128 m4, [r0 + 112] + vbroadcasti128 m5, [r0 + 120] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 112], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_64xN_AVX2 64 +IPFILTER_CHROMA_PS_64xN_AVX2 48 +IPFILTER_CHROMA_PS_64xN_AVX2 32 +IPFILTER_CHROMA_PS_64xN_AVX2 16 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 
+cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, 64 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 + 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + vbroadcasti128 m4, [r0 + 64] + vbroadcasti128 m5, [r0 + 72] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 64], xm4 + + vbroadcasti128 m4, [r0 + 80] + vbroadcasti128 m5, [r0 + 88] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, 
m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 80], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif + +%macro IPFILTER_CHROMA_PS_6xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_CHROMA_PS_6xN_AVX2 8 + IPFILTER_CHROMA_PS_6xN_AVX2 16 diff --git a/source/common/x86/ipfilter16.asm b/source/common/x86/ipfilter16.asm index d44cfd9edb..c5b775121e 100644 --- a/source/common/x86/ipfilter16.asm +++ b/source/common/x86/ipfilter16.asm @@ -47,22 +47,10 @@ SECTION_RODATA 32 -tab_c_32: times 8 dd 32 tab_c_524800: times 4 dd 524800 tab_c_n8192: times 8 dw -8192 pd_524800: times 8 dd 524800 -tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 - -tab_ChromaCoeff: dw 0, 64, 0, 0 - dw -2, 58, 10, -2 - dw -4, 54, 16, -2 - dw -6, 46, 28, -4 - dw -4, 36, 36, -4 - dw -4, 28, 46, -6 - dw -2, 16, 54, -4 - dw -2, 10, 58, -2 - const tab_ChromaCoeffV, times 8 dw 0, 64 times 8 dw 0, 0 @@ -111,11 +99,6 @@ tab_ChromaCoeffVer: times 8 dw 0, 64 times 8 dw -2, 10 times 8 dw 58, -2 -tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0 - dw -1, 4, -10, 58, 17, -5, 1, 0 - dw -1, 4, -11, 40, 40, -11, 4, -1 
- dw 0, 1, -5, 17, 58, -10, 4, -1 - ALIGN 32 tab_LumaCoeffV: times 4 dw 0, 0 times 4 dw 0, 64 @@ -157,14 +140,6 @@ tab_LumaCoeffVer: times 8 dw 0, 0 times 8 dw 58, -10 times 8 dw 4, -1 -const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 - -const interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 - db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 - -const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 - db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 - SECTION .text cextern pd_8 cextern pd_32 @@ -175,255 +150,6 @@ cextern pd_n131072 cextern pw_2000 cextern idct8_shuf2 -%macro FILTER_LUMA_HOR_4_sse2 1 - movu m4, [r0 + %1] ; m4 = src[0-7] - movu m5, [r0 + %1 + 2] ; m5 = src[1-8] - pmaddwd m4, m0 - pmaddwd m5, m0 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m4, m4, q3120 - pshufd m5, m5, q3120 - punpcklqdq m4, m5 - - movu m5, [r0 + %1 + 4] ; m5 = src[2-9] - movu m3, [r0 + %1 + 6] ; m3 = src[3-10] - pmaddwd m5, m0 - pmaddwd m3, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m5, m5, q3120 - pshufd m3, m3, q3120 - punpcklqdq m5, m3 - - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m4, m4, q3120 - pshufd m5, m5, q3120 - punpcklqdq m4, m5 - paddd m4, m1 -%endmacro - -%macro FILTER_LUMA_HOR_8_sse2 1 - movu m4, [r0 + %1] ; m4 = src[0-7] - movu m5, [r0 + %1 + 2] ; m5 = src[1-8] - pmaddwd m4, m0 - pmaddwd m5, m0 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m4, m4, q3120 - pshufd m5, m5, q3120 - punpcklqdq m4, m5 - - movu m5, [r0 + %1 + 4] ; m5 = src[2-9] - movu m3, [r0 + %1 + 6] ; m3 = src[3-10] - pmaddwd m5, m0 - pmaddwd m3, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m5, m5, q3120 - pshufd m3, m3, q3120 - punpcklqdq m5, m3 - - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m4, m4, q3120 
- pshufd m5, m5, q3120 - punpcklqdq m4, m5 - paddd m4, m1 - - movu m5, [r0 + %1 + 8] ; m5 = src[4-11] - movu m6, [r0 + %1 + 10] ; m6 = src[5-12] - pmaddwd m5, m0 - pmaddwd m6, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m6, q2301 - paddd m6, m2 - pshufd m5, m5, q3120 - pshufd m6, m6, q3120 - punpcklqdq m5, m6 - - movu m6, [r0 + %1 + 12] ; m6 = src[6-13] - movu m3, [r0 + %1 + 14] ; m3 = src[7-14] - pmaddwd m6, m0 - pmaddwd m3, m0 - pshufd m2, m6, q2301 - paddd m6, m2 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m6, m6, q3120 - pshufd m3, m3, q3120 - punpcklqdq m6, m3 - - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m6, q2301 - paddd m6, m2 - pshufd m5, m5, q3120 - pshufd m6, m6, q3120 - punpcklqdq m5, m6 - paddd m5, m1 -%endmacro - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_p%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -%macro FILTER_HOR_LUMA_sse2 3 -INIT_XMM sse2 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif - -%ifidn %3, pp - mova m1, [pd_32] - pxor m7, m7 -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: -%assign x 0 -%rep %1/8 - FILTER_LUMA_HOR_8_sse2 x - -%ifidn %3, pp - psrad m4, 6 - psrad m5, 6 - packssdw m4, m5 - CLIPW m4, m7, [pw_pixel_max] -%else - %if BIT_DEPTH == 10 - psrad m4, 2 - psrad m5, 2 - %elif BIT_DEPTH == 12 - psrad m4, 4 - psrad m5, 4 - %endif - packssdw m4, m5 -%endif - - movu [r2 + x], m4 -%assign x x+16 -%endrep - -%rep (%1 % 8)/4 - FILTER_LUMA_HOR_4_sse2 x - -%ifidn %3, pp - psrad m4, 6 
- packssdw m4, m4 - CLIPW m4, m7, [pw_pixel_max] -%else - %if BIT_DEPTH == 10 - psrad m4, 2 - %elif BIT_DEPTH == 12 - psrad m4, 4 - %endif - packssdw m4, m4 -%endif - - movh [r2 + x], m4 -%endrep - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET - -%endmacro - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------ - FILTER_HOR_LUMA_sse2 4, 4, pp - FILTER_HOR_LUMA_sse2 4, 8, pp - FILTER_HOR_LUMA_sse2 4, 16, pp - FILTER_HOR_LUMA_sse2 8, 4, pp - FILTER_HOR_LUMA_sse2 8, 8, pp - FILTER_HOR_LUMA_sse2 8, 16, pp - FILTER_HOR_LUMA_sse2 8, 32, pp - FILTER_HOR_LUMA_sse2 12, 16, pp - FILTER_HOR_LUMA_sse2 16, 4, pp - FILTER_HOR_LUMA_sse2 16, 8, pp - FILTER_HOR_LUMA_sse2 16, 12, pp - FILTER_HOR_LUMA_sse2 16, 16, pp - FILTER_HOR_LUMA_sse2 16, 32, pp - FILTER_HOR_LUMA_sse2 16, 64, pp - FILTER_HOR_LUMA_sse2 24, 32, pp - FILTER_HOR_LUMA_sse2 32, 8, pp - FILTER_HOR_LUMA_sse2 32, 16, pp - FILTER_HOR_LUMA_sse2 32, 24, pp - FILTER_HOR_LUMA_sse2 32, 32, pp - FILTER_HOR_LUMA_sse2 32, 64, pp - FILTER_HOR_LUMA_sse2 48, 64, pp - FILTER_HOR_LUMA_sse2 64, 16, pp - FILTER_HOR_LUMA_sse2 64, 32, pp - FILTER_HOR_LUMA_sse2 64, 48, pp - FILTER_HOR_LUMA_sse2 64, 64, pp - -;--------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;--------------------------------------------------------------------------------------------------------------------------- - FILTER_HOR_LUMA_sse2 4, 4, ps - FILTER_HOR_LUMA_sse2 4, 8, ps - FILTER_HOR_LUMA_sse2 4, 16, ps - FILTER_HOR_LUMA_sse2 8, 4, ps - FILTER_HOR_LUMA_sse2 8, 8, ps - 
FILTER_HOR_LUMA_sse2 8, 16, ps - FILTER_HOR_LUMA_sse2 8, 32, ps - FILTER_HOR_LUMA_sse2 12, 16, ps - FILTER_HOR_LUMA_sse2 16, 4, ps - FILTER_HOR_LUMA_sse2 16, 8, ps - FILTER_HOR_LUMA_sse2 16, 12, ps - FILTER_HOR_LUMA_sse2 16, 16, ps - FILTER_HOR_LUMA_sse2 16, 32, ps - FILTER_HOR_LUMA_sse2 16, 64, ps - FILTER_HOR_LUMA_sse2 24, 32, ps - FILTER_HOR_LUMA_sse2 32, 8, ps - FILTER_HOR_LUMA_sse2 32, 16, ps - FILTER_HOR_LUMA_sse2 32, 24, ps - FILTER_HOR_LUMA_sse2 32, 32, ps - FILTER_HOR_LUMA_sse2 32, 64, ps - FILTER_HOR_LUMA_sse2 48, 64, ps - FILTER_HOR_LUMA_sse2 64, 16, ps - FILTER_HOR_LUMA_sse2 64, 32, ps - FILTER_HOR_LUMA_sse2 64, 48, ps - FILTER_HOR_LUMA_sse2 64, 64, ps - %macro PROCESS_LUMA_VER_W4_4R_sse2 0 movq m0, [r0] movq m1, [r0 + r1] @@ -556,3518 +282,107 @@ cglobal interp_8tap_vert_%1_%2x%3, 5, 7, 8 %assign x x+8 %endrep - lea r0, [r0 + 4 * r1 - 2 * %2] - lea r2, [r2 + 4 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_sse2 pp, 4, 4 - FILTER_VER_LUMA_sse2 pp, 8, 8 - FILTER_VER_LUMA_sse2 pp, 8, 4 - FILTER_VER_LUMA_sse2 pp, 4, 8 - FILTER_VER_LUMA_sse2 pp, 16, 16 - FILTER_VER_LUMA_sse2 pp, 16, 8 - FILTER_VER_LUMA_sse2 pp, 8, 16 - FILTER_VER_LUMA_sse2 pp, 16, 12 - FILTER_VER_LUMA_sse2 pp, 12, 16 - FILTER_VER_LUMA_sse2 pp, 16, 4 - FILTER_VER_LUMA_sse2 pp, 4, 16 - FILTER_VER_LUMA_sse2 pp, 32, 32 - FILTER_VER_LUMA_sse2 pp, 32, 16 - FILTER_VER_LUMA_sse2 pp, 16, 32 - FILTER_VER_LUMA_sse2 pp, 32, 24 - FILTER_VER_LUMA_sse2 pp, 24, 32 - FILTER_VER_LUMA_sse2 pp, 32, 8 - FILTER_VER_LUMA_sse2 pp, 8, 32 - FILTER_VER_LUMA_sse2 pp, 64, 64 - FILTER_VER_LUMA_sse2 pp, 64, 32 - FILTER_VER_LUMA_sse2 pp, 32, 64 - FILTER_VER_LUMA_sse2 
pp, 64, 48 - FILTER_VER_LUMA_sse2 pp, 48, 64 - FILTER_VER_LUMA_sse2 pp, 64, 16 - FILTER_VER_LUMA_sse2 pp, 16, 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_sse2 ps, 4, 4 - FILTER_VER_LUMA_sse2 ps, 8, 8 - FILTER_VER_LUMA_sse2 ps, 8, 4 - FILTER_VER_LUMA_sse2 ps, 4, 8 - FILTER_VER_LUMA_sse2 ps, 16, 16 - FILTER_VER_LUMA_sse2 ps, 16, 8 - FILTER_VER_LUMA_sse2 ps, 8, 16 - FILTER_VER_LUMA_sse2 ps, 16, 12 - FILTER_VER_LUMA_sse2 ps, 12, 16 - FILTER_VER_LUMA_sse2 ps, 16, 4 - FILTER_VER_LUMA_sse2 ps, 4, 16 - FILTER_VER_LUMA_sse2 ps, 32, 32 - FILTER_VER_LUMA_sse2 ps, 32, 16 - FILTER_VER_LUMA_sse2 ps, 16, 32 - FILTER_VER_LUMA_sse2 ps, 32, 24 - FILTER_VER_LUMA_sse2 ps, 24, 32 - FILTER_VER_LUMA_sse2 ps, 32, 8 - FILTER_VER_LUMA_sse2 ps, 8, 32 - FILTER_VER_LUMA_sse2 ps, 64, 64 - FILTER_VER_LUMA_sse2 ps, 64, 32 - FILTER_VER_LUMA_sse2 ps, 32, 64 - FILTER_VER_LUMA_sse2 ps, 64, 48 - FILTER_VER_LUMA_sse2 ps, 48, 64 - FILTER_VER_LUMA_sse2 ps, 64, 16 - FILTER_VER_LUMA_sse2 ps, 16, 64 - -%macro FILTERH_W2_4_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + r1 + %1] - movhps m4, [r0 + r1 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - movh m5, [r0 + 2 * r1 + %1] - movhps m5, [r0 + 2 * r1 + %1 + 2] - pmaddwd m5, m0 - movh m4, [r0 + r4 + %1] - movhps m4, [r0 + r4 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW 
m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movd [r2 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 * 2 + %1], m3 - psrldq m3, 4 - movd [r2 + r5 + %1], m3 -%endmacro - -%macro FILTERH_W2_3_sse3 1 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + r1 + %1] - movhps m4, [r0 + r1 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + 2 * r1 + %1] - movhps m5, [r0 + 2 * r1 + %1 + 2] - pmaddwd m5, m0 - - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m5, m5, q3120 - paddd m5, m1 - - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 - - movd [r2 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 * 2 + %1], m3 -%endmacro - -%macro FILTERH_W4_2_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + %1 + 4] - movhps m4, [r0 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + r1 + %1] - movhps m5, [r0 + r1 + %1 + 2] - pmaddwd m5, m0 - movh m4, [r0 + r1 + %1 + 4] - movhps m4, [r0 + r1 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + %1], m3 - movhps [r2 + r3 + %1], m3 -%endmacro - -%macro FILTERH_W4_1_sse3 1 - movh m3, [r0 + 2 * r1 + %1] - movhps m3, [r0 + 2 * r1 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + 2 * r1 + %1 + 4] - movhps m4, [r0 + 2 * r1 + %1 + 6] - 
pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 - movh [r2 + r3 * 2 + %1], m3 -%endmacro - -%macro FILTERH_W8_1_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + %1 + 4] - movhps m4, [r0 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + %1 + 8] - movhps m5, [r0 + %1 + 10] - pmaddwd m5, m0 - movh m4, [r0 + %1 + 12] - movhps m4, [r0 + %1 + 14] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movdqu [r2 + %1], m3 -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_HOR_CHROMA_sse3 3 -INIT_XMM sse3 -cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8 - add r3, r3 - add r1, r1 - sub r0, 2 - mov r4d, r4m - add r4d, r4d - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movddup m0, [r6 + r4 * 4] -%else - movddup m0, [tab_ChromaCoeff + r4 * 4] -%endif - -%ifidn %3, ps - mova m1, [INTERP_OFFSET_PS] - cmp r5m, byte 0 -%if %1 <= 6 - lea r4, [r1 * 3] - lea r5, [r3 * 3] -%endif - je .skip - sub r0, r1 -%if %1 <= 6 -%assign y 1 -%else -%assign y 3 -%endif -%assign z 0 -%rep y -%assign x 0 -%rep %1/8 - FILTERH_W8_1_sse3 x, %3 -%assign x x+16 -%endrep -%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z 
== 0) - FILTERH_W4_2_sse3 x, %3 - FILTERH_W4_1_sse3 x -%assign x x+8 -%endif -%if %1 == 2 || (%1 == 6 && z == 0) - FILTERH_W2_3_sse3 x -%endif -%if %1 <= 6 - lea r0, [r0 + r4] - lea r2, [r2 + r5] -%else - lea r0, [r0 + r1] - lea r2, [r2 + r3] -%endif -%assign z z+1 -%endrep -.skip: -%elifidn %3, pp - pxor m7, m7 - mova m6, [pw_pixel_max] - mova m1, [tab_c_32] -%if %1 == 2 || %1 == 6 - lea r4, [r1 * 3] - lea r5, [r3 * 3] -%endif -%endif - -%if %1 == 2 -%assign y %2/4 -%elif %1 <= 6 -%assign y %2/2 -%else -%assign y %2 -%endif -%assign z 0 -%rep y -%assign x 0 -%rep %1/8 - FILTERH_W8_1_sse3 x, %3 -%assign x x+16 -%endrep -%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0) - FILTERH_W4_2_sse3 x, %3 -%assign x x+8 -%endif -%if %1 == 2 || (%1 == 6 && (z % 2) == 0) - FILTERH_W2_4_sse3 x, %3 -%endif -%assign z z+1 -%if z < y -%if %1 == 2 - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] -%elif %1 <= 6 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] -%else - lea r0, [r0 + r1] - lea r2, [r2 + r3] -%endif -%endif ;z < y -%endrep - - RET -%endmacro - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - -FILTER_HOR_CHROMA_sse3 2, 4, pp -FILTER_HOR_CHROMA_sse3 2, 8, pp -FILTER_HOR_CHROMA_sse3 2, 16, pp -FILTER_HOR_CHROMA_sse3 4, 2, pp -FILTER_HOR_CHROMA_sse3 4, 4, pp -FILTER_HOR_CHROMA_sse3 4, 8, pp -FILTER_HOR_CHROMA_sse3 4, 16, pp -FILTER_HOR_CHROMA_sse3 4, 32, pp -FILTER_HOR_CHROMA_sse3 6, 8, pp -FILTER_HOR_CHROMA_sse3 6, 16, pp -FILTER_HOR_CHROMA_sse3 8, 2, pp -FILTER_HOR_CHROMA_sse3 8, 4, pp -FILTER_HOR_CHROMA_sse3 8, 6, pp -FILTER_HOR_CHROMA_sse3 8, 8, pp -FILTER_HOR_CHROMA_sse3 8, 12, pp -FILTER_HOR_CHROMA_sse3 8, 16, pp -FILTER_HOR_CHROMA_sse3 8, 32, pp -FILTER_HOR_CHROMA_sse3 8, 64, pp -FILTER_HOR_CHROMA_sse3 12, 16, pp -FILTER_HOR_CHROMA_sse3 12, 32, 
pp -FILTER_HOR_CHROMA_sse3 16, 4, pp -FILTER_HOR_CHROMA_sse3 16, 8, pp -FILTER_HOR_CHROMA_sse3 16, 12, pp -FILTER_HOR_CHROMA_sse3 16, 16, pp -FILTER_HOR_CHROMA_sse3 16, 24, pp -FILTER_HOR_CHROMA_sse3 16, 32, pp -FILTER_HOR_CHROMA_sse3 16, 64, pp -FILTER_HOR_CHROMA_sse3 24, 32, pp -FILTER_HOR_CHROMA_sse3 24, 64, pp -FILTER_HOR_CHROMA_sse3 32, 8, pp -FILTER_HOR_CHROMA_sse3 32, 16, pp -FILTER_HOR_CHROMA_sse3 32, 24, pp -FILTER_HOR_CHROMA_sse3 32, 32, pp -FILTER_HOR_CHROMA_sse3 32, 48, pp -FILTER_HOR_CHROMA_sse3 32, 64, pp -FILTER_HOR_CHROMA_sse3 48, 64, pp -FILTER_HOR_CHROMA_sse3 64, 16, pp -FILTER_HOR_CHROMA_sse3 64, 32, pp -FILTER_HOR_CHROMA_sse3 64, 48, pp -FILTER_HOR_CHROMA_sse3 64, 64, pp - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - -FILTER_HOR_CHROMA_sse3 2, 4, ps -FILTER_HOR_CHROMA_sse3 2, 8, ps -FILTER_HOR_CHROMA_sse3 2, 16, ps -FILTER_HOR_CHROMA_sse3 4, 2, ps -FILTER_HOR_CHROMA_sse3 4, 4, ps -FILTER_HOR_CHROMA_sse3 4, 8, ps -FILTER_HOR_CHROMA_sse3 4, 16, ps -FILTER_HOR_CHROMA_sse3 4, 32, ps -FILTER_HOR_CHROMA_sse3 6, 8, ps -FILTER_HOR_CHROMA_sse3 6, 16, ps -FILTER_HOR_CHROMA_sse3 8, 2, ps -FILTER_HOR_CHROMA_sse3 8, 4, ps -FILTER_HOR_CHROMA_sse3 8, 6, ps -FILTER_HOR_CHROMA_sse3 8, 8, ps -FILTER_HOR_CHROMA_sse3 8, 12, ps -FILTER_HOR_CHROMA_sse3 8, 16, ps -FILTER_HOR_CHROMA_sse3 8, 32, ps -FILTER_HOR_CHROMA_sse3 8, 64, ps -FILTER_HOR_CHROMA_sse3 12, 16, ps -FILTER_HOR_CHROMA_sse3 12, 32, ps -FILTER_HOR_CHROMA_sse3 16, 4, ps -FILTER_HOR_CHROMA_sse3 16, 8, ps -FILTER_HOR_CHROMA_sse3 16, 12, ps -FILTER_HOR_CHROMA_sse3 16, 16, ps -FILTER_HOR_CHROMA_sse3 16, 24, ps -FILTER_HOR_CHROMA_sse3 16, 32, ps -FILTER_HOR_CHROMA_sse3 16, 64, ps -FILTER_HOR_CHROMA_sse3 24, 32, ps -FILTER_HOR_CHROMA_sse3 24, 64, ps -FILTER_HOR_CHROMA_sse3 32, 8, ps 
-FILTER_HOR_CHROMA_sse3 32, 16, ps -FILTER_HOR_CHROMA_sse3 32, 24, ps -FILTER_HOR_CHROMA_sse3 32, 32, ps -FILTER_HOR_CHROMA_sse3 32, 48, ps -FILTER_HOR_CHROMA_sse3 32, 64, ps -FILTER_HOR_CHROMA_sse3 48, 64, ps -FILTER_HOR_CHROMA_sse3 64, 16, ps -FILTER_HOR_CHROMA_sse3 64, 32, ps -FILTER_HOR_CHROMA_sse3 64, 48, ps -FILTER_HOR_CHROMA_sse3 64, 64, ps - -%macro FILTER_P2S_2_4_sse2 1 - movd m0, [r0 + %1] - movd m2, [r0 + r1 * 2 + %1] - movhps m0, [r0 + r1 + %1] - movhps m2, [r0 + r4 + %1] - psllw m0, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m1 - psubw m2, m1 - - movd [r2 + r3 * 0 + %1], m0 - movd [r2 + r3 * 2 + %1], m2 - movhlps m0, m0 - movhlps m2, m2 - movd [r2 + r3 * 1 + %1], m0 - movd [r2 + r5 + %1], m2 -%endmacro - -%macro FILTER_P2S_4_4_sse2 1 - movh m0, [r0 + %1] - movhps m0, [r0 + r1 + %1] - psllw m0, (14 - BIT_DEPTH) - psubw m0, m1 - movh [r2 + r3 * 0 + %1], m0 - movhps [r2 + r3 * 1 + %1], m0 - - movh m2, [r0 + r1 * 2 + %1] - movhps m2, [r0 + r4 + %1] - psllw m2, (14 - BIT_DEPTH) - psubw m2, m1 - movh [r2 + r3 * 2 + %1], m2 - movhps [r2 + r5 + %1], m2 -%endmacro - -%macro FILTER_P2S_4_2_sse2 0 - movh m0, [r0] - movhps m0, [r0 + r1 * 2] - psllw m0, (14 - BIT_DEPTH) - psubw m0, [pw_2000] - movh [r2 + r3 * 0], m0 - movhps [r2 + r3 * 2], m0 -%endmacro - -%macro FILTER_P2S_8_4_sse2 1 - movu m0, [r0 + %1] - movu m2, [r0 + r1 + %1] - psllw m0, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m1 - psubw m2, m1 - movu [r2 + r3 * 0 + %1], m0 - movu [r2 + r3 * 1 + %1], m2 - - movu m3, [r0 + r1 * 2 + %1] - movu m4, [r0 + r4 + %1] - psllw m3, (14 - BIT_DEPTH) - psllw m4, (14 - BIT_DEPTH) - psubw m3, m1 - psubw m4, m1 - movu [r2 + r3 * 2 + %1], m3 - movu [r2 + r5 + %1], m4 -%endmacro - -%macro FILTER_P2S_8_2_sse2 1 - movu m0, [r0 + %1] - movu m2, [r0 + r1 + %1] - psllw m0, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m1 - psubw m2, m1 - movu [r2 + r3 * 0 + %1], m0 - movu [r2 + r3 * 1 + %1], m2 -%endmacro - 
-;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) -;----------------------------------------------------------------------------- -%macro FILTER_PIX_TO_SHORT_sse2 2 -INIT_XMM sse2 -cglobal filterPixelToShort_%1x%2, 4, 6, 3 -%if %2 == 2 -%if %1 == 4 - FILTER_P2S_4_2_sse2 -%elif %1 == 8 - add r1d, r1d - add r3d, r3d - mova m1, [pw_2000] - FILTER_P2S_8_2_sse2 0 -%endif -%else - add r1d, r1d - add r3d, r3d - mova m1, [pw_2000] - lea r4, [r1 * 3] - lea r5, [r3 * 3] -%assign y 1 -%rep %2/4 -%assign x 0 -%rep %1/8 - FILTER_P2S_8_4_sse2 x -%if %2 == 6 - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] - FILTER_P2S_8_2_sse2 x -%endif -%assign x x+16 -%endrep -%rep (%1 % 8)/4 - FILTER_P2S_4_4_sse2 x -%assign x x+8 -%endrep -%rep (%1 % 4)/2 - FILTER_P2S_2_4_sse2 x -%endrep -%if y < %2/4 - lea r0, [r0 + 4 * r1] - lea r2, [r2 + 4 * r3] -%assign y y+1 -%endif -%endrep -%endif -RET -%endmacro - - FILTER_PIX_TO_SHORT_sse2 2, 4 - FILTER_PIX_TO_SHORT_sse2 2, 8 - FILTER_PIX_TO_SHORT_sse2 2, 16 - FILTER_PIX_TO_SHORT_sse2 4, 2 - FILTER_PIX_TO_SHORT_sse2 4, 4 - FILTER_PIX_TO_SHORT_sse2 4, 8 - FILTER_PIX_TO_SHORT_sse2 4, 16 - FILTER_PIX_TO_SHORT_sse2 4, 32 - FILTER_PIX_TO_SHORT_sse2 6, 8 - FILTER_PIX_TO_SHORT_sse2 6, 16 - FILTER_PIX_TO_SHORT_sse2 8, 2 - FILTER_PIX_TO_SHORT_sse2 8, 4 - FILTER_PIX_TO_SHORT_sse2 8, 6 - FILTER_PIX_TO_SHORT_sse2 8, 8 - FILTER_PIX_TO_SHORT_sse2 8, 12 - FILTER_PIX_TO_SHORT_sse2 8, 16 - FILTER_PIX_TO_SHORT_sse2 8, 32 - FILTER_PIX_TO_SHORT_sse2 8, 64 - FILTER_PIX_TO_SHORT_sse2 12, 16 - FILTER_PIX_TO_SHORT_sse2 12, 32 - FILTER_PIX_TO_SHORT_sse2 16, 4 - FILTER_PIX_TO_SHORT_sse2 16, 8 - FILTER_PIX_TO_SHORT_sse2 16, 12 - FILTER_PIX_TO_SHORT_sse2 16, 16 - FILTER_PIX_TO_SHORT_sse2 16, 24 - FILTER_PIX_TO_SHORT_sse2 16, 32 - FILTER_PIX_TO_SHORT_sse2 16, 64 - FILTER_PIX_TO_SHORT_sse2 24, 32 - FILTER_PIX_TO_SHORT_sse2 24, 64 - FILTER_PIX_TO_SHORT_sse2 32, 
8 - FILTER_PIX_TO_SHORT_sse2 32, 16 - FILTER_PIX_TO_SHORT_sse2 32, 24 - FILTER_PIX_TO_SHORT_sse2 32, 32 - FILTER_PIX_TO_SHORT_sse2 32, 48 - FILTER_PIX_TO_SHORT_sse2 32, 64 - FILTER_PIX_TO_SHORT_sse2 48, 64 - FILTER_PIX_TO_SHORT_sse2 64, 16 - FILTER_PIX_TO_SHORT_sse2 64, 32 - FILTER_PIX_TO_SHORT_sse2 64, 48 - FILTER_PIX_TO_SHORT_sse2 64, 64 - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -%macro FILTER_HOR_LUMA_W4 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - add r1, r1 - add r3, r3 - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif - -%ifidn %3, pp - mova m1, [pd_32] - pxor m6, m6 - mova m7, [pw_pixel_max] -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: - movu m2, [r0] ; m2 = src[0-7] - movu m3, [r0 + 16] ; m3 = src[8-15] - - pmaddwd m4, m2, m0 - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m3, m2, 6 ; m3 = src[3-10] - pmaddwd m3, m0 - phaddd m5, m3 - - phaddd m4, m5 - paddd m4, m1 -%ifidn %3, pp - psrad m4, 6 - packusdw m4, m4 - CLIPW m4, m6, m7 -%else - psrad m4, INTERP_SHIFT_PS - packssdw m4, m4 -%endif - - movh [r2], m4 - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx 
-;------------------------------------------------------------------------------------------------------------ -FILTER_HOR_LUMA_W4 4, 4, pp -FILTER_HOR_LUMA_W4 4, 8, pp -FILTER_HOR_LUMA_W4 4, 16, pp - -;--------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;--------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W4 4, 4, ps -FILTER_HOR_LUMA_W4 4, 8, ps -FILTER_HOR_LUMA_W4 4, 16, ps - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -%macro FILTER_HOR_LUMA_W8 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - - add r1, r1 - add r3, r3 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif - -%ifidn %3, pp - mova m1, [pd_32] - pxor m7, m7 -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: - movu m2, [r0] ; m2 = src[0-7] - movu m3, [r0 + 16] ; m3 = src[8-15] - - pmaddwd m4, m2, m0 - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m3, m2, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m3, m2, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m3, m2, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m3, m2, 12 ; m6 = 
src[6-13] - pmaddwd m6, m0 - palignr m3, m2, 14 ; m3 = src[7-14] - pmaddwd m3, m0 - phaddd m6, m3 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, 6 - psrad m5, 6 - packusdw m4, m5 - CLIPW m4, m7, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - - movu [r2], m4 - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------ -; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------ -FILTER_HOR_LUMA_W8 8, 4, pp -FILTER_HOR_LUMA_W8 8, 8, pp -FILTER_HOR_LUMA_W8 8, 16, pp -FILTER_HOR_LUMA_W8 8, 32, pp - -;--------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;--------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W8 8, 4, ps -FILTER_HOR_LUMA_W8 8, 8, ps -FILTER_HOR_LUMA_W8 8, 16, ps -FILTER_HOR_LUMA_W8 8, 32, ps - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W12 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - - add r1, r1 - add r3, r3 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif -%ifidn %3, pp - mova m1, 
[INTERP_OFFSET_PP] -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: - movu m2, [r0] ; m2 = src[0-7] - movu m3, [r0 + 16] ; m3 = src[8-15] - - pmaddwd m4, m2, m0 - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m3, m2, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m3, m2, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m3, m2, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m3, m2, 12 ; m6 = src[6-13] - pmaddwd m6, m0 - palignr m7, m3, m2, 14 ; m2 = src[7-14] - pmaddwd m7, m0 - phaddd m6, m7 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - - movu [r2], m4 - - movu m2, [r0 + 32] ; m2 = src[16-23] - - pmaddwd m4, m3, m0 ; m3 = src[8-15] - palignr m5, m2, m3, 2 ; m5 = src[9-16] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m2, m3, 4 ; m5 = src[10-17] - pmaddwd m5, m0 - palignr m2, m3, 6 ; m2 = src[11-18] - pmaddwd m2, m0 - phaddd m5, m2 - phaddd m4, m5 - paddd m4, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - packusdw m4, m4 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - packssdw m4, m4 -%endif - - movh [r2 + 16], m4 - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- 
-FILTER_HOR_LUMA_W12 12, 16, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W12 12, 16, ps - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W16 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - - add r1, r1 - add r3, r3 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif - -%ifidn %3, pp - mova m1, [pd_32] -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: -%assign x 0 -%rep %1 / 16 - movu m2, [r0 + x] ; m2 = src[0-7] - movu m3, [r0 + 16 + x] ; m3 = src[8-15] - - pmaddwd m4, m2, m0 - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m3, m2, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m3, m2, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m3, m2, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m3, m2, 12 ; m6 = src[6-13] - pmaddwd m6, m0 - palignr m7, m3, m2, 14 ; m2 = src[7-14] - pmaddwd m7, m0 - phaddd m6, m7 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, 
INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - movu [r2 + x], m4 - - movu m2, [r0 + 32 + x] ; m2 = src[16-23] - - pmaddwd m4, m3, m0 ; m3 = src[8-15] - palignr m5, m2, m3, 2 ; m5 = src[9-16] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m2, m3, 4 ; m5 = src[10-17] - pmaddwd m5, m0 - palignr m6, m2, m3, 6 ; m6 = src[11-18] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m2, m3, 8 ; m5 = src[12-19] - pmaddwd m5, m0 - palignr m6, m2, m3, 10 ; m6 = src[13-20] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m2, m3, 12 ; m6 = src[14-21] - pmaddwd m6, m0 - palignr m2, m3, 14 ; m3 = src[15-22] - pmaddwd m2, m0 - phaddd m6, m2 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - movu [r2 + 16 + x], m4 - -%assign x x+32 -%endrep - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 16, 4, pp -FILTER_HOR_LUMA_W16 16, 8, pp -FILTER_HOR_LUMA_W16 16, 12, pp -FILTER_HOR_LUMA_W16 16, 16, pp -FILTER_HOR_LUMA_W16 16, 32, pp -FILTER_HOR_LUMA_W16 16, 64, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 16, 4, ps -FILTER_HOR_LUMA_W16 16, 8, ps -FILTER_HOR_LUMA_W16 16, 12, ps -FILTER_HOR_LUMA_W16 16, 16, ps -FILTER_HOR_LUMA_W16 16, 32, ps -FILTER_HOR_LUMA_W16 16, 64, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 32, 8, pp -FILTER_HOR_LUMA_W16 32, 16, pp -FILTER_HOR_LUMA_W16 32, 24, pp -FILTER_HOR_LUMA_W16 32, 32, pp -FILTER_HOR_LUMA_W16 32, 64, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 32, 8, ps -FILTER_HOR_LUMA_W16 32, 16, ps -FILTER_HOR_LUMA_W16 32, 24, ps -FILTER_HOR_LUMA_W16 32, 32, ps -FILTER_HOR_LUMA_W16 32, 64, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 48, 64, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 48, 64, ps - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 64, 16, pp -FILTER_HOR_LUMA_W16 64, 32, pp -FILTER_HOR_LUMA_W16 64, 48, pp -FILTER_HOR_LUMA_W16 64, 64, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W16 64, 16, ps -FILTER_HOR_LUMA_W16 64, 32, ps -FILTER_HOR_LUMA_W16 64, 48, ps -FILTER_HOR_LUMA_W16 64, 64, ps - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W24 3 -INIT_XMM sse4 -cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 - - add r1, r1 - add r3, r3 - mov r4d, r4m - sub r0, 6 - shl r4d, 4 - -%ifdef PIC - lea r6, [tab_LumaCoeff] - mova m0, [r6 + r4] -%else - mova m0, [tab_LumaCoeff + r4] -%endif -%ifidn %3, pp - mova m1, [pd_32] -%else - mova m1, [INTERP_OFFSET_PS] -%endif - - mov r4d, %2 -%ifidn %3, ps - cmp r5m, byte 0 - je .loopH - lea r6, [r1 + 2 * r1] - sub r0, r6 - add r4d, 7 -%endif - -.loopH: - movu m2, [r0] ; m2 = src[0-7] - movu m3, [r0 
+ 16] ; m3 = src[8-15] - - pmaddwd m4, m2, m0 - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m3, m2, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m3, m2, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m3, m2, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m3, m2, 12 ; m6 = src[6-13] - pmaddwd m6, m0 - palignr m7, m3, m2, 14 ; m7 = src[7-14] - pmaddwd m7, m0 - phaddd m6, m7 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - movu [r2], m4 - - movu m2, [r0 + 32] ; m2 = src[16-23] - - pmaddwd m4, m3, m0 ; m3 = src[8-15] - palignr m5, m2, m3, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m2, m3, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m2, m3, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd m4, m1 - - palignr m5, m2, m3, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m2, m3, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m2, m3, 12 ; m6 = src[6-13] - pmaddwd m6, m0 - palignr m7, m2, m3, 14 ; m7 = src[7-14] - pmaddwd m7, m0 - phaddd m6, m7 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - movu [r2 + 16], m4 - - movu m3, [r0 + 48] ; m3 = src[24-31] - - pmaddwd m4, m2, m0 ; m2 = src[16-23] - palignr m5, m3, m2, 2 ; m5 = src[1-8] - pmaddwd m5, m0 - phaddd m4, m5 - - palignr m5, m3, m2, 4 ; m5 = src[2-9] - pmaddwd m5, m0 - palignr m6, m3, m2, 6 ; m6 = src[3-10] - pmaddwd m6, m0 - phaddd m5, m6 - phaddd m4, m5 - paddd 
m4, m1 - - palignr m5, m3, m2, 8 ; m5 = src[4-11] - pmaddwd m5, m0 - palignr m6, m3, m2, 10 ; m6 = src[5-12] - pmaddwd m6, m0 - phaddd m5, m6 - - palignr m6, m3, m2, 12 ; m6 = src[6-13] - pmaddwd m6, m0 - palignr m7, m3, m2, 14 ; m7 = src[7-14] - pmaddwd m7, m0 - phaddd m6, m7 - phaddd m5, m6 - paddd m5, m1 -%ifidn %3, pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m4, m5 - pxor m5, m5 - CLIPW m4, m5, [pw_pixel_max] -%else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m4, m5 -%endif - movu [r2 + 32], m4 - - add r0, r1 - add r2, r3 - - dec r4d - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W24 24, 32, pp - -;---------------------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;---------------------------------------------------------------------------------------------------------------------------- -FILTER_HOR_LUMA_W24 24, 32, ps - -%macro FILTER_W2_2 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + r1] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - packusdw m3, m3 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 -%endif - movd [r2], m3 - pextrd [r2 + r3], m3, 1 -%endmacro - -%macro FILTER_W4_2 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + r1] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu 
m4, [r0 + r1 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + r3], m3 -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W4_avx2 1 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_4x%1, 4,7,7 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - lea r6, [pw_pixel_max] - mova m3, [interp8_hpp_shuf] - mova m6, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, %1/2 - -.loop: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - phaddd m4, m4 - vpermq m4, m4, q3120 - paddd m4, m6 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [r6] - movq [r2], xm4 - - vbroadcasti128 m4, [r0 + r1] - vbroadcasti128 m5, [r0 + r1 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - phaddd m4, m4 - vpermq m4, m4, q3120 - paddd m4, m6 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [r6] - movq [r2 + r3], xm4 - - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loop - RET -%endmacro -FILTER_HOR_LUMA_W4_avx2 4 -FILTER_HOR_LUMA_W4_avx2 8 -FILTER_HOR_LUMA_W4_avx2 16 - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W8 1 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_8x%1, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, %1/2 - -.loop: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8] - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + r1] - vbroadcasti128 m5, [r0 + r1 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + r1 + 8] - vbroadcasti128 m6, [r0 + r1 + 16] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + r3], xm4 - - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loop - RET -%endmacro -FILTER_HOR_LUMA_W8 4 -FILTER_HOR_LUMA_W8 8 -FILTER_HOR_LUMA_W8 16 -FILTER_HOR_LUMA_W8 32 - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W16 1 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_16x%1, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, %1 - -.loop: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8] - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 24] - vbroadcasti128 m6, [r0 + 32] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 16], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET -%endmacro -FILTER_HOR_LUMA_W16 4 -FILTER_HOR_LUMA_W16 8 -FILTER_HOR_LUMA_W16 12 -FILTER_HOR_LUMA_W16 16 -FILTER_HOR_LUMA_W16 32 -FILTER_HOR_LUMA_W16 64 - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_HOR_LUMA_W32 2 -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, %2 - -.loop: -%assign x 0 -%rep %1/16 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 8 + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8 + x] - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + x], xm4 - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m5, [r0 + 24 + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 24 + x] - vbroadcasti128 m6, [r0 + 32 + x] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 16 + x], xm4 - -%assign x x+32 -%endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET -%endmacro -FILTER_HOR_LUMA_W32 32, 8 -FILTER_HOR_LUMA_W32 32, 16 -FILTER_HOR_LUMA_W32 32, 
24 -FILTER_HOR_LUMA_W32 32, 32 -FILTER_HOR_LUMA_W32 32, 64 -FILTER_HOR_LUMA_W32 64, 16 -FILTER_HOR_LUMA_W32 64, 32 -FILTER_HOR_LUMA_W32 64, 48 -FILTER_HOR_LUMA_W32 64, 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_12x16, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, 16 - -.loop: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8] - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 24] - vbroadcasti128 m6, [r0 + 32] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movq [r2 + 16], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET - 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_24x32, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, 32 - -.loop: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8] - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 24] - vbroadcasti128 m6, [r0 + 32] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 40] - vbroadcasti128 m6, [r0 + 48] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - 
pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 32], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_8tap_horiz_pp_48x64, 4,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 6 - mov r4d, r4m - shl r4d, 4 -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastq m0, [r5 + r4] - vpbroadcastq m1, [r5 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - mova m7, [pd_32] - pxor m2, m2 - - ; register map - ; m0 , m1 interpolate coeff - - mov r4d, 64 - -.loop: -%assign x 0 -%rep 2 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 8 + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 8 + x] - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + x], xm4 - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m5, [r0 + 24 + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 24 + x] - vbroadcasti128 m6, [r0 + 32 + x] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, 
q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 16 + x], xm4 - - vbroadcasti128 m4, [r0 + 32 + x] - vbroadcasti128 m5, [r0 + 40 + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m5, m1 - paddd m4, m5 - - vbroadcasti128 m5, [r0 + 40 + x] - vbroadcasti128 m6, [r0 + 48 + x] - pshufb m5, m3 - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m7 - psrad m4, INTERP_SHIFT_PP - - packusdw m4, m4 - vpermq m4, m4, q2020 - CLIPW m4, m2, [pw_pixel_max] - movu [r2 + 32 + x], xm4 - -%assign x x+48 -%endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop - RET - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_CHROMA_H 6 -INIT_XMM sse4 -cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5 - - add r3, r3 - add r1, r1 - sub r0, 2 - mov r4d, r4m - add r4d, r4d - -%ifdef PIC - lea r%6, [tab_ChromaCoeff] - movh m0, [r%6 + r4 * 4] -%else - movh m0, [tab_ChromaCoeff + r4 * 4] -%endif - - punpcklqdq m0, m0 - mova m2, [tab_Tm16] - -%ifidn %3, ps - mova m1, [INTERP_OFFSET_PS] - cmp r5m, byte 0 - je .skip - sub r0, r1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - - %if %1 == 4 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - %else - phaddd m3, m3 - %endif - - paddd m3, m1 - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 - - %if %1 == 2 - movd [r2], m3 - %else - movh [r2], m3 - %endif - - add r0, r1 - add r2, r3 - FILTER_W%1_2 %3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - -.skip: - -%else ;%ifidn %3, ps - pxor m7, m7 - mova m6, [pw_pixel_max] - mova m1, [tab_c_32] -%endif ;%ifidn %3, ps - - FILTER_W%1_2 %3 - -%rep (%2/2) - 1 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - FILTER_W%1_2 %3 -%endrep - RET -%endmacro - -FILTER_CHROMA_H 2, 4, 
pp, 6, 8, 5 -FILTER_CHROMA_H 2, 8, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 2, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 4, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 8, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 16, pp, 6, 8, 5 - -FILTER_CHROMA_H 2, 4, ps, 7, 5, 6 -FILTER_CHROMA_H 2, 8, ps, 7, 5, 6 -FILTER_CHROMA_H 4, 2, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 4, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 8, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 16, ps, 7, 6, 6 - -FILTER_CHROMA_H 2, 16, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 32, pp, 6, 8, 5 -FILTER_CHROMA_H 2, 16, ps, 7, 5, 6 -FILTER_CHROMA_H 4, 32, ps, 7, 6, 6 - - -%macro FILTER_W6_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m4, [r0 + 8] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m4, m4 - paddd m4, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m4, INTERP_SHIFT_PP - packusdw m3, m4 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m4, INTERP_SHIFT_PS - packssdw m3, m4 -%endif - movh [r2], m3 - pextrd [r2 + 8], m3, 2 -%endmacro - -cglobal chroma_filter_pp_6x1_internal - FILTER_W6_1 pp - ret - -cglobal chroma_filter_ps_6x1_internal - FILTER_W6_1 ps - ret - -%macro FILTER_W8_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 -%endmacro - -cglobal chroma_filter_pp_8x1_internal - FILTER_W8_1 pp - ret - -cglobal chroma_filter_ps_8x1_internal - FILTER_W8_1 ps - ret - -%macro FILTER_W12_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, 
m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - packusdw m3, m3 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 -%endif - movh [r2 + 16], m3 -%endmacro - -cglobal chroma_filter_pp_12x1_internal - FILTER_W12_1 pp - ret - -cglobal chroma_filter_ps_12x1_internal - FILTER_W12_1 ps - ret - -%macro FILTER_W16_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 -%endmacro - -cglobal 
chroma_filter_pp_16x1_internal - FILTER_W16_1 pp - ret - -cglobal chroma_filter_ps_16x1_internal - FILTER_W16_1 ps - ret - -%macro FILTER_W24_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 - - movu m3, [r0 + 32] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 36] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 40] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 44] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 32], m3 - movhps [r2 + 40], m3 -%endmacro - -cglobal chroma_filter_pp_24x1_internal - FILTER_W24_1 pp - ret - -cglobal chroma_filter_ps_24x1_internal - FILTER_W24_1 ps - ret - -%macro FILTER_W32_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb 
m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 - - movu m3, [r0 + 32] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 36] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 40] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 44] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 32], m3 - movhps [r2 + 40], m3 - - movu m3, [r0 + 48] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 52] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 56] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 60] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw 
m3, m5 -%endif - movh [r2 + 48], m3 - movhps [r2 + 56], m3 -%endmacro - -cglobal chroma_filter_pp_32x1_internal - FILTER_W32_1 pp - ret - -cglobal chroma_filter_ps_32x1_internal - FILTER_W32_1 ps - ret - -%macro FILTER_W8o_1 2 - movu m3, [r0 + %2] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + %2 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + %2 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + %2 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + %2], m3 - movhps [r2 + %2 + 8], m3 -%endmacro - -%macro FILTER_W48_1 1 - FILTER_W8o_1 %1, 0 - FILTER_W8o_1 %1, 16 - FILTER_W8o_1 %1, 32 - FILTER_W8o_1 %1, 48 - FILTER_W8o_1 %1, 64 - FILTER_W8o_1 %1, 80 -%endmacro - -cglobal chroma_filter_pp_48x1_internal - FILTER_W48_1 pp - ret - -cglobal chroma_filter_ps_48x1_internal - FILTER_W48_1 ps - ret - -%macro FILTER_W64_1 1 - FILTER_W8o_1 %1, 0 - FILTER_W8o_1 %1, 16 - FILTER_W8o_1 %1, 32 - FILTER_W8o_1 %1, 48 - FILTER_W8o_1 %1, 64 - FILTER_W8o_1 %1, 80 - FILTER_W8o_1 %1, 96 - FILTER_W8o_1 %1, 112 -%endmacro - -cglobal chroma_filter_pp_64x1_internal - FILTER_W64_1 pp - ret - -cglobal chroma_filter_ps_64x1_internal - FILTER_W64_1 ps - ret - - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - -INIT_XMM sse4 -%macro IPFILTER_CHROMA 6 -cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6 - - add r3, r3 - add r1, r1 - sub r0, 2 - mov r4d, r4m - add r4d, r4d - -%ifdef PIC - lea r%4, [tab_ChromaCoeff] - movh m0, [r%4 + r4 * 4] -%else - movh m0, [tab_ChromaCoeff + r4 * 4] -%endif - - 
punpcklqdq m0, m0 - mova m2, [tab_Tm16] - -%ifidn %3, ps - mova m1, [INTERP_OFFSET_PS] - cmp r5m, byte 0 - je .skip - sub r0, r1 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 -.skip: -%else - mova m1, [tab_c_32] - pxor m6, m6 - mova m7, [pw_pixel_max] -%endif - - call chroma_filter_%3_%1x1_internal -%rep %2 - 1 - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal -%endrep -RET -%endmacro -IPFILTER_CHROMA 6, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 2, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 4, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 6, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 12, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 4, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 12, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 24, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 24, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 32, pp, 5, 6, 8 - -IPFILTER_CHROMA 6, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 2, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 4, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 6, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 12, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 4, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 12, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 24, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 24, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 32, ps, 6, 7, 6 - -IPFILTER_CHROMA 6, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 12, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 12, 32, pp, 5, 6, 8 
-IPFILTER_CHROMA 16, 24, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 24, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 48, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 6, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 12, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 12, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 24, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 24, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 48, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 64, ps, 6, 7, 6 - -IPFILTER_CHROMA 48, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 48, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 48, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 48, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 16, ps, 6, 7, 6 - - -%macro PROCESS_CHROMA_SP_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 32] - paddd m0, m4 ;m0=[0+1+2+3] Row1 done - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 32] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m4, [r6 + 1 * 32] - paddd m2, m4 ;m2=[2+3+4+5] Row3 - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m5, [r6 + 1 * 32] - paddd m3, m5 ;m3=[3+4+5+6] Row4 -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx 
-;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_6xN 1 -cglobal interp_4tap_horiz_pp_6x%1, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1/2 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2], xm3 - pextrd [r2 + 8], xm3, 2 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2 + r3], xm3 - pextrd [r2 + r3 + 8], xm3, 2 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop - RET -%endmacro -IPFILTER_CHROMA_avx2_6xN 8 -IPFILTER_CHROMA_avx2_6xN 16 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x2, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, 
[tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x4, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - -%rep 2 - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb 
m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] -%endrep - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_8xN 1 -cglobal interp_4tap_horiz_pp_8x%1, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1/2 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop - RET -%endmacro -IPFILTER_CHROMA_avx2_8xN 6 -IPFILTER_CHROMA_avx2_8xN 8 -IPFILTER_CHROMA_avx2_8xN 12 -IPFILTER_CHROMA_avx2_8xN 16 -IPFILTER_CHROMA_avx2_8xN 32 
-IPFILTER_CHROMA_avx2_8xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_16xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_16x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m8, [r0 + 24] - - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m4, m4 - vpermq m4, m4, q2020 - pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - CLIPW m3, m5, m7 - movu [r2], m3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_16xN 4 -IPFILTER_CHROMA_avx2_16xN 8 -IPFILTER_CHROMA_avx2_16xN 12 -IPFILTER_CHROMA_avx2_16xN 16 -IPFILTER_CHROMA_avx2_16xN 24 -IPFILTER_CHROMA_avx2_16xN 32 -IPFILTER_CHROMA_avx2_16xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx 
-;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_32xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_32x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r6d, %1 -.loop: -%assign x 0 -%rep 2 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m4, m4 - vpermq m4, m4, q2020 - pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - CLIPW m3, m5, m7 - movu [r2 + x], m3 - %assign x x+32 - %endrep - - add r0, r1 - add r2, r3 - dec r6d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_32xN 8 -IPFILTER_CHROMA_avx2_32xN 16 -IPFILTER_CHROMA_avx2_32xN 24 -IPFILTER_CHROMA_avx2_32xN 32 -IPFILTER_CHROMA_avx2_32xN 48 -IPFILTER_CHROMA_avx2_32xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_12xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_12x%1, 5,6,8 - add r1d, r1d - add 
r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + 16] - vbroadcasti128 m4, [r0 + 24] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2 + 16], xm3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_12xN 16 -IPFILTER_CHROMA_avx2_12xN 32 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_24xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_24x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - 
pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m8, [r0 + 24] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 - - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 32] - vbroadcasti128 m4, [r0 + 40] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 - CLIPW xm3, xm5, xm7 - movu [r2 + 32], xm3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_24xN 32 -IPFILTER_CHROMA_avx2_24xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_64xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_64x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r6d, %1 -.loop: -%assign x 0 -%rep 4 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 + lea r0, [r0 + 4 * r1 - 2 * %2] + lea r2, [r2 + 4 * r3] - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 + dec r4d + 
jnz .loopH - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2 + x], m3 - %assign x x+32 - %endrep - - add r0, r1 - add r2, r3 - dec r6d - jnz .loop RET -%endif %endmacro -IPFILTER_CHROMA_avx2_64xN 16 -IPFILTER_CHROMA_avx2_64xN 32 -IPFILTER_CHROMA_avx2_64xN 48 -IPFILTER_CHROMA_avx2_64xN 64 ;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_48x64, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] + FILTER_VER_LUMA_sse2 pp, 4, 4 + FILTER_VER_LUMA_sse2 pp, 8, 8 + FILTER_VER_LUMA_sse2 pp, 8, 4 + FILTER_VER_LUMA_sse2 pp, 4, 8 + FILTER_VER_LUMA_sse2 pp, 16, 16 + FILTER_VER_LUMA_sse2 pp, 16, 8 + FILTER_VER_LUMA_sse2 pp, 8, 16 + FILTER_VER_LUMA_sse2 pp, 16, 12 + FILTER_VER_LUMA_sse2 pp, 12, 16 + FILTER_VER_LUMA_sse2 pp, 16, 4 + FILTER_VER_LUMA_sse2 pp, 4, 16 + FILTER_VER_LUMA_sse2 pp, 32, 32 + FILTER_VER_LUMA_sse2 pp, 32, 16 + FILTER_VER_LUMA_sse2 pp, 16, 32 + FILTER_VER_LUMA_sse2 pp, 32, 24 + FILTER_VER_LUMA_sse2 pp, 24, 32 + FILTER_VER_LUMA_sse2 pp, 32, 8 + FILTER_VER_LUMA_sse2 pp, 8, 32 + FILTER_VER_LUMA_sse2 pp, 64, 64 + FILTER_VER_LUMA_sse2 pp, 64, 32 + FILTER_VER_LUMA_sse2 pp, 32, 64 + FILTER_VER_LUMA_sse2 pp, 64, 48 + FILTER_VER_LUMA_sse2 pp, 48, 64 + FILTER_VER_LUMA_sse2 pp, 64, 16 + FILTER_VER_LUMA_sse2 pp, 16, 64 - mov 
r4d, 64 -.loop: -%assign x 0 -%rep 3 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_sse2 ps, 4, 4 + FILTER_VER_LUMA_sse2 ps, 8, 8 + FILTER_VER_LUMA_sse2 ps, 8, 4 + FILTER_VER_LUMA_sse2 ps, 4, 8 + FILTER_VER_LUMA_sse2 ps, 16, 16 + FILTER_VER_LUMA_sse2 ps, 16, 8 + FILTER_VER_LUMA_sse2 ps, 8, 16 + FILTER_VER_LUMA_sse2 ps, 16, 12 + FILTER_VER_LUMA_sse2 ps, 12, 16 + FILTER_VER_LUMA_sse2 ps, 16, 4 + FILTER_VER_LUMA_sse2 ps, 4, 16 + FILTER_VER_LUMA_sse2 ps, 32, 32 + FILTER_VER_LUMA_sse2 ps, 32, 16 + FILTER_VER_LUMA_sse2 ps, 16, 32 + FILTER_VER_LUMA_sse2 ps, 32, 24 + FILTER_VER_LUMA_sse2 ps, 24, 32 + FILTER_VER_LUMA_sse2 ps, 32, 8 + FILTER_VER_LUMA_sse2 ps, 8, 32 + FILTER_VER_LUMA_sse2 ps, 64, 64 + FILTER_VER_LUMA_sse2 ps, 64, 32 + FILTER_VER_LUMA_sse2 ps, 32, 64 + FILTER_VER_LUMA_sse2 ps, 64, 48 + FILTER_VER_LUMA_sse2 ps, 48, 64 + FILTER_VER_LUMA_sse2 ps, 64, 16 + FILTER_VER_LUMA_sse2 ps, 16, 64 - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1 - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2 - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2 + x], m3 -%assign x x+32 -%endrep + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + 
pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 32] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 32] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 32] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 32] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro ;----------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) @@ -10275,1247 +6590,6 @@ cglobal filterPixelToShort_48x64, 3, 7, 4 RET -;----------------------------------------------------------------------------------------------------------------------------- -;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- - -%macro IPFILTER_LUMA_PS_4xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 - mov r5d, r5m - mov r4d, r4m - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r6, [tab_LumaCoeff] - lea r4, [r4 * 8] - vbroadcasti128 m0, [r6 + r4 * 2] -%else - lea r4, [r4 * 8] - vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2] -%endif - - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - pw_2000 - - sub r0, 6 - test r5d, r5d - mov r7d, %1 ; loop count variable - height - jz .preloop - lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src) - 3 * srcStride - add r7d, 6 ;7 - 1(since last row not in loop) ; need 
extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) - -.preloop: - lea r6, [r3 * 3] -.loop: - ; Row 0 - movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - movu xm4, [r0 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - movu xm4, [r0 + 4] - movu xm5, [r0 + 6] - vinserti128 m4, m4, xm5, 1 - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 1 - movu xm4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - movu xm5, [r0 + r1 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m4, m4, xm5, 1 - movu xm5, [r0 + r1 + 4] - movu xm6, [r0 + r1 + 6] - vinserti128 m5, m5, xm6, 1 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - phaddd m3, m4 ; all rows and col completed. - - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - paddd m3, m2 - vextracti128 xm4, m3, 1 - psrad xm3, INTERP_SHIFT_PS - psrad xm4, INTERP_SHIFT_PS - packssdw xm3, xm3 - packssdw xm4, xm4 - - movq [r2], xm3 ;row 0 - movq [r2 + r3], xm4 ;row 1 - lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) - lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) - - sub r7d, 2 - jg .loop - test r5d, r5d - jz .end - - ; Row 10 - movu xm3, [r0] - movu xm4, [r0 + 2] - vinserti128 m3, m3, xm4, 1 - movu xm4, [r0 + 4] - movu xm5, [r0 + 6] - vinserti128 m4, m4, xm5, 1 - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - - ; Row11 - phaddd m3, m4 ; all rows and col completed. 
- - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - paddd m3, m2 - vextracti128 xm4, m3, 1 - psrad xm3, INTERP_SHIFT_PS - psrad xm4, INTERP_SHIFT_PS - packssdw xm3, xm3 - packssdw xm4, xm4 - - movq [r2], xm3 ;row 0 -.end: - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_4xN_AVX2 4 - IPFILTER_LUMA_PS_4xN_AVX2 8 - IPFILTER_LUMA_PS_4xN_AVX2 16 - -%macro IPFILTER_LUMA_PS_8xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %1 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m6, m3 - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - - movu [r2], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_8xN_AVX2 4 - IPFILTER_LUMA_PS_8xN_AVX2 8 - IPFILTER_LUMA_PS_8xN_AVX2 16 - IPFILTER_LUMA_PS_8xN_AVX2 32 - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_24x32, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, 
[INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, 32 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: -%assign x 0 -%rep 24/8 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 8 + x] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m5, m3 - pshufb m6, m3 - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - - movu [r2 + x], xm4 - %assign x x+16 - %endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif - - -%macro IPFILTER_LUMA_PS_32_64_AVX2 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8 - - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 6 -%ifdef PIC - lea r6, [tab_LumaCoeffV] - movu m0, [r6 + r4] - movu m1, [r6 + r4 + mmsize] -%else - movu m0, [tab_LumaCoeffV + r4] - movu m1, [tab_LumaCoeffV + r4 + mmsize] -%endif - mova m3, [interp8_hpp_shuf_new] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %2 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: -%assign x 0 -%rep %1/16 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m7, m5, m1 - paddd m4, m7 - vextracti128 xm7, m4, 1 - paddd xm4, xm7 - paddd xm4, xm2 - psrad xm4, INTERP_SHIFT_PS - - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m7, m6, m1 - paddd m5, m7 - vextracti128 xm7, m5, 1 - paddd xm5, xm7 - paddd xm5, xm2 - psrad xm5, INTERP_SHIFT_PS - - packssdw xm4, xm5 - movu [r2 + x], xm4 - - vbroadcasti128 m5, [r0 + 24 + x] - pshufb m5, m3 - - pmaddwd m6, m0 - pmaddwd m7, m5, m1 - paddd m6, m7 - vextracti128 
xm7, m6, 1 - paddd xm6, xm7 - paddd xm6, xm2 - psrad xm6, INTERP_SHIFT_PS - - vbroadcasti128 m7, [r0 + 32 + x] - pshufb m7, m3 - - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - vextracti128 xm7, m5, 1 - paddd xm5, xm7 - paddd xm5, xm2 - psrad xm5, INTERP_SHIFT_PS - - packssdw xm6, xm5 - movu [r2 + 16 + x], xm6 - -%assign x x+32 -%endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_32_64_AVX2 32, 8 - IPFILTER_LUMA_PS_32_64_AVX2 32, 16 - IPFILTER_LUMA_PS_32_64_AVX2 32, 24 - IPFILTER_LUMA_PS_32_64_AVX2 32, 32 - IPFILTER_LUMA_PS_32_64_AVX2 32, 64 - - IPFILTER_LUMA_PS_32_64_AVX2 64, 16 - IPFILTER_LUMA_PS_32_64_AVX2 64, 32 - IPFILTER_LUMA_PS_32_64_AVX2 64, 48 - IPFILTER_LUMA_PS_32_64_AVX2 64, 64 - - IPFILTER_LUMA_PS_32_64_AVX2 48, 64 - -%macro IPFILTER_LUMA_PS_16xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 - - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %1 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5, m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m7, m5, m3 - pmaddwd m6, m0 - pmaddwd m7, m1 - paddd m6, m7 - - vbroadcasti128 m7, [r0 + 
32] - pshufb m5, m3 - pshufb m7, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m6, m5 - vpermq m6, m6, q3120 - paddd m6, m2 - vextracti128 xm5,m6, 1 - psrad xm6, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm6, xm5 - movu [r2 + 16], xm6 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_16xN_AVX2 4 - IPFILTER_LUMA_PS_16xN_AVX2 8 - IPFILTER_LUMA_PS_16xN_AVX2 12 - IPFILTER_LUMA_PS_16xN_AVX2 16 - IPFILTER_LUMA_PS_16xN_AVX2 32 - IPFILTER_LUMA_PS_16xN_AVX2 64 - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, 16 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m5, m3 - pmaddwd m6, m0 - pmaddwd m5, m1 - paddd m6, m5 - - phaddd m6, m6 - vpermq m6, m6, q3120 - paddd xm6, xm2 - psrad xm6, INTERP_SHIFT_PS - packssdw xm6, xm6 - movq [r2 + 16], xm6 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif - -%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6 - add r1d, r1d - 
add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_CHROMA_PS_8xN_AVX2 4 - IPFILTER_CHROMA_PS_8xN_AVX2 8 - IPFILTER_CHROMA_PS_8xN_AVX2 16 - IPFILTER_CHROMA_PS_8xN_AVX2 32 - IPFILTER_CHROMA_PS_8xN_AVX2 6 - IPFILTER_CHROMA_PS_8xN_AVX2 2 - IPFILTER_CHROMA_PS_8xN_AVX2 12 - IPFILTER_CHROMA_PS_8xN_AVX2 64 - -%macro IPFILTER_CHROMA_PS_16xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, 
INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_16xN_AVX2 16 -IPFILTER_CHROMA_PS_16xN_AVX2 8 -IPFILTER_CHROMA_PS_16xN_AVX2 32 -IPFILTER_CHROMA_PS_16xN_AVX2 12 -IPFILTER_CHROMA_PS_16xN_AVX2 4 -IPFILTER_CHROMA_PS_16xN_AVX2 64 -IPFILTER_CHROMA_PS_16xN_AVX2 24 - -%macro IPFILTER_CHROMA_PS_24xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_24x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_24xN_AVX2 32 -IPFILTER_CHROMA_PS_24xN_AVX2 64 - -%macro IPFILTER_CHROMA_PS_12xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal 
interp_4tap_horiz_ps_12x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m3 - pmaddwd m4, m0 - phaddd m4, m4 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movq [r2 + 16], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_12xN_AVX2 16 -IPFILTER_CHROMA_PS_12xN_AVX2 32 - -%macro IPFILTER_CHROMA_PS_32xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 
- phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 48], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_32xN_AVX2 32 -IPFILTER_CHROMA_PS_32xN_AVX2 16 -IPFILTER_CHROMA_PS_32xN_AVX2 24 -IPFILTER_CHROMA_PS_32xN_AVX2 8 -IPFILTER_CHROMA_PS_32xN_AVX2 64 -IPFILTER_CHROMA_PS_32xN_AVX2 48 - - -%macro IPFILTER_CHROMA_PS_64xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - 
vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 48], xm4 - - vbroadcasti128 m4, [r0 + 64] - vbroadcasti128 m5, [r0 + 72] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 64], xm4 - - vbroadcasti128 m4, [r0 + 80] - vbroadcasti128 m5, [r0 + 88] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 80], xm4 - - vbroadcasti128 m4, [r0 + 96] - vbroadcasti128 m5, [r0 + 104] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 96], xm4 - - vbroadcasti128 m4, [r0 + 112] - vbroadcasti128 m5, [r0 + 120] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 112], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_64xN_AVX2 64 -IPFILTER_CHROMA_PS_64xN_AVX2 48 -IPFILTER_CHROMA_PS_64xN_AVX2 32 -IPFILTER_CHROMA_PS_64xN_AVX2 16 - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 
-cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, 64 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 48], xm4 - - vbroadcasti128 m4, [r0 + 64] - vbroadcasti128 m5, [r0 + 72] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 64], xm4 - - vbroadcasti128 m4, [r0 + 80] - vbroadcasti128 m5, [r0 + 88] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, 
m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 80], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif - -%macro IPFILTER_CHROMA_PS_6xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movq [r2], xm4 - pextrd [r2 + 8], xm4, 2 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_CHROMA_PS_6xN_AVX2 8 - IPFILTER_CHROMA_PS_6xN_AVX2 16 - %macro FILTER_VER_CHROMA_AVX2_8xN 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 From f0bb1a50a4342bd5164b6a761ae42e9cd8cfa2ca Mon Sep 17 00:00:00 2001 From: Mythreyi P Date: Tue, 13 Feb 2018 00:28:25 -0800 Subject: [PATCH 49/51] x86: Split ipfilter16 kernals part2 Port horizonal 4tap kernals from h-ipfilter16.asm to a new source file, h4-ipfilter16.asm to improve build time. 
--- source/common/CMakeLists.txt | 2 +- source/common/x86/h-ipfilter16.asm | 3111 +++------------------------ source/common/x86/h4-ipfilter16.asm | 2632 ++++++++++++++++++++++ 3 files changed, 2957 insertions(+), 2788 deletions(-) create mode 100644 source/common/x86/h4-ipfilter16.asm diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 079b64982c..0eb2dfb8f2 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -61,7 +61,7 @@ if(ENABLE_ASSEMBLY AND X86) mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) - set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm) + set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h4-ipfilter16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm) else() set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm v4-ipfilter8.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() diff --git a/source/common/x86/h-ipfilter16.asm b/source/common/x86/h-ipfilter16.asm index 92c329b036..8f593e1b67 100644 --- a/source/common/x86/h-ipfilter16.asm +++ b/source/common/x86/h-ipfilter16.asm @@ -45,20 +45,7 @@ SECTION_RODATA 32 -tab_c_32: times 8 dd 32 h_pd_524800: times 8 dd 524800 - -tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 - - -tab_ChromaCoeff: dw 0, 64, 0, 0 - dw -2, 58, 10, -2 - dw -4, 54, 16, -2 - dw -6, 46, 28, -4 - dw -4, 36, 36, -4 - dw -4, 28, 46, -6 - dw -2, 16, 54, -4 - dw -2, 10, 58, -2 tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0 dw -1, 4, -10, 58, 17, -5, 1, 0 @@ -352,198 +339,6 @@ cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8 FILTER_HOR_LUMA_sse2 64, 32, ps FILTER_HOR_LUMA_sse2 64, 48, ps FILTER_HOR_LUMA_sse2 64, 64, ps - -%macro FILTERH_W2_4_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + r1 + %1] - movhps m4, [r0 + r1 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - 
pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - movh m5, [r0 + 2 * r1 + %1] - movhps m5, [r0 + 2 * r1 + %1 + 2] - pmaddwd m5, m0 - movh m4, [r0 + r4 + %1] - movhps m4, [r0 + r4 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movd [r2 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 * 2 + %1], m3 - psrldq m3, 4 - movd [r2 + r5 + %1], m3 -%endmacro - -%macro FILTERH_W2_3_sse3 1 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + r1 + %1] - movhps m4, [r0 + r1 + %1 + 2] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + 2 * r1 + %1] - movhps m5, [r0 + 2 * r1 + %1 + 2] - pmaddwd m5, m0 - - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m5, m5, q3120 - paddd m5, m1 - - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 - - movd [r2 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 + %1], m3 - psrldq m3, 4 - movd [r2 + r3 * 2 + %1], m3 -%endmacro - -%macro FILTERH_W4_2_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + %1 + 4] - movhps m4, [r0 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + r1 + %1] - movhps m5, [r0 + r1 + %1 + 2] - pmaddwd m5, m0 - movh m4, [r0 + r1 + %1 + 4] - movhps m4, [r0 + r1 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, 
m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + %1], m3 - movhps [r2 + r3 + %1], m3 -%endmacro - -%macro FILTERH_W4_1_sse3 1 - movh m3, [r0 + 2 * r1 + %1] - movhps m3, [r0 + 2 * r1 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + 2 * r1 + %1 + 4] - movhps m4, [r0 + 2 * r1 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 - movh [r2 + r3 * 2 + %1], m3 -%endmacro - -%macro FILTERH_W8_1_sse3 2 - movh m3, [r0 + %1] - movhps m3, [r0 + %1 + 2] - pmaddwd m3, m0 - movh m4, [r0 + %1 + 4] - movhps m4, [r0 + %1 + 6] - pmaddwd m4, m0 - pshufd m2, m3, q2301 - paddd m3, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m3, m3, q3120 - pshufd m4, m4, q3120 - punpcklqdq m3, m4 - paddd m3, m1 - - movh m5, [r0 + %1 + 8] - movhps m5, [r0 + %1 + 10] - pmaddwd m5, m0 - movh m4, [r0 + %1 + 12] - movhps m4, [r0 + %1 + 14] - pmaddwd m4, m0 - pshufd m2, m5, q2301 - paddd m5, m2 - pshufd m2, m4, q2301 - paddd m4, m2 - pshufd m5, m5, q3120 - pshufd m4, m4, q3120 - punpcklqdq m5, m4 - paddd m5, m1 -%ifidn %2, pp - psrad m3, 6 - psrad m5, 6 - packssdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movdqu [r2 + %1], m3 -%endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) @@ -652,96 +447,6 @@ cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8 RET %endmacro -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int 
coeffIdx) -;----------------------------------------------------------------------------- - -FILTER_HOR_CHROMA_sse3 2, 4, pp -FILTER_HOR_CHROMA_sse3 2, 8, pp -FILTER_HOR_CHROMA_sse3 2, 16, pp -FILTER_HOR_CHROMA_sse3 4, 2, pp -FILTER_HOR_CHROMA_sse3 4, 4, pp -FILTER_HOR_CHROMA_sse3 4, 8, pp -FILTER_HOR_CHROMA_sse3 4, 16, pp -FILTER_HOR_CHROMA_sse3 4, 32, pp -FILTER_HOR_CHROMA_sse3 6, 8, pp -FILTER_HOR_CHROMA_sse3 6, 16, pp -FILTER_HOR_CHROMA_sse3 8, 2, pp -FILTER_HOR_CHROMA_sse3 8, 4, pp -FILTER_HOR_CHROMA_sse3 8, 6, pp -FILTER_HOR_CHROMA_sse3 8, 8, pp -FILTER_HOR_CHROMA_sse3 8, 12, pp -FILTER_HOR_CHROMA_sse3 8, 16, pp -FILTER_HOR_CHROMA_sse3 8, 32, pp -FILTER_HOR_CHROMA_sse3 8, 64, pp -FILTER_HOR_CHROMA_sse3 12, 16, pp -FILTER_HOR_CHROMA_sse3 12, 32, pp -FILTER_HOR_CHROMA_sse3 16, 4, pp -FILTER_HOR_CHROMA_sse3 16, 8, pp -FILTER_HOR_CHROMA_sse3 16, 12, pp -FILTER_HOR_CHROMA_sse3 16, 16, pp -FILTER_HOR_CHROMA_sse3 16, 24, pp -FILTER_HOR_CHROMA_sse3 16, 32, pp -FILTER_HOR_CHROMA_sse3 16, 64, pp -FILTER_HOR_CHROMA_sse3 24, 32, pp -FILTER_HOR_CHROMA_sse3 24, 64, pp -FILTER_HOR_CHROMA_sse3 32, 8, pp -FILTER_HOR_CHROMA_sse3 32, 16, pp -FILTER_HOR_CHROMA_sse3 32, 24, pp -FILTER_HOR_CHROMA_sse3 32, 32, pp -FILTER_HOR_CHROMA_sse3 32, 48, pp -FILTER_HOR_CHROMA_sse3 32, 64, pp -FILTER_HOR_CHROMA_sse3 48, 64, pp -FILTER_HOR_CHROMA_sse3 64, 16, pp -FILTER_HOR_CHROMA_sse3 64, 32, pp -FILTER_HOR_CHROMA_sse3 64, 48, pp -FILTER_HOR_CHROMA_sse3 64, 64, pp - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - -FILTER_HOR_CHROMA_sse3 2, 4, ps -FILTER_HOR_CHROMA_sse3 2, 8, ps -FILTER_HOR_CHROMA_sse3 2, 16, ps -FILTER_HOR_CHROMA_sse3 4, 2, ps -FILTER_HOR_CHROMA_sse3 4, 4, ps -FILTER_HOR_CHROMA_sse3 4, 8, ps -FILTER_HOR_CHROMA_sse3 4, 16, ps -FILTER_HOR_CHROMA_sse3 4, 
32, ps -FILTER_HOR_CHROMA_sse3 6, 8, ps -FILTER_HOR_CHROMA_sse3 6, 16, ps -FILTER_HOR_CHROMA_sse3 8, 2, ps -FILTER_HOR_CHROMA_sse3 8, 4, ps -FILTER_HOR_CHROMA_sse3 8, 6, ps -FILTER_HOR_CHROMA_sse3 8, 8, ps -FILTER_HOR_CHROMA_sse3 8, 12, ps -FILTER_HOR_CHROMA_sse3 8, 16, ps -FILTER_HOR_CHROMA_sse3 8, 32, ps -FILTER_HOR_CHROMA_sse3 8, 64, ps -FILTER_HOR_CHROMA_sse3 12, 16, ps -FILTER_HOR_CHROMA_sse3 12, 32, ps -FILTER_HOR_CHROMA_sse3 16, 4, ps -FILTER_HOR_CHROMA_sse3 16, 8, ps -FILTER_HOR_CHROMA_sse3 16, 12, ps -FILTER_HOR_CHROMA_sse3 16, 16, ps -FILTER_HOR_CHROMA_sse3 16, 24, ps -FILTER_HOR_CHROMA_sse3 16, 32, ps -FILTER_HOR_CHROMA_sse3 16, 64, ps -FILTER_HOR_CHROMA_sse3 24, 32, ps -FILTER_HOR_CHROMA_sse3 24, 64, ps -FILTER_HOR_CHROMA_sse3 32, 8, ps -FILTER_HOR_CHROMA_sse3 32, 16, ps -FILTER_HOR_CHROMA_sse3 32, 24, ps -FILTER_HOR_CHROMA_sse3 32, 32, ps -FILTER_HOR_CHROMA_sse3 32, 48, ps -FILTER_HOR_CHROMA_sse3 32, 64, ps -FILTER_HOR_CHROMA_sse3 48, 64, ps -FILTER_HOR_CHROMA_sse3 64, 16, ps -FILTER_HOR_CHROMA_sse3 64, 32, ps -FILTER_HOR_CHROMA_sse3 64, 48, ps -FILTER_HOR_CHROMA_sse3 64, 64, ps - %macro FILTER_P2S_2_4_sse2 1 movd m0, [r0 + %1] movd m2, [r0 + r1 * 2 + %1] @@ -1590,59 +1295,6 @@ FILTER_HOR_LUMA_W24 24, 32, pp ;---------------------------------------------------------------------------------------------------------------------------- FILTER_HOR_LUMA_W24 24, 32, ps -%macro FILTER_W2_2 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + r1] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - packusdw m3, m3 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 -%endif - movd [r2], m3 - pextrd [r2 + r3], m3, 1 -%endmacro - -%macro FILTER_W4_2 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + r1] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 
r1 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m7, m6 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + r3], m3 -%endmacro - ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx ;------------------------------------------------------------------------------------------------------------- @@ -2352,2019 +2004,137 @@ cglobal interp_8tap_horiz_pp_48x64, 4,6,8 jnz .loop RET -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_CHROMA_H 6 -INIT_XMM sse4 -cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5 +;----------------------------------------------------------------------------------------------------------------------------- +;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- - add r3, r3 - add r1, r1 - sub r0, 2 - mov r4d, r4m - add r4d, r4d +%macro IPFILTER_LUMA_PS_4xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 + mov r5d, r5m + mov r4d, r4m + add r1d, r1d + add r3d, r3d %ifdef PIC - lea r%6, [tab_ChromaCoeff] - movh m0, [r%6 + r4 * 4] + lea r6, [tab_LumaCoeff] + lea r4, [r4 * 8] + vbroadcasti128 m0, [r6 + r4 * 2] %else - movh m0, [tab_ChromaCoeff + r4 * 4] + lea r4, [r4 * 8] + vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2] %endif - punpcklqdq m0, 
m0 - mova m2, [tab_Tm16] + vbroadcasti128 m2, [INTERP_OFFSET_PS] -%ifidn %3, ps - mova m1, [INTERP_OFFSET_PS] - cmp r5m, byte 0 - je .skip - sub r0, r1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - - %if %1 == 4 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - %else - phaddd m3, m3 - %endif + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - pw_2000 - paddd m3, m1 - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 + sub r0, 6 + test r5d, r5d + mov r7d, %1 ; loop count variable - height + jz .preloop + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src) - 3 * srcStride + add r7d, 6 ;7 - 1(since last row not in loop) ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) - %if %1 == 2 - movd [r2], m3 - %else - movh [r2], m3 - %endif +.preloop: + lea r6, [r3 * 3] +.loop: + ; Row 0 + movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + movu xm4, [r0 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + movu xm4, [r0 + 4] + movu xm5, [r0 + 6] + vinserti128 m4, m4, xm5, 1 + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - add r0, r1 - add r2, r3 - FILTER_W%1_2 %3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + ; Row 1 + movu xm4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + movu xm5, [r0 + r1 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, xm5, 1 + movu xm5, [r0 + r1 + 4] + movu xm6, [r0 + r1 + 6] + vinserti128 m5, m5, xm6, 1 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddd m3, m4 ; all rows and col completed. 
-.skip: + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + paddd m3, m2 + vextracti128 xm4, m3, 1 + psrad xm3, INTERP_SHIFT_PS + psrad xm4, INTERP_SHIFT_PS + packssdw xm3, xm3 + packssdw xm4, xm4 -%else ;%ifidn %3, ps - pxor m7, m7 - mova m6, [pw_pixel_max] - mova m1, [tab_c_32] -%endif ;%ifidn %3, ps + movq [r2], xm3 ;row 0 + movq [r2 + r3], xm4 ;row 1 + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) - FILTER_W%1_2 %3 + sub r7d, 2 + jg .loop + test r5d, r5d + jz .end -%rep (%2/2) - 1 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - FILTER_W%1_2 %3 -%endrep - RET -%endmacro + ; Row 10 + movu xm3, [r0] + movu xm4, [r0 + 2] + vinserti128 m3, m3, xm4, 1 + movu xm4, [r0 + 4] + movu xm5, [r0 + 6] + vinserti128 m4, m4, xm5, 1 + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 -FILTER_CHROMA_H 2, 4, pp, 6, 8, 5 -FILTER_CHROMA_H 2, 8, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 2, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 4, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 8, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 16, pp, 6, 8, 5 - -FILTER_CHROMA_H 2, 4, ps, 7, 5, 6 -FILTER_CHROMA_H 2, 8, ps, 7, 5, 6 -FILTER_CHROMA_H 4, 2, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 4, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 8, ps, 7, 6, 6 -FILTER_CHROMA_H 4, 16, ps, 7, 6, 6 - -FILTER_CHROMA_H 2, 16, pp, 6, 8, 5 -FILTER_CHROMA_H 4, 32, pp, 6, 8, 5 -FILTER_CHROMA_H 2, 16, ps, 7, 5, 6 -FILTER_CHROMA_H 4, 32, ps, 7, 6, 6 - - -%macro FILTER_W6_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m4, [r0 + 8] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m4, m4 - paddd m4, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m4, INTERP_SHIFT_PP - packusdw m3, m4 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m4, INTERP_SHIFT_PS - packssdw m3, m4 -%endif - movh [r2], m3 - pextrd [r2 + 8], m3, 2 -%endmacro + ; Row11 + phaddd m3, m4 ; all rows and col completed. 
-cglobal chroma_filter_pp_6x1_internal - FILTER_W6_1 pp - ret - -cglobal chroma_filter_ps_6x1_internal - FILTER_W6_1 ps - ret - -%macro FILTER_W8_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 -%endmacro + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + paddd m3, m2 + vextracti128 xm4, m3, 1 + psrad xm3, INTERP_SHIFT_PS + psrad xm4, INTERP_SHIFT_PS + packssdw xm3, xm3 + packssdw xm4, xm4 -cglobal chroma_filter_pp_8x1_internal - FILTER_W8_1 pp - ret - -cglobal chroma_filter_ps_8x1_internal - FILTER_W8_1 ps - ret - -%macro FILTER_W12_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - packusdw m3, m3 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - packssdw m3, m3 + movq [r2], xm3 ;row 0 +.end: + RET %endif - movh [r2 + 16], m3 %endmacro -cglobal chroma_filter_pp_12x1_internal - FILTER_W12_1 pp - ret - -cglobal 
chroma_filter_ps_12x1_internal - FILTER_W12_1 ps - ret - -%macro FILTER_W16_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 -%endmacro + IPFILTER_LUMA_PS_4xN_AVX2 4 + IPFILTER_LUMA_PS_4xN_AVX2 8 + IPFILTER_LUMA_PS_4xN_AVX2 16 -cglobal chroma_filter_pp_16x1_internal - FILTER_W16_1 pp - ret - -cglobal chroma_filter_ps_16x1_internal - FILTER_W16_1 ps - ret - -%macro FILTER_W24_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, [r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 
+ 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 - - movu m3, [r0 + 32] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 36] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 40] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 44] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 +%macro IPFILTER_LUMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + shl r4d, 4 +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] %else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 32], m3 - movhps [r2 + 40], m3 -%endmacro - -cglobal chroma_filter_pp_24x1_internal - FILTER_W24_1 pp - ret - -cglobal chroma_filter_ps_24x1_internal - FILTER_W24_1 ps - ret - -%macro FILTER_W32_1 1 - movu m3, [r0] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2], m3 - movhps [r2 + 8], m3 - - movu m3, 
[r0 + 16] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 20] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 24] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 28] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 16], m3 - movhps [r2 + 24], m3 - - movu m3, [r0 + 32] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 36] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 40] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 44] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 32], m3 - movhps [r2 + 40], m3 - - movu m3, [r0 + 48] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + 52] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + 56] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, [r0 + 60] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + 48], m3 - movhps [r2 + 56], m3 -%endmacro - -cglobal chroma_filter_pp_32x1_internal - FILTER_W32_1 pp - ret - -cglobal chroma_filter_ps_32x1_internal - FILTER_W32_1 ps - ret - -%macro FILTER_W8o_1 2 - movu m3, [r0 + %2] - pshufb m3, m3, m2 - pmaddwd m3, m0 - movu m4, [r0 + %2 + 4] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m1 - - movu m5, [r0 + %2 + 8] - pshufb m5, m5, m2 - pmaddwd m5, m0 - movu m4, 
[r0 + %2 + 12] - pshufb m4, m4, m2 - pmaddwd m4, m0 - phaddd m5, m4 - paddd m5, m1 -%ifidn %1, pp - psrad m3, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP - packusdw m3, m5 - CLIPW m3, m6, m7 -%else - psrad m3, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS - packssdw m3, m5 -%endif - movh [r2 + %2], m3 - movhps [r2 + %2 + 8], m3 -%endmacro - -%macro FILTER_W48_1 1 - FILTER_W8o_1 %1, 0 - FILTER_W8o_1 %1, 16 - FILTER_W8o_1 %1, 32 - FILTER_W8o_1 %1, 48 - FILTER_W8o_1 %1, 64 - FILTER_W8o_1 %1, 80 -%endmacro - -cglobal chroma_filter_pp_48x1_internal - FILTER_W48_1 pp - ret - -cglobal chroma_filter_ps_48x1_internal - FILTER_W48_1 ps - ret - -%macro FILTER_W64_1 1 - FILTER_W8o_1 %1, 0 - FILTER_W8o_1 %1, 16 - FILTER_W8o_1 %1, 32 - FILTER_W8o_1 %1, 48 - FILTER_W8o_1 %1, 64 - FILTER_W8o_1 %1, 80 - FILTER_W8o_1 %1, 96 - FILTER_W8o_1 %1, 112 -%endmacro - -cglobal chroma_filter_pp_64x1_internal - FILTER_W64_1 pp - ret - -cglobal chroma_filter_ps_64x1_internal - FILTER_W64_1 ps - ret - -;----------------------------------------------------------------------------- -; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- - -INIT_XMM sse4 -%macro IPFILTER_CHROMA 6 -cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6 - - add r3, r3 - add r1, r1 - sub r0, 2 - mov r4d, r4m - add r4d, r4d - -%ifdef PIC - lea r%4, [tab_ChromaCoeff] - movh m0, [r%4 + r4 * 4] -%else - movh m0, [tab_ChromaCoeff + r4 * 4] -%endif - - punpcklqdq m0, m0 - mova m2, [tab_Tm16] - -%ifidn %3, ps - mova m1, [INTERP_OFFSET_PS] - cmp r5m, byte 0 - je .skip - sub r0, r1 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal - add r0, r1 - add r2, r3 -.skip: -%else - mova m1, [tab_c_32] - pxor m6, m6 - mova m7, [pw_pixel_max] -%endif - - call chroma_filter_%3_%1x1_internal -%rep %2 - 
1 - add r0, r1 - add r2, r3 - call chroma_filter_%3_%1x1_internal -%endrep -RET -%endmacro -IPFILTER_CHROMA 6, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 2, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 4, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 6, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 12, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 4, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 12, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 24, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 8, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 24, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 32, pp, 5, 6, 8 - -IPFILTER_CHROMA 6, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 2, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 4, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 6, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 12, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 4, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 12, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 24, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 8, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 24, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 32, ps, 6, 7, 6 - -IPFILTER_CHROMA 6, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 12, pp, 5, 6, 8 -IPFILTER_CHROMA 8, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 12, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 24, pp, 5, 6, 8 -IPFILTER_CHROMA 16, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 24, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 48, pp, 5, 6, 8 -IPFILTER_CHROMA 32, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 6, 16, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 12, ps, 6, 7, 6 -IPFILTER_CHROMA 8, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 12, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 24, ps, 6, 7, 6 -IPFILTER_CHROMA 16, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 24, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 
32, 48, ps, 6, 7, 6 -IPFILTER_CHROMA 32, 64, ps, 6, 7, 6 - -IPFILTER_CHROMA 48, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 48, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 64, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 32, pp, 5, 6, 8 -IPFILTER_CHROMA 64, 16, pp, 5, 6, 8 -IPFILTER_CHROMA 48, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 48, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 64, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 32, ps, 6, 7, 6 -IPFILTER_CHROMA 64, 16, ps, 6, 7, 6 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_6xN 1 -cglobal interp_4tap_horiz_pp_6x%1, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1/2 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2], xm3 - pextrd [r2 + 8], xm3, 2 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2 + r3], xm3 - pextrd [r2 + r3 + 8], xm3, 2 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 
2] - dec r4d - jnz .loop - RET -%endmacro -IPFILTER_CHROMA_avx2_6xN 8 -IPFILTER_CHROMA_avx2_6xN 16 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x2, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal interp_4tap_horiz_pp_8x4, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - 
vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - -%rep 2 - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3,q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] -%endrep - RET - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_8xN 1 -cglobal interp_4tap_horiz_pp_8x%1, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1/2 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 
6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + r1] - vbroadcasti128 m4, [r0 + r1 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2 + r3], xm3 - - lea r0, [r0 + r1 * 2] - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop - RET -%endmacro -IPFILTER_CHROMA_avx2_8xN 6 -IPFILTER_CHROMA_avx2_8xN 8 -IPFILTER_CHROMA_avx2_8xN 12 -IPFILTER_CHROMA_avx2_8xN 16 -IPFILTER_CHROMA_avx2_8xN 32 -IPFILTER_CHROMA_avx2_8xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_16xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_16x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m8, [r0 + 24] - - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m4, m4 - vpermq m4, m4, q2020 - pshufb xm4, 
xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - CLIPW m3, m5, m7 - movu [r2], m3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_16xN 4 -IPFILTER_CHROMA_avx2_16xN 8 -IPFILTER_CHROMA_avx2_16xN 12 -IPFILTER_CHROMA_avx2_16xN 16 -IPFILTER_CHROMA_avx2_16xN 24 -IPFILTER_CHROMA_avx2_16xN 32 -IPFILTER_CHROMA_avx2_16xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_32xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_32x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r6d, %1 -.loop: -%assign x 0 -%rep 2 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m4, m4 - vpermq m4, m4, q2020 - pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - CLIPW m3, m5, m7 - movu [r2 + x], m3 - %assign x x+32 - %endrep - - add r0, r1 - add r2, r3 - dec r6d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_32xN 8 
-IPFILTER_CHROMA_avx2_32xN 16 -IPFILTER_CHROMA_avx2_32xN 24 -IPFILTER_CHROMA_avx2_32xN 32 -IPFILTER_CHROMA_avx2_32xN 48 -IPFILTER_CHROMA_avx2_32xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_12xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_12x%1, 5,6,8 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movu [r2], xm3 - - vbroadcasti128 m3, [r0 + 16] - vbroadcasti128 m4, [r0 + 24] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] - CLIPW xm3, xm5, xm7 - movq [r2 + 16], xm3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_12xN 16 -IPFILTER_CHROMA_avx2_12xN 32 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx 
-;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_24xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_24x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, %1 -.loop: - vbroadcasti128 m3, [r0] - vbroadcasti128 m4, [r0 + 8] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m8, [r0 + 24] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 - - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2], m3 - - vbroadcasti128 m3, [r0 + 32] - vbroadcasti128 m4, [r0 + 40] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - packusdw m3, m3 - vpermq m3, m3, q2020 - pshufb xm3, xm6 - CLIPW xm3, xm5, xm7 - movu [r2 + 32], xm3 - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_24xN 32 -IPFILTER_CHROMA_avx2_24xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%macro IPFILTER_CHROMA_avx2_64xN 1 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_64x%1, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else 
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r6d, %1 -.loop: -%assign x 0 -%rep 4 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, m8 - paddd m4, m2 - psrad m4, 6 - - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2 + x], m3 - %assign x x+32 - %endrep - - add r0, r1 - add r2, r3 - dec r6d - jnz .loop - RET -%endif -%endmacro -IPFILTER_CHROMA_avx2_64xN 16 -IPFILTER_CHROMA_avx2_64xN 32 -IPFILTER_CHROMA_avx2_64xN 48 -IPFILTER_CHROMA_avx2_64xN 64 - -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx -;------------------------------------------------------------------------------------------------------------- -INIT_YMM avx2 -%if ARCH_X86_64 -cglobal interp_4tap_horiz_pp_48x64, 5,6,9 - add r1d, r1d - add r3d, r3d - sub r0, 2 - mov r4d, r4m -%ifdef PIC - lea r5, [tab_ChromaCoeff] - vpbroadcastq m0, [r5 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m1, [interp8_hpp_shuf] - vpbroadcastd m2, [pd_32] - pxor m5, m5 - mova m6, [idct8_shuf2] - mova m7, [pw_pixel_max] - - mov r4d, 64 -.loop: -%assign x 0 -%rep 3 - vbroadcasti128 m3, [r0 + x] - vbroadcasti128 m4, [r0 + 8 + x] - pshufb m3, m1 - pshufb m4, m1 - - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - paddd m3, m2 - psrad m3, 6 - - vbroadcasti128 m4, [r0 + 16 + x] - vbroadcasti128 m8, [r0 + 24 + x] - pshufb m4, m1 - pshufb m8, m1 - - pmaddwd m4, m0 - pmaddwd m8, m0 - phaddd m4, 
m8 - paddd m4, m2 - psrad m4, 6 - - packusdw m3, m4 - vpermq m3, m3, q3120 - pshufb m3, m6 - CLIPW m3, m5, m7 - movu [r2 + x], m3 -%assign x x+32 -%endrep - - add r0, r1 - add r2, r3 - dec r4d - jnz .loop - RET -%endif - -;----------------------------------------------------------------------------------------------------------------------------- -;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- - -%macro IPFILTER_LUMA_PS_4xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_4x%1, 6,8,7 - mov r5d, r5m - mov r4d, r4m - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r6, [tab_LumaCoeff] - lea r4, [r4 * 8] - vbroadcasti128 m0, [r6 + r4 * 2] -%else - lea r4, [r4 * 8] - vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2] -%endif - - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 - interpolate coeff - ; m1 - shuffle order table - ; m2 - pw_2000 - - sub r0, 6 - test r5d, r5d - mov r7d, %1 ; loop count variable - height - jz .preloop - lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride - sub r0, r6 ; r0(src) - 3 * srcStride - add r7d, 6 ;7 - 1(since last row not in loop) ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) - -.preloop: - lea r6, [r3 * 3] -.loop: - ; Row 0 - movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - movu xm4, [r0 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m3, m3, xm4, 1 - movu xm4, [r0 + 4] - movu xm5, [r0 + 6] - vinserti128 m4, m4, xm5, 1 - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] - - ; Row 1 - movu xm4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - movu xm5, [r0 + r1 + 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] - vinserti128 m4, m4, xm5, 1 - movu xm5, [r0 + r1 + 4] - movu xm6, [r0 + r1 + 
6] - vinserti128 m5, m5, xm6, 1 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] - phaddd m3, m4 ; all rows and col completed. - - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - paddd m3, m2 - vextracti128 xm4, m3, 1 - psrad xm3, INTERP_SHIFT_PS - psrad xm4, INTERP_SHIFT_PS - packssdw xm3, xm3 - packssdw xm4, xm4 - - movq [r2], xm3 ;row 0 - movq [r2 + r3], xm4 ;row 1 - lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) - lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) - - sub r7d, 2 - jg .loop - test r5d, r5d - jz .end - - ; Row 10 - movu xm3, [r0] - movu xm4, [r0 + 2] - vinserti128 m3, m3, xm4, 1 - movu xm4, [r0 + 4] - movu xm5, [r0 + 6] - vinserti128 m4, m4, xm5, 1 - pmaddwd m3, m0 - pmaddwd m4, m0 - phaddd m3, m4 - - ; Row11 - phaddd m3, m4 ; all rows and col completed. - - mova m5, [interp8_hps_shuf] - vpermd m3, m5, m3 - paddd m3, m2 - vextracti128 xm4, m3, 1 - psrad xm3, INTERP_SHIFT_PS - psrad xm4, INTERP_SHIFT_PS - packssdw xm3, xm3 - packssdw xm4, xm4 - - movq [r2], xm3 ;row 0 -.end: - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_4xN_AVX2 4 - IPFILTER_LUMA_PS_4xN_AVX2 8 - IPFILTER_LUMA_PS_4xN_AVX2 16 - -%macro IPFILTER_LUMA_PS_8xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %1 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - 
pshufb m6, m3 - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - - movu [r2], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_8xN_AVX2 4 - IPFILTER_LUMA_PS_8xN_AVX2 8 - IPFILTER_LUMA_PS_8xN_AVX2 16 - IPFILTER_LUMA_PS_8xN_AVX2 32 - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_24x32, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, 32 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: -%assign x 0 -%rep 24/8 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 8 + x] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m5, m3 - pshufb m6, m3 - pmaddwd m5, m0 - pmaddwd m6, m1 - paddd m5, m6 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - - movu [r2 + x], xm4 - %assign x x+16 - %endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%macro IPFILTER_LUMA_PS_32_64_AVX2 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8 - - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 6 -%ifdef PIC - lea r6, [h_tab_LumaCoeffV] - movu m0, [r6 + r4] - movu m1, [r6 + r4 + mmsize] -%else - movu m0, [h_tab_LumaCoeffV + r4] - movu m1, [h_tab_LumaCoeffV + r4 + mmsize] -%endif - mova m3, 
[interp8_hpp_shuf_new] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %2 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: -%assign x 0 -%rep %1/16 - vbroadcasti128 m4, [r0 + x] - vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x] - pshufb m4, m3 - pshufb m5, m3 - - pmaddwd m4, m0 - pmaddwd m7, m5, m1 - paddd m4, m7 - vextracti128 xm7, m4, 1 - paddd xm4, xm7 - paddd xm4, xm2 - psrad xm4, INTERP_SHIFT_PS - - vbroadcasti128 m6, [r0 + 16 + x] - pshufb m6, m3 - - pmaddwd m5, m0 - pmaddwd m7, m6, m1 - paddd m5, m7 - vextracti128 xm7, m5, 1 - paddd xm5, xm7 - paddd xm5, xm2 - psrad xm5, INTERP_SHIFT_PS - - packssdw xm4, xm5 - movu [r2 + x], xm4 - - vbroadcasti128 m5, [r0 + 24 + x] - pshufb m5, m3 - - pmaddwd m6, m0 - pmaddwd m7, m5, m1 - paddd m6, m7 - vextracti128 xm7, m6, 1 - paddd xm6, xm7 - paddd xm6, xm2 - psrad xm6, INTERP_SHIFT_PS - - vbroadcasti128 m7, [r0 + 32 + x] - pshufb m7, m3 - - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - vextracti128 xm7, m5, 1 - paddd xm5, xm7 - paddd xm5, xm2 - psrad xm5, INTERP_SHIFT_PS - - packssdw xm6, xm5 - movu [r2 + 16 + x], xm6 - -%assign x x+32 -%endrep - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_32_64_AVX2 32, 8 - IPFILTER_LUMA_PS_32_64_AVX2 32, 16 - IPFILTER_LUMA_PS_32_64_AVX2 32, 24 - IPFILTER_LUMA_PS_32_64_AVX2 32, 32 - IPFILTER_LUMA_PS_32_64_AVX2 32, 64 - - IPFILTER_LUMA_PS_32_64_AVX2 64, 16 - IPFILTER_LUMA_PS_32_64_AVX2 64, 32 - IPFILTER_LUMA_PS_32_64_AVX2 64, 48 - IPFILTER_LUMA_PS_32_64_AVX2 64, 64 - - IPFILTER_LUMA_PS_32_64_AVX2 48, 64 - -%macro IPFILTER_LUMA_PS_16xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 - - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + 
r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, %1 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5, m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m7, m5, m3 - pmaddwd m6, m0 - pmaddwd m7, m1 - paddd m6, m7 - - vbroadcasti128 m7, [r0 + 32] - pshufb m5, m3 - pshufb m7, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m6, m5 - vpermq m6, m6, q3120 - paddd m6, m2 - vextracti128 xm5,m6, 1 - psrad xm6, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm6, xm5 - movu [r2 + 16], xm6 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_LUMA_PS_16xN_AVX2 4 - IPFILTER_LUMA_PS_16xN_AVX2 8 - IPFILTER_LUMA_PS_16xN_AVX2 12 - IPFILTER_LUMA_PS_16xN_AVX2 16 - IPFILTER_LUMA_PS_16xN_AVX2 32 - IPFILTER_LUMA_PS_16xN_AVX2 64 - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - shl r4d, 4 -%ifdef PIC - lea r6, [tab_LumaCoeff] - vpbroadcastq m0, [r6 + r4] - vpbroadcastq m1, [r6 + r4 + 8] -%else - vpbroadcastq m0, [tab_LumaCoeff + r4] - vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 6 - test r5d, r5d - mov r4d, 16 - jz .loop0 - lea r6, [r1*3] - sub r0, r6 - add r4d, 7 - -.loop0: - 
vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m7, m5, m3 - pmaddwd m4, m0 - pmaddwd m7, m1 - paddd m4, m7 - - vbroadcasti128 m6, [r0 + 16] - pshufb m5, m3 - pshufb m7, m6, m3 - pmaddwd m5, m0 - pmaddwd m7, m1 - paddd m5, m7 - - phaddd m4, m5 - vpermq m4, m4, q3120 - paddd m4, m2 - vextracti128 xm5,m4, 1 - psrad xm4, INTERP_SHIFT_PS - psrad xm5, INTERP_SHIFT_PS - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m5, [r0 + 24] - pshufb m6, m3 - pshufb m5, m3 - pmaddwd m6, m0 - pmaddwd m5, m1 - paddd m6, m5 - - phaddd m6, m6 - vpermq m6, m6, q3120 - paddd xm6, xm2 - psrad xm6, INTERP_SHIFT_PS - packssdw xm6, xm6 - movq [r2 + 16], xm6 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - - IPFILTER_CHROMA_PS_8xN_AVX2 4 - IPFILTER_CHROMA_PS_8xN_AVX2 8 - IPFILTER_CHROMA_PS_8xN_AVX2 16 - IPFILTER_CHROMA_PS_8xN_AVX2 32 - IPFILTER_CHROMA_PS_8xN_AVX2 6 - IPFILTER_CHROMA_PS_8xN_AVX2 2 - IPFILTER_CHROMA_PS_8xN_AVX2 12 - IPFILTER_CHROMA_PS_8xN_AVX2 64 - -%macro IPFILTER_CHROMA_PS_16xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6 - add 
r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] @@ -4372,194 +2142,39 @@ cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6 ; register map ; m0 , m1 interpolate coeff - sub r0, 2 + sub r0, 6 test r5d, r5d mov r4d, %1 jz .loop0 - sub r0, r1 - add r4d, 3 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 .loop0: vbroadcasti128 m4, [r0] vbroadcasti128 m5, [r0 + 8] pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 + pshufb m7, m5, m3 pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_16xN_AVX2 16 -IPFILTER_CHROMA_PS_16xN_AVX2 8 -IPFILTER_CHROMA_PS_16xN_AVX2 32 -IPFILTER_CHROMA_PS_16xN_AVX2 12 -IPFILTER_CHROMA_PS_16xN_AVX2 4 -IPFILTER_CHROMA_PS_16xN_AVX2 64 -IPFILTER_CHROMA_PS_16xN_AVX2 24 - -%macro IPFILTER_CHROMA_PS_24xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_24x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 + pmaddwd m7, m1 + paddd 
m4, m7 -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 + vbroadcasti128 m6, [r0 + 16] pshufb m5, m3 - pmaddwd m4, m0 + pshufb m6, m3 pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 + pmaddwd m6, m1 + paddd m5, m6 - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 phaddd m4, m5 - paddd m4, m2 vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 - - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS packssdw xm4, xm5 - movu [r2 + 32], xm4 - - add r2, r3 - add r0, r1 - dec r4d - jnz .loop0 - RET -%endif -%endmacro - -IPFILTER_CHROMA_PS_24xN_AVX2 32 -IPFILTER_CHROMA_PS_24xN_AVX2 64 - -%macro IPFILTER_CHROMA_PS_12xN_AVX2 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_12x%1, 4, 7, 6 - add r1d, r1d - add r3d, r3d - mov r4d, r4m - mov r5d, r5m - -%ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] -%else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] -%endif - mova m3, [interp8_hpp_shuf] - vbroadcasti128 m2, [INTERP_OFFSET_PS] - - ; register map - ; m0 , m1 interpolate coeff - - sub r0, 2 - test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - pshufb m4, m3 - pmaddwd m4, m0 - 
phaddd m4, m4 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movq [r2 + 16], xm4 - add r2, r3 add r0, r1 dec r4d @@ -4568,23 +2183,26 @@ cglobal interp_4tap_horiz_ps_12x%1, 4, 7, 6 %endif %endmacro -IPFILTER_CHROMA_PS_12xN_AVX2 16 -IPFILTER_CHROMA_PS_12xN_AVX2 32 + IPFILTER_LUMA_PS_8xN_AVX2 4 + IPFILTER_LUMA_PS_8xN_AVX2 8 + IPFILTER_LUMA_PS_8xN_AVX2 16 + IPFILTER_LUMA_PS_8xN_AVX2 32 -%macro IPFILTER_CHROMA_PS_32xN_AVX2 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 +cglobal interp_8tap_horiz_ps_24x32, 4, 6, 8 add r1d, r1d add r3d, r3d mov r4d, r4m mov r5d, r5m - + shl r4d, 4 %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] %else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] @@ -4592,69 +2210,43 @@ cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 ; register map ; m0 , m1 interpolate coeff - sub r0, 2 + sub r0, 6 test r5d, r5d - mov r4d, %1 + mov r4d, 32 jz .loop0 - sub r0, r1 - add r4d, 3 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 .loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] +%assign x 0 +%rep 24/8 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 8 + x] pshufb m4, m3 - pshufb m5, m3 + pshufb m7, m5, m3 pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 + pmaddwd m7, m1 + paddd m4, m7 - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 + vbroadcasti128 m6, [r0 + 16 + x] pshufb m5, m3 - pmaddwd m4, m0 + pshufb m6, m3 pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, 
m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 + pmaddwd m6, m1 + paddd m5, m6 - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 phaddd m4, m5 - paddd m4, m2 vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS packssdw xm4, xm5 - movu [r2 + 48], xm4 + + movu [r2 + x], xm4 + %assign x x+16 + %endrep add r2, r3 add r0, r1 @@ -4662,183 +2254,134 @@ cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 jnz .loop0 RET %endif -%endmacro - -IPFILTER_CHROMA_PS_32xN_AVX2 32 -IPFILTER_CHROMA_PS_32xN_AVX2 16 -IPFILTER_CHROMA_PS_32xN_AVX2 24 -IPFILTER_CHROMA_PS_32xN_AVX2 8 -IPFILTER_CHROMA_PS_32xN_AVX2 64 -IPFILTER_CHROMA_PS_32xN_AVX2 48 - - -%macro IPFILTER_CHROMA_PS_64xN_AVX2 1 +%macro IPFILTER_LUMA_PS_32_64_AVX2 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6 +cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8 + add r1d, r1d add r3d, r3d mov r4d, r4m mov r5d, r5m - + shl r4d, 6 %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] + lea r6, [h_tab_LumaCoeffV] + movu m0, [r6 + r4] + movu m1, [r6 + r4 + mmsize] %else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] + movu m0, [h_tab_LumaCoeffV + r4] + movu m1, [h_tab_LumaCoeffV + r4 + mmsize] %endif - mova m3, [interp8_hpp_shuf] + mova m3, [interp8_hpp_shuf_new] vbroadcasti128 m2, [INTERP_OFFSET_PS] ; register map ; m0 , m1 interpolate coeff - sub r0, 2 + sub r0, 6 test r5d, r5d - mov r4d, %1 - jz .loop0 - sub r0, r1 - add r4d, 3 - -.loop0: - vbroadcasti128 m4, [r0] - vbroadcasti128 m5, [r0 + 8] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, 
m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 - - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 + mov r4d, %2 + jz .loop0 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] +.loop0: +%assign x 0 +%rep %1/16 + vbroadcasti128 m4, [r0 + x] + vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x] pshufb m4, m3 pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 48], xm4 + pmaddwd m7, m5, m1 + paddd m4, m7 + vextracti128 xm7, m4, 1 + paddd xm4, xm7 + paddd xm4, xm2 + psrad xm4, INTERP_SHIFT_PS - vbroadcasti128 m4, [r0 + 64] - vbroadcasti128 m5, [r0 + 72] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 64], xm4 + vbroadcasti128 m6, [r0 + 16 + x] + pshufb m6, m3 - vbroadcasti128 m4, [r0 + 80] - vbroadcasti128 m5, [r0 + 88] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 80], xm4 + pmaddwd m7, m6, m1 + paddd m5, m7 + vextracti128 xm7, m5, 1 + paddd xm5, xm7 + paddd xm5, 
xm2 + psrad xm5, INTERP_SHIFT_PS - vbroadcasti128 m4, [r0 + 96] - vbroadcasti128 m5, [r0 + 104] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 packssdw xm4, xm5 - movu [r2 + 96], xm4 + movu [r2 + x], xm4 - vbroadcasti128 m4, [r0 + 112] - vbroadcasti128 m5, [r0 + 120] - pshufb m4, m3 + vbroadcasti128 m5, [r0 + 24 + x] pshufb m5, m3 - pmaddwd m4, m0 + + pmaddwd m6, m0 + pmaddwd m7, m5, m1 + paddd m6, m7 + vextracti128 xm7, m6, 1 + paddd xm6, xm7 + paddd xm6, xm2 + psrad xm6, INTERP_SHIFT_PS + + vbroadcasti128 m7, [r0 + 32 + x] + pshufb m7, m3 + pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 112], xm4 + pmaddwd m7, m1 + paddd m5, m7 + vextracti128 xm7, m5, 1 + paddd xm5, xm7 + paddd xm5, xm2 + psrad xm5, INTERP_SHIFT_PS + + packssdw xm6, xm5 + movu [r2 + 16 + x], xm6 + +%assign x x+32 +%endrep add r2, r3 add r0, r1 dec r4d - jnz .loop0 + jnz .loop0 RET %endif %endmacro -IPFILTER_CHROMA_PS_64xN_AVX2 64 -IPFILTER_CHROMA_PS_64xN_AVX2 48 -IPFILTER_CHROMA_PS_64xN_AVX2 32 -IPFILTER_CHROMA_PS_64xN_AVX2 16 + IPFILTER_LUMA_PS_32_64_AVX2 32, 8 + IPFILTER_LUMA_PS_32_64_AVX2 32, 16 + IPFILTER_LUMA_PS_32_64_AVX2 32, 24 + IPFILTER_LUMA_PS_32_64_AVX2 32, 32 + IPFILTER_LUMA_PS_32_64_AVX2 32, 64 + + IPFILTER_LUMA_PS_32_64_AVX2 64, 16 + IPFILTER_LUMA_PS_32_64_AVX2 64, 32 + IPFILTER_LUMA_PS_32_64_AVX2 64, 48 + IPFILTER_LUMA_PS_32_64_AVX2 64, 64 + + IPFILTER_LUMA_PS_32_64_AVX2 48, 64 +%macro IPFILTER_LUMA_PS_16xN_AVX2 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 +cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8 + add r1d, r1d add r3d, r3d mov r4d, r4m mov r5d, r5m - + shl r4d, 4 %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, 
[r6 + r4 + 8] %else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] @@ -4846,97 +2389,61 @@ cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 ; register map ; m0 , m1 interpolate coeff - sub r0, 2 + sub r0, 6 test r5d, r5d - mov r4d, 64 + mov r4d, %1 jz .loop0 - sub r0, r1 - add r4d, 3 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 .loop0: vbroadcasti128 m4, [r0] vbroadcasti128 m5, [r0 + 8] pshufb m4, m3 - pshufb m5, m3 + pshufb m7, m5, m3 pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2], xm4 + pmaddwd m7, m1 + paddd m4, m7 - vbroadcasti128 m4, [r0 + 16] - vbroadcasti128 m5, [r0 + 24] - pshufb m4, m3 + vbroadcasti128 m6, [r0 + 16] pshufb m5, m3 - pmaddwd m4, m0 + pshufb m7, m6, m3 pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 16], xm4 + pmaddwd m7, m1 + paddd m5, m7 - vbroadcasti128 m4, [r0 + 32] - vbroadcasti128 m5, [r0 + 40] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 phaddd m4, m5 - paddd m4, m2 vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 32], xm4 - - vbroadcasti128 m4, [r0 + 48] - vbroadcasti128 m5, [r0 + 56] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS vextracti128 xm5, m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS packssdw xm4, xm5 - movu [r2 + 48], xm4 + movu [r2], xm4 - vbroadcasti128 m4, [r0 + 64] - vbroadcasti128 m5, [r0 + 72] - pshufb m4, m3 - pshufb m5, m3 - pmaddwd m4, m0 - pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - 
vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 64], xm4 + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m3 + pshufb m7, m5, m3 + pmaddwd m6, m0 + pmaddwd m7, m1 + paddd m6, m7 - vbroadcasti128 m4, [r0 + 80] - vbroadcasti128 m5, [r0 + 88] - pshufb m4, m3 + vbroadcasti128 m7, [r0 + 32] pshufb m5, m3 - pmaddwd m4, m0 + pshufb m7, m3 pmaddwd m5, m0 - phaddd m4, m5 - paddd m4, m2 - vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 - packssdw xm4, xm5 - movu [r2 + 80], xm4 + pmaddwd m7, m1 + paddd m5, m7 + + phaddd m6, m5 + vpermq m6, m6, q3120 + paddd m6, m2 + vextracti128 xm5,m6, 1 + psrad xm6, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS + packssdw xm6, xm5 + movu [r2 + 16], xm6 add r2, r3 add r0, r1 @@ -4944,21 +2451,30 @@ cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 jnz .loop0 RET %endif +%endmacro + + IPFILTER_LUMA_PS_16xN_AVX2 4 + IPFILTER_LUMA_PS_16xN_AVX2 8 + IPFILTER_LUMA_PS_16xN_AVX2 12 + IPFILTER_LUMA_PS_16xN_AVX2 16 + IPFILTER_LUMA_PS_16xN_AVX2 32 + IPFILTER_LUMA_PS_16xN_AVX2 64 -%macro IPFILTER_CHROMA_PS_6xN_AVX2 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6 +cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8 add r1d, r1d add r3d, r3d mov r4d, r4m mov r5d, r5m - + shl r4d, 4 %ifdef PIC - lea r6, [tab_ChromaCoeff] - vpbroadcastq m0, [r6 + r4 * 8] + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4] + vpbroadcastq m1, [r6 + r4 + 8] %else - vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] + vpbroadcastq m0, [tab_LumaCoeff + r4] + vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] @@ -4966,35 +2482,56 @@ cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6 ; register map ; m0 , m1 interpolate coeff - sub r0, 2 + sub r0, 6 test r5d, r5d - mov r4d, %1 + mov r4d, 16 jz .loop0 - sub r0, r1 - add r4d, 3 + lea r6, [r1*3] + sub r0, r6 + add r4d, 7 .loop0: vbroadcasti128 m4, [r0] vbroadcasti128 m5, [r0 + 8] pshufb m4, m3 - pshufb m5, m3 + pshufb m7, m5, m3 
pmaddwd m4, m0 + pmaddwd m7, m1 + paddd m4, m7 + + vbroadcasti128 m6, [r0 + 16] + pshufb m5, m3 + pshufb m7, m6, m3 pmaddwd m5, m0 + pmaddwd m7, m1 + paddd m5, m7 + phaddd m4, m5 - paddd m4, m2 vpermq m4, m4, q3120 - psrad m4, INTERP_SHIFT_PS - vextracti128 xm5, m4, 1 + paddd m4, m2 + vextracti128 xm5,m4, 1 + psrad xm4, INTERP_SHIFT_PS + psrad xm5, INTERP_SHIFT_PS packssdw xm4, xm5 - movq [r2], xm4 - pextrd [r2 + 8], xm4, 2 + movu [r2], xm4 + + vbroadcasti128 m5, [r0 + 24] + pshufb m6, m3 + pshufb m5, m3 + pmaddwd m6, m0 + pmaddwd m5, m1 + paddd m6, m5 + + phaddd m6, m6 + vpermq m6, m6, q3120 + paddd xm6, xm2 + psrad xm6, INTERP_SHIFT_PS + packssdw xm6, xm6 + movq [r2 + 16], xm6 + add r2, r3 add r0, r1 dec r4d jnz .loop0 RET %endif -%endmacro - - IPFILTER_CHROMA_PS_6xN_AVX2 8 - IPFILTER_CHROMA_PS_6xN_AVX2 16 diff --git a/source/common/x86/h4-ipfilter16.asm b/source/common/x86/h4-ipfilter16.asm new file mode 100644 index 0000000000..727ea95e79 --- /dev/null +++ b/source/common/x86/h4-ipfilter16.asm @@ -0,0 +1,2632 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Nabajit Deka +;* Murugan Vairavel +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
+;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. +;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + + +%define INTERP_OFFSET_PP pd_32 +%define INTERP_SHIFT_PP 6 + +%if BIT_DEPTH == 10 + %define INTERP_SHIFT_PS 2 + %define INTERP_OFFSET_PS pd_n32768 + %define INTERP_SHIFT_SP 10 + %define INTERP_OFFSET_SP h4_pd_524800 +%elif BIT_DEPTH == 12 + %define INTERP_SHIFT_PS 4 + %define INTERP_OFFSET_PS pd_n131072 + %define INTERP_SHIFT_SP 8 + %define INTERP_OFFSET_SP pd_524416 +%else + %error Unsupport bit depth! +%endif + + +SECTION_RODATA 32 + +tab_c_32: times 8 dd 32 +h4_pd_524800: times 8 dd 524800 + +tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +tab_ChromaCoeff: dw 0, 64, 0, 0 + dw -2, 58, 10, -2 + dw -4, 54, 16, -2 + dw -6, 46, 28, -4 + dw -4, 36, 36, -4 + dw -4, 28, 46, -6 + dw -2, 16, 54, -4 + dw -2, 10, 58, -2 + +const h4_interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + +SECTION .text +cextern pd_8 +cextern pd_32 +cextern pw_pixel_max +cextern pd_524416 +cextern pd_n32768 +cextern pd_n131072 +cextern pw_2000 +cextern idct8_shuf2 + +%macro FILTERH_W2_4_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + r1 + %1] + movhps m4, [r0 + r1 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + movh m5, [r0 + 2 * r1 + %1] + movhps m5, [r0 + 2 * r1 + %1 + 2] + pmaddwd m5, m0 + movh m4, [r0 + r4 + %1] + movhps m4, [r0 + r4 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 +%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + 
packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movd [r2 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 * 2 + %1], m3 + psrldq m3, 4 + movd [r2 + r5 + %1], m3 +%endmacro + +%macro FILTERH_W2_3_sse3 1 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + r1 + %1] + movhps m4, [r0 + r1 + %1 + 2] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + 2 * r1 + %1] + movhps m5, [r0 + 2 * r1 + %1 + 2] + pmaddwd m5, m0 + + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m5, m5, q3120 + paddd m5, m1 + + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 + + movd [r2 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 + %1], m3 + psrldq m3, 4 + movd [r2 + r3 * 2 + %1], m3 +%endmacro + +%macro FILTERH_W4_2_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + %1 + 4] + movhps m4, [r0 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + r1 + %1] + movhps m5, [r0 + r1 + %1 + 2] + pmaddwd m5, m0 + movh m4, [r0 + r1 + %1 + 4] + movhps m4, [r0 + r1 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 +%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + %1], m3 + movhps [r2 + r3 + %1], m3 +%endmacro + +%macro FILTERH_W4_1_sse3 1 + movh m3, [r0 + 2 * r1 + %1] + movhps m3, [r0 + 2 * r1 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + 2 * r1 + %1 + 4] + movhps m4, [r0 + 2 
* r1 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 + movh [r2 + r3 * 2 + %1], m3 +%endmacro + +%macro FILTERH_W8_1_sse3 2 + movh m3, [r0 + %1] + movhps m3, [r0 + %1 + 2] + pmaddwd m3, m0 + movh m4, [r0 + %1 + 4] + movhps m4, [r0 + %1 + 6] + pmaddwd m4, m0 + pshufd m2, m3, q2301 + paddd m3, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m3, m3, q3120 + pshufd m4, m4, q3120 + punpcklqdq m3, m4 + paddd m3, m1 + + movh m5, [r0 + %1 + 8] + movhps m5, [r0 + %1 + 10] + pmaddwd m5, m0 + movh m4, [r0 + %1 + 12] + movhps m4, [r0 + %1 + 14] + pmaddwd m4, m0 + pshufd m2, m5, q2301 + paddd m5, m2 + pshufd m2, m4, q2301 + paddd m4, m2 + pshufd m5, m5, q3120 + pshufd m4, m4, q3120 + punpcklqdq m5, m4 + paddd m5, m1 +%ifidn %2, pp + psrad m3, 6 + psrad m5, 6 + packssdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movdqu [r2 + %1], m3 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_HOR_CHROMA_sse3 3 +INIT_XMM sse3 +cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8 + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movddup m0, [r6 + r4 * 4] +%else + movddup m0, [tab_ChromaCoeff + r4 * 4] +%endif + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 +%if %1 <= 6 + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%endif + je .skip + sub r0, r1 +%if %1 <= 6 +%assign y 1 +%else +%assign y 3 +%endif +%assign z 0 +%rep y +%assign x 0 +%rep %1/8 + FILTERH_W8_1_sse3 x, %3 +%assign x x+16 +%endrep +%if %1 == 4 || (%1 == 6 && z == 0) 
|| (%1 == 12 && z == 0) + FILTERH_W4_2_sse3 x, %3 + FILTERH_W4_1_sse3 x +%assign x x+8 +%endif +%if %1 == 2 || (%1 == 6 && z == 0) + FILTERH_W2_3_sse3 x +%endif +%if %1 <= 6 + lea r0, [r0 + r4] + lea r2, [r2 + r5] +%else + lea r0, [r0 + r1] + lea r2, [r2 + r3] +%endif +%assign z z+1 +%endrep +.skip: +%elifidn %3, pp + pxor m7, m7 + mova m6, [pw_pixel_max] + mova m1, [tab_c_32] +%if %1 == 2 || %1 == 6 + lea r4, [r1 * 3] + lea r5, [r3 * 3] +%endif +%endif + +%if %1 == 2 +%assign y %2/4 +%elif %1 <= 6 +%assign y %2/2 +%else +%assign y %2 +%endif +%assign z 0 +%rep y +%assign x 0 +%rep %1/8 + FILTERH_W8_1_sse3 x, %3 +%assign x x+16 +%endrep +%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0) + FILTERH_W4_2_sse3 x, %3 +%assign x x+8 +%endif +%if %1 == 2 || (%1 == 6 && (z % 2) == 0) + FILTERH_W2_4_sse3 x, %3 +%endif +%assign z z+1 +%if z < y +%if %1 == 2 + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] +%elif %1 <= 6 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] +%else + lea r0, [r0 + r1] + lea r2, [r2 + r3] +%endif +%endif ;z < y +%endrep + + RET +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +FILTER_HOR_CHROMA_sse3 2, 4, pp +FILTER_HOR_CHROMA_sse3 2, 8, pp +FILTER_HOR_CHROMA_sse3 2, 16, pp +FILTER_HOR_CHROMA_sse3 4, 2, pp +FILTER_HOR_CHROMA_sse3 4, 4, pp +FILTER_HOR_CHROMA_sse3 4, 8, pp +FILTER_HOR_CHROMA_sse3 4, 16, pp +FILTER_HOR_CHROMA_sse3 4, 32, pp +FILTER_HOR_CHROMA_sse3 6, 8, pp +FILTER_HOR_CHROMA_sse3 6, 16, pp +FILTER_HOR_CHROMA_sse3 8, 2, pp +FILTER_HOR_CHROMA_sse3 8, 4, pp +FILTER_HOR_CHROMA_sse3 8, 6, pp +FILTER_HOR_CHROMA_sse3 8, 8, pp +FILTER_HOR_CHROMA_sse3 8, 12, pp +FILTER_HOR_CHROMA_sse3 8, 16, pp +FILTER_HOR_CHROMA_sse3 8, 32, pp +FILTER_HOR_CHROMA_sse3 8, 64, pp +FILTER_HOR_CHROMA_sse3 12, 16, pp 
+FILTER_HOR_CHROMA_sse3 12, 32, pp +FILTER_HOR_CHROMA_sse3 16, 4, pp +FILTER_HOR_CHROMA_sse3 16, 8, pp +FILTER_HOR_CHROMA_sse3 16, 12, pp +FILTER_HOR_CHROMA_sse3 16, 16, pp +FILTER_HOR_CHROMA_sse3 16, 24, pp +FILTER_HOR_CHROMA_sse3 16, 32, pp +FILTER_HOR_CHROMA_sse3 16, 64, pp +FILTER_HOR_CHROMA_sse3 24, 32, pp +FILTER_HOR_CHROMA_sse3 24, 64, pp +FILTER_HOR_CHROMA_sse3 32, 8, pp +FILTER_HOR_CHROMA_sse3 32, 16, pp +FILTER_HOR_CHROMA_sse3 32, 24, pp +FILTER_HOR_CHROMA_sse3 32, 32, pp +FILTER_HOR_CHROMA_sse3 32, 48, pp +FILTER_HOR_CHROMA_sse3 32, 64, pp +FILTER_HOR_CHROMA_sse3 48, 64, pp +FILTER_HOR_CHROMA_sse3 64, 16, pp +FILTER_HOR_CHROMA_sse3 64, 32, pp +FILTER_HOR_CHROMA_sse3 64, 48, pp +FILTER_HOR_CHROMA_sse3 64, 64, pp + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +FILTER_HOR_CHROMA_sse3 2, 4, ps +FILTER_HOR_CHROMA_sse3 2, 8, ps +FILTER_HOR_CHROMA_sse3 2, 16, ps +FILTER_HOR_CHROMA_sse3 4, 2, ps +FILTER_HOR_CHROMA_sse3 4, 4, ps +FILTER_HOR_CHROMA_sse3 4, 8, ps +FILTER_HOR_CHROMA_sse3 4, 16, ps +FILTER_HOR_CHROMA_sse3 4, 32, ps +FILTER_HOR_CHROMA_sse3 6, 8, ps +FILTER_HOR_CHROMA_sse3 6, 16, ps +FILTER_HOR_CHROMA_sse3 8, 2, ps +FILTER_HOR_CHROMA_sse3 8, 4, ps +FILTER_HOR_CHROMA_sse3 8, 6, ps +FILTER_HOR_CHROMA_sse3 8, 8, ps +FILTER_HOR_CHROMA_sse3 8, 12, ps +FILTER_HOR_CHROMA_sse3 8, 16, ps +FILTER_HOR_CHROMA_sse3 8, 32, ps +FILTER_HOR_CHROMA_sse3 8, 64, ps +FILTER_HOR_CHROMA_sse3 12, 16, ps +FILTER_HOR_CHROMA_sse3 12, 32, ps +FILTER_HOR_CHROMA_sse3 16, 4, ps +FILTER_HOR_CHROMA_sse3 16, 8, ps +FILTER_HOR_CHROMA_sse3 16, 12, ps +FILTER_HOR_CHROMA_sse3 16, 16, ps +FILTER_HOR_CHROMA_sse3 16, 24, ps +FILTER_HOR_CHROMA_sse3 16, 32, ps +FILTER_HOR_CHROMA_sse3 16, 64, ps +FILTER_HOR_CHROMA_sse3 24, 32, ps +FILTER_HOR_CHROMA_sse3 24, 64, ps 
+FILTER_HOR_CHROMA_sse3 32, 8, ps +FILTER_HOR_CHROMA_sse3 32, 16, ps +FILTER_HOR_CHROMA_sse3 32, 24, ps +FILTER_HOR_CHROMA_sse3 32, 32, ps +FILTER_HOR_CHROMA_sse3 32, 48, ps +FILTER_HOR_CHROMA_sse3 32, 64, ps +FILTER_HOR_CHROMA_sse3 48, 64, ps +FILTER_HOR_CHROMA_sse3 64, 16, ps +FILTER_HOR_CHROMA_sse3 64, 32, ps +FILTER_HOR_CHROMA_sse3 64, 48, ps +FILTER_HOR_CHROMA_sse3 64, 64, ps + +%macro FILTER_W2_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + r1] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + packusdw m3, m3 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 +%endif + movd [r2], m3 + pextrd [r2 + r3], m3, 1 +%endmacro + +%macro FILTER_W4_2 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + r1] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + r1 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m7, m6 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + r3], m3 +%endmacro + +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_CHROMA_H 6 +INIT_XMM sse4 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%6, [tab_ChromaCoeff] + movh m0, [r%6 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 + je .skip + sub r0, r1 + 
movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + + %if %1 == 4 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + %else + phaddd m3, m3 + %endif + + paddd m3, m1 + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 + + %if %1 == 2 + movd [r2], m3 + %else + movh [r2], m3 + %endif + + add r0, r1 + add r2, r3 + FILTER_W%1_2 %3 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + +.skip: + +%else ;%ifidn %3, ps + pxor m7, m7 + mova m6, [pw_pixel_max] + mova m1, [tab_c_32] +%endif ;%ifidn %3, ps + + FILTER_W%1_2 %3 + +%rep (%2/2) - 1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + FILTER_W%1_2 %3 +%endrep + RET +%endmacro + +FILTER_CHROMA_H 2, 4, pp, 6, 8, 5 +FILTER_CHROMA_H 2, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 2, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 4, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 8, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 16, pp, 6, 8, 5 + +FILTER_CHROMA_H 2, 4, ps, 7, 5, 6 +FILTER_CHROMA_H 2, 8, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 2, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 4, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 8, ps, 7, 6, 6 +FILTER_CHROMA_H 4, 16, ps, 7, 6, 6 + +FILTER_CHROMA_H 2, 16, pp, 6, 8, 5 +FILTER_CHROMA_H 4, 32, pp, 6, 8, 5 +FILTER_CHROMA_H 2, 16, ps, 7, 5, 6 +FILTER_CHROMA_H 4, 32, ps, 7, 6, 6 + + +%macro FILTER_W6_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m4, [r0 + 8] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m4, m4 + paddd m4, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m4, INTERP_SHIFT_PP + packusdw m3, m4 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS + packssdw m3, m4 +%endif + movh [r2], m3 + pextrd [r2 + 8], m3, 2 +%endmacro + +cglobal chroma_filter_pp_6x1_internal + FILTER_W6_1 pp + ret + +cglobal chroma_filter_ps_6x1_internal + FILTER_W6_1 ps + ret + +%macro FILTER_W8_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, 
m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 +%endmacro + +cglobal chroma_filter_pp_8x1_internal + FILTER_W8_1 pp + ret + +cglobal chroma_filter_ps_8x1_internal + FILTER_W8_1 ps + ret + +%macro FILTER_W12_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + packusdw m3, m3 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + packssdw m3, m3 +%endif + movh [r2 + 16], m3 +%endmacro + +cglobal chroma_filter_pp_12x1_internal + FILTER_W12_1 pp + ret + +cglobal chroma_filter_ps_12x1_internal + FILTER_W12_1 ps + ret + +%macro FILTER_W16_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, 
INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 +%endmacro + +cglobal chroma_filter_pp_16x1_internal + FILTER_W16_1 pp + ret + +cglobal chroma_filter_ps_16x1_internal + FILTER_W16_1 ps + ret + +%macro FILTER_W24_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + 
phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 +%endmacro + +cglobal chroma_filter_pp_24x1_internal + FILTER_W24_1 pp + ret + +cglobal chroma_filter_ps_24x1_internal + FILTER_W24_1 ps + ret + +%macro FILTER_W32_1 1 + movu m3, [r0] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2], m3 + movhps [r2 + 8], m3 + + movu m3, [r0 + 16] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 20] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 24] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 28] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 16], m3 + movhps [r2 + 24], m3 + + movu m3, [r0 + 32] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 36] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 40] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 44] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad 
m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 32], m3 + movhps [r2 + 40], m3 + + movu m3, [r0 + 48] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + 52] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + 56] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + 60] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + 48], m3 + movhps [r2 + 56], m3 +%endmacro + +cglobal chroma_filter_pp_32x1_internal + FILTER_W32_1 pp + ret + +cglobal chroma_filter_ps_32x1_internal + FILTER_W32_1 ps + ret + +%macro FILTER_W8o_1 2 + movu m3, [r0 + %2] + pshufb m3, m3, m2 + pmaddwd m3, m0 + movu m4, [r0 + %2 + 4] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m1 + + movu m5, [r0 + %2 + 8] + pshufb m5, m5, m2 + pmaddwd m5, m0 + movu m4, [r0 + %2 + 12] + pshufb m4, m4, m2 + pmaddwd m4, m0 + phaddd m5, m4 + paddd m5, m1 +%ifidn %1, pp + psrad m3, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + packusdw m3, m5 + CLIPW m3, m6, m7 +%else + psrad m3, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + packssdw m3, m5 +%endif + movh [r2 + %2], m3 + movhps [r2 + %2 + 8], m3 +%endmacro + +%macro FILTER_W48_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 +%endmacro + +cglobal chroma_filter_pp_48x1_internal + FILTER_W48_1 pp + ret + +cglobal chroma_filter_ps_48x1_internal + FILTER_W48_1 ps + ret + +%macro FILTER_W64_1 1 + FILTER_W8o_1 %1, 0 + FILTER_W8o_1 %1, 16 + FILTER_W8o_1 %1, 32 + FILTER_W8o_1 %1, 48 + FILTER_W8o_1 %1, 64 + FILTER_W8o_1 %1, 80 + FILTER_W8o_1 %1, 96 + FILTER_W8o_1 %1, 112 +%endmacro + +cglobal 
chroma_filter_pp_64x1_internal + FILTER_W64_1 pp + ret + +cglobal chroma_filter_ps_64x1_internal + FILTER_W64_1 ps + ret +;----------------------------------------------------------------------------- +; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- + +INIT_XMM sse4 +%macro IPFILTER_CHROMA 6 +cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6 + + add r3, r3 + add r1, r1 + sub r0, 2 + mov r4d, r4m + add r4d, r4d + +%ifdef PIC + lea r%4, [tab_ChromaCoeff] + movh m0, [r%4 + r4 * 4] +%else + movh m0, [tab_ChromaCoeff + r4 * 4] +%endif + + punpcklqdq m0, m0 + mova m2, [tab_Tm16] + +%ifidn %3, ps + mova m1, [INTERP_OFFSET_PS] + cmp r5m, byte 0 + je .skip + sub r0, r1 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal + add r0, r1 + add r2, r3 +.skip: +%else + mova m1, [tab_c_32] + pxor m6, m6 + mova m7, [pw_pixel_max] +%endif + + call chroma_filter_%3_%1x1_internal +%rep %2 - 1 + add r0, r1 + add r2, r3 + call chroma_filter_%3_%1x1_internal +%endrep +RET +%endmacro +IPFILTER_CHROMA 6, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 2, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 6, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 4, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 8, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 32, pp, 5, 6, 8 + +IPFILTER_CHROMA 6, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 2, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 4, ps, 6, 7, 6 
+IPFILTER_CHROMA 8, 6, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 4, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 24, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 8, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 32, ps, 6, 7, 6 + +IPFILTER_CHROMA 6, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 12, pp, 5, 6, 8 +IPFILTER_CHROMA 8, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 12, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 24, pp, 5, 6, 8 +IPFILTER_CHROMA 16, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 24, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 32, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 6, 16, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 12, ps, 6, 7, 6 +IPFILTER_CHROMA 8, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 12, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 24, ps, 6, 7, 6 +IPFILTER_CHROMA 16, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 24, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 32, 64, ps, 6, 7, 6 + +IPFILTER_CHROMA 48, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 48, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 64, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 32, pp, 5, 6, 8 +IPFILTER_CHROMA 64, 16, pp, 5, 6, 8 +IPFILTER_CHROMA 48, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 48, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 64, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 32, ps, 6, 7, 6 +IPFILTER_CHROMA 64, 16, ps, 6, 7, 6 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_6xN 1 +cglobal interp_4tap_horiz_pp_6x%1, 
5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1/2 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2], xm3 + pextrd [r2 + 8], xm3, 2 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2 + r3], xm3 + pextrd [r2 + r3 + 8], xm3, 2 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro +IPFILTER_CHROMA_avx2_6xN 8 +IPFILTER_CHROMA_avx2_6xN 16 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x2, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + vbroadcasti128 m3, [r0] + 
vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, INTERP_SHIFT_PP ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x4, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + +%rep 2 + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3,q2020 + pshufb xm3, xm6 ; m3 = 
WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] +%endrep + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_8xN 1 +cglobal interp_4tap_horiz_pp_8x%1, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1/2 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + r1] + vbroadcasti128 m4, [r0 + r1 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2 + r3], xm3 + + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET +%endmacro +IPFILTER_CHROMA_avx2_8xN 6 +IPFILTER_CHROMA_avx2_8xN 8 +IPFILTER_CHROMA_avx2_8xN 12 +IPFILTER_CHROMA_avx2_8xN 16 +IPFILTER_CHROMA_avx2_8xN 32 +IPFILTER_CHROMA_avx2_8xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t 
srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_16xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_16x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m8, [r0 + 24] + + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m4, m4 + vpermq m4, m4, q2020 + pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + CLIPW m3, m5, m7 + movu [r2], m3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_16xN 4 +IPFILTER_CHROMA_avx2_16xN 8 +IPFILTER_CHROMA_avx2_16xN 12 +IPFILTER_CHROMA_avx2_16xN 16 +IPFILTER_CHROMA_avx2_16xN 24 +IPFILTER_CHROMA_avx2_16xN 32 +IPFILTER_CHROMA_avx2_16xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_32xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_32x%1, 
5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r6d, %1 +.loop: +%assign x 0 +%rep 2 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m4, m4 + vpermq m4, m4, q2020 + pshufb xm4, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + vinserti128 m3, m3, xm4, 1 + CLIPW m3, m5, m7 + movu [r2 + x], m3 + %assign x x+32 + %endrep + + add r0, r1 + add r2, r3 + dec r6d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_32xN 8 +IPFILTER_CHROMA_avx2_32xN 16 +IPFILTER_CHROMA_avx2_32xN 24 +IPFILTER_CHROMA_avx2_32xN 32 +IPFILTER_CHROMA_avx2_32xN 48 +IPFILTER_CHROMA_avx2_32xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_12xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_12x%1, 5,6,8 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + 
vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movu [r2], xm3 + + vbroadcasti128 m3, [r0 + 16] + vbroadcasti128 m4, [r0 + 24] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 ; m3 = DWORD[7 6 3 2 5 4 1 0] + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 ; m3 = WORD[7 6 5 4 3 2 1 0] + CLIPW xm3, xm5, xm7 + movq [r2 + 16], xm3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_12xN 16 +IPFILTER_CHROMA_avx2_12xN 32 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_24xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_24x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, %1 +.loop: + vbroadcasti128 m3, [r0] + vbroadcasti128 m4, [r0 + 8] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m8, [r0 + 24] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd 
m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 32] + vbroadcasti128 m4, [r0 + 40] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + packusdw m3, m3 + vpermq m3, m3, q2020 + pshufb xm3, xm6 + CLIPW xm3, xm5, xm7 + movu [r2 + 32], xm3 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_24xN 32 +IPFILTER_CHROMA_avx2_24xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%macro IPFILTER_CHROMA_avx2_64xN 1 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_64x%1, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r6d, %1 +.loop: +%assign x 0 +%rep 4 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2 + x], m3 + %assign x x+32 + %endrep + + add r0, r1 + add r2, r3 + dec r6d + jnz .loop + RET +%endif +%endmacro +IPFILTER_CHROMA_avx2_64xN 16 +IPFILTER_CHROMA_avx2_64xN 32 
+IPFILTER_CHROMA_avx2_64xN 48 +IPFILTER_CHROMA_avx2_64xN 64 + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +%if ARCH_X86_64 +cglobal interp_4tap_horiz_pp_48x64, 5,6,9 + add r1d, r1d + add r3d, r3d + sub r0, 2 + mov r4d, r4m +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m1, [h4_interp8_hpp_shuf] + vpbroadcastd m2, [pd_32] + pxor m5, m5 + mova m6, [idct8_shuf2] + mova m7, [pw_pixel_max] + + mov r4d, 64 +.loop: +%assign x 0 +%rep 3 + vbroadcasti128 m3, [r0 + x] + vbroadcasti128 m4, [r0 + 8 + x] + pshufb m3, m1 + pshufb m4, m1 + + pmaddwd m3, m0 + pmaddwd m4, m0 + phaddd m3, m4 + paddd m3, m2 + psrad m3, 6 + + vbroadcasti128 m4, [r0 + 16 + x] + vbroadcasti128 m8, [r0 + 24 + x] + pshufb m4, m1 + pshufb m8, m1 + + pmaddwd m4, m0 + pmaddwd m8, m0 + phaddd m4, m8 + paddd m4, m2 + psrad m4, 6 + + packusdw m3, m4 + vpermq m3, m3, q3120 + pshufb m3, m6 + CLIPW m3, m5, m7 + movu [r2 + x], m3 +%assign x x+32 +%endrep + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endif + +%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 
+ pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_CHROMA_PS_8xN_AVX2 4 + IPFILTER_CHROMA_PS_8xN_AVX2 8 + IPFILTER_CHROMA_PS_8xN_AVX2 16 + IPFILTER_CHROMA_PS_8xN_AVX2 32 + IPFILTER_CHROMA_PS_8xN_AVX2 6 + IPFILTER_CHROMA_PS_8xN_AVX2 2 + IPFILTER_CHROMA_PS_8xN_AVX2 12 + IPFILTER_CHROMA_PS_8xN_AVX2 64 + +%macro IPFILTER_CHROMA_PS_16xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_16xN_AVX2 16 +IPFILTER_CHROMA_PS_16xN_AVX2 8 +IPFILTER_CHROMA_PS_16xN_AVX2 32 +IPFILTER_CHROMA_PS_16xN_AVX2 12 +IPFILTER_CHROMA_PS_16xN_AVX2 4 +IPFILTER_CHROMA_PS_16xN_AVX2 64 +IPFILTER_CHROMA_PS_16xN_AVX2 24 + +%macro IPFILTER_CHROMA_PS_24xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal 
interp_4tap_horiz_ps_24x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_24xN_AVX2 32 +IPFILTER_CHROMA_PS_24xN_AVX2 64 + +%macro IPFILTER_CHROMA_PS_12xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_12x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, 
[r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m3 + pmaddwd m4, m0 + phaddd m4, m4 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movq [r2 + 16], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_12xN_AVX2 16 +IPFILTER_CHROMA_PS_12xN_AVX2 32 + +%macro IPFILTER_CHROMA_PS_32xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 
+ 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_32xN_AVX2 32 +IPFILTER_CHROMA_PS_32xN_AVX2 16 +IPFILTER_CHROMA_PS_32xN_AVX2 24 +IPFILTER_CHROMA_PS_32xN_AVX2 8 +IPFILTER_CHROMA_PS_32xN_AVX2 64 +IPFILTER_CHROMA_PS_32xN_AVX2 48 + + +%macro IPFILTER_CHROMA_PS_64xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 + 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd 
m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + vbroadcasti128 m4, [r0 + 64] + vbroadcasti128 m5, [r0 + 72] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 64], xm4 + + vbroadcasti128 m4, [r0 + 80] + vbroadcasti128 m5, [r0 + 88] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 80], xm4 + + vbroadcasti128 m4, [r0 + 96] + vbroadcasti128 m5, [r0 + 104] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 96], xm4 + + vbroadcasti128 m4, [r0 + 112] + vbroadcasti128 m5, [r0 + 120] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 112], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + +IPFILTER_CHROMA_PS_64xN_AVX2 64 +IPFILTER_CHROMA_PS_64xN_AVX2 48 +IPFILTER_CHROMA_PS_64xN_AVX2 32 +IPFILTER_CHROMA_PS_64xN_AVX2 16 + +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, 64 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + 
vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2], xm4 + + vbroadcasti128 m4, [r0 + 16] + vbroadcasti128 m5, [r0 + 24] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 16], xm4 + + vbroadcasti128 m4, [r0 + 32] + vbroadcasti128 m5, [r0 + 40] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 32], xm4 + + vbroadcasti128 m4, [r0 + 48] + vbroadcasti128 m5, [r0 + 56] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 48], xm4 + + vbroadcasti128 m4, [r0 + 64] + vbroadcasti128 m5, [r0 + 72] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 64], xm4 + + vbroadcasti128 m4, [r0 + 80] + vbroadcasti128 m5, [r0 + 88] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movu [r2 + 80], xm4 + + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif + +%macro IPFILTER_CHROMA_PS_6xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6 + add r1d, r1d + add r3d, r3d + mov r4d, r4m + mov r5d, r5m + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, 
[tab_ChromaCoeff + r4 * 8] +%endif + mova m3, [h4_interp8_hpp_shuf] + vbroadcasti128 m2, [INTERP_OFFSET_PS] + + ; register map + ; m0 , m1 interpolate coeff + + sub r0, 2 + test r5d, r5d + mov r4d, %1 + jz .loop0 + sub r0, r1 + add r4d, 3 + +.loop0: + vbroadcasti128 m4, [r0] + vbroadcasti128 m5, [r0 + 8] + pshufb m4, m3 + pshufb m5, m3 + pmaddwd m4, m0 + pmaddwd m5, m0 + phaddd m4, m5 + paddd m4, m2 + vpermq m4, m4, q3120 + psrad m4, INTERP_SHIFT_PS + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop0 + RET +%endif +%endmacro + + IPFILTER_CHROMA_PS_6xN_AVX2 8 + IPFILTER_CHROMA_PS_6xN_AVX2 16 From 36301974e97b0b7a978f2d8e0ff9a2b863f9b623 Mon Sep 17 00:00:00 2001 From: Mythreyi P Date: Thu, 15 Feb 2018 02:21:26 -0800 Subject: [PATCH 50/51] x86: Split ipfilter16 kernals part3 Port vertical 4tap kernals from ipfilter16.asm to new source file, v4-ipfilter16.asm to reduce compile time of x265. --- source/common/CMakeLists.txt | 2 +- source/common/x86/ipfilter16.asm | 7604 ++++++++------------------- source/common/x86/v4-ipfilter16.asm | 3529 +++++++++++++ 3 files changed, 5598 insertions(+), 5537 deletions(-) create mode 100644 source/common/x86/v4-ipfilter16.asm diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 0eb2dfb8f2..c70bb108c9 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -61,7 +61,7 @@ if(ENABLE_ASSEMBLY AND X86) mc-a2.asm pixel-util8.asm blockcopy8.asm pixeladd8.asm dct8.asm seaintegral.asm) if(HIGH_BIT_DEPTH) - set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h4-ipfilter16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm) + set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm v4-ipfilter16.asm h4-ipfilter16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm) else() set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm v4-ipfilter8.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm) endif() diff --git 
a/source/common/x86/ipfilter16.asm b/source/common/x86/ipfilter16.asm index c5b775121e..9582ccabeb 100644 --- a/source/common/x86/ipfilter16.asm +++ b/source/common/x86/ipfilter16.asm @@ -51,54 +51,6 @@ tab_c_524800: times 4 dd 524800 tab_c_n8192: times 8 dw -8192 pd_524800: times 8 dd 524800 -const tab_ChromaCoeffV, times 8 dw 0, 64 - times 8 dw 0, 0 - - times 8 dw -2, 58 - times 8 dw 10, -2 - - times 8 dw -4, 54 - times 8 dw 16, -2 - - times 8 dw -6, 46 - times 8 dw 28, -4 - - times 8 dw -4, 36 - times 8 dw 36, -4 - - times 8 dw -4, 28 - times 8 dw 46, -6 - - times 8 dw -2, 16 - times 8 dw 54, -4 - - times 8 dw -2, 10 - times 8 dw 58, -2 - -tab_ChromaCoeffVer: times 8 dw 0, 64 - times 8 dw 0, 0 - - times 8 dw -2, 58 - times 8 dw 10, -2 - - times 8 dw -4, 54 - times 8 dw 16, -2 - - times 8 dw -6, 46 - times 8 dw 28, -4 - - times 8 dw -4, 36 - times 8 dw 36, -4 - - times 8 dw -4, 28 - times 8 dw 46, -6 - - times 8 dw -2, 16 - times 8 dw 54, -4 - - times 8 dw -2, 10 - times 8 dw 58, -2 - ALIGN 32 tab_LumaCoeffV: times 4 dw 0, 0 times 4 dw 0, 64 @@ -349,121 +301,115 @@ cglobal interp_8tap_vert_%1_%2x%3, 5, 7, 8 FILTER_VER_LUMA_sse2 ps, 64, 16 FILTER_VER_LUMA_sse2 ps, 16, 64 -%macro PROCESS_CHROMA_SP_W4_4R 0 + +%macro PROCESS_LUMA_VER_W4_4R 0 movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1 + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2 + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 32] - paddd m0, m4 ;m0=[0+1+2+3] Row1 done + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 32] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 + 
pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m4, [r6 + 1 * 32] - paddd m2, m4 ;m2=[2+3+4+5] Row3 + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - movq m4, [r0 + 2 * r1] + lea r0, [r0 + 2 * r1] + movq m4, [r0] punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m5, [r6 + 1 * 32] - paddd m3, m5 ;m3=[3+4+5+6] Row4 + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end %endmacro -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS 4 -INIT_XMM sse2 -cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_PP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize add r1d, r1d add r3d, r3d - sub r0, r1 + lea r5, [r1 + 2 * r1] + sub r0, r5 shl r4d, 6 %ifdef PIC - lea r5, [tab_ChromaCoeffV] + lea r5, [tab_LumaCoeffV] lea r6, [r5 + r4] %else - lea r6, [tab_ChromaCoeffV + r4] + lea r6, [tab_LumaCoeffV + r4] %endif - mov dword [rsp], %2/4 - -%ifnidn %3, ss - %ifnidn %3, ps - mova m7, [pw_pixel_max] - %ifidn %3, pp - mova m6, [INTERP_OFFSET_PP] - %else - mova m6, [INTERP_OFFSET_SP] - %endif - %else - mova m6, [INTERP_OFFSET_PS] - %endif -%endif + mova m7, [INTERP_OFFSET_PP] + mov dword [rsp], %2/4 .loopH: mov r4d, (%1/4) .loopW: - PROCESS_CHROMA_SP_W4_4R + PROCESS_LUMA_VER_W4_4R -%ifidn %3, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 - packssdw m0, m1 - packssdw m2, m3 -%elifidn %3, ps - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP packssdw m0, m1 packssdw m2, m3 -%else - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - %ifidn %3, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - %endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, m7 -%endif + + pxor m1, m1 + CLIPW2 m0, m2, m1, [pw_pixel_max] movh [r2], m0 movhps [r2 + r3], m0 @@ -471,7 +417,7 @@ cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize movh [r5], m2 movhps [r5 + r3], m2 - lea r5, [4 * r1 - 2 * 4] + lea r5, [8 * r1 - 2 * 4] sub r0, 
r5 add r2, 2 * 4 @@ -483,776 +429,287 @@ cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize dec dword [rsp] jnz .loopH - RET %endmacro - FILTER_VER_CHROMA_SS 4, 4, ss, 6 - FILTER_VER_CHROMA_SS 4, 8, ss, 6 - FILTER_VER_CHROMA_SS 16, 16, ss, 6 - FILTER_VER_CHROMA_SS 16, 8, ss, 6 - FILTER_VER_CHROMA_SS 16, 12, ss, 6 - FILTER_VER_CHROMA_SS 12, 16, ss, 6 - FILTER_VER_CHROMA_SS 16, 4, ss, 6 - FILTER_VER_CHROMA_SS 4, 16, ss, 6 - FILTER_VER_CHROMA_SS 32, 32, ss, 6 - FILTER_VER_CHROMA_SS 32, 16, ss, 6 - FILTER_VER_CHROMA_SS 16, 32, ss, 6 - FILTER_VER_CHROMA_SS 32, 24, ss, 6 - FILTER_VER_CHROMA_SS 24, 32, ss, 6 - FILTER_VER_CHROMA_SS 32, 8, ss, 6 - - FILTER_VER_CHROMA_SS 4, 4, ps, 7 - FILTER_VER_CHROMA_SS 4, 8, ps, 7 - FILTER_VER_CHROMA_SS 16, 16, ps, 7 - FILTER_VER_CHROMA_SS 16, 8, ps, 7 - FILTER_VER_CHROMA_SS 16, 12, ps, 7 - FILTER_VER_CHROMA_SS 12, 16, ps, 7 - FILTER_VER_CHROMA_SS 16, 4, ps, 7 - FILTER_VER_CHROMA_SS 4, 16, ps, 7 - FILTER_VER_CHROMA_SS 32, 32, ps, 7 - FILTER_VER_CHROMA_SS 32, 16, ps, 7 - FILTER_VER_CHROMA_SS 16, 32, ps, 7 - FILTER_VER_CHROMA_SS 32, 24, ps, 7 - FILTER_VER_CHROMA_SS 24, 32, ps, 7 - FILTER_VER_CHROMA_SS 32, 8, ps, 7 - - FILTER_VER_CHROMA_SS 4, 4, sp, 8 - FILTER_VER_CHROMA_SS 4, 8, sp, 8 - FILTER_VER_CHROMA_SS 16, 16, sp, 8 - FILTER_VER_CHROMA_SS 16, 8, sp, 8 - FILTER_VER_CHROMA_SS 16, 12, sp, 8 - FILTER_VER_CHROMA_SS 12, 16, sp, 8 - FILTER_VER_CHROMA_SS 16, 4, sp, 8 - FILTER_VER_CHROMA_SS 4, 16, sp, 8 - FILTER_VER_CHROMA_SS 32, 32, sp, 8 - FILTER_VER_CHROMA_SS 32, 16, sp, 8 - FILTER_VER_CHROMA_SS 16, 32, sp, 8 - FILTER_VER_CHROMA_SS 32, 24, sp, 8 - FILTER_VER_CHROMA_SS 24, 32, sp, 8 - FILTER_VER_CHROMA_SS 32, 8, sp, 8 - - FILTER_VER_CHROMA_SS 4, 4, pp, 8 - FILTER_VER_CHROMA_SS 4, 8, pp, 8 - FILTER_VER_CHROMA_SS 16, 16, pp, 8 - FILTER_VER_CHROMA_SS 16, 8, pp, 8 - FILTER_VER_CHROMA_SS 16, 12, pp, 8 - FILTER_VER_CHROMA_SS 12, 16, pp, 8 - FILTER_VER_CHROMA_SS 16, 4, pp, 8 - FILTER_VER_CHROMA_SS 4, 16, pp, 8 - FILTER_VER_CHROMA_SS 32, 32, pp, 
8 - FILTER_VER_CHROMA_SS 32, 16, pp, 8 - FILTER_VER_CHROMA_SS 16, 32, pp, 8 - FILTER_VER_CHROMA_SS 32, 24, pp, 8 - FILTER_VER_CHROMA_SS 24, 32, pp, 8 - FILTER_VER_CHROMA_SS 32, 8, pp, 8 - - - FILTER_VER_CHROMA_SS 16, 24, ss, 6 - FILTER_VER_CHROMA_SS 12, 32, ss, 6 - FILTER_VER_CHROMA_SS 4, 32, ss, 6 - FILTER_VER_CHROMA_SS 32, 64, ss, 6 - FILTER_VER_CHROMA_SS 16, 64, ss, 6 - FILTER_VER_CHROMA_SS 32, 48, ss, 6 - FILTER_VER_CHROMA_SS 24, 64, ss, 6 - - FILTER_VER_CHROMA_SS 16, 24, ps, 7 - FILTER_VER_CHROMA_SS 12, 32, ps, 7 - FILTER_VER_CHROMA_SS 4, 32, ps, 7 - FILTER_VER_CHROMA_SS 32, 64, ps, 7 - FILTER_VER_CHROMA_SS 16, 64, ps, 7 - FILTER_VER_CHROMA_SS 32, 48, ps, 7 - FILTER_VER_CHROMA_SS 24, 64, ps, 7 - - FILTER_VER_CHROMA_SS 16, 24, sp, 8 - FILTER_VER_CHROMA_SS 12, 32, sp, 8 - FILTER_VER_CHROMA_SS 4, 32, sp, 8 - FILTER_VER_CHROMA_SS 32, 64, sp, 8 - FILTER_VER_CHROMA_SS 16, 64, sp, 8 - FILTER_VER_CHROMA_SS 32, 48, sp, 8 - FILTER_VER_CHROMA_SS 24, 64, sp, 8 - - FILTER_VER_CHROMA_SS 16, 24, pp, 8 - FILTER_VER_CHROMA_SS 12, 32, pp, 8 - FILTER_VER_CHROMA_SS 4, 32, pp, 8 - FILTER_VER_CHROMA_SS 32, 64, pp, 8 - FILTER_VER_CHROMA_SS 16, 64, pp, 8 - FILTER_VER_CHROMA_SS 32, 48, pp, 8 - FILTER_VER_CHROMA_SS 24, 64, pp, 8 - - - FILTER_VER_CHROMA_SS 48, 64, ss, 6 - FILTER_VER_CHROMA_SS 64, 48, ss, 6 - FILTER_VER_CHROMA_SS 64, 64, ss, 6 - FILTER_VER_CHROMA_SS 64, 32, ss, 6 - FILTER_VER_CHROMA_SS 64, 16, ss, 6 - - FILTER_VER_CHROMA_SS 48, 64, ps, 7 - FILTER_VER_CHROMA_SS 64, 48, ps, 7 - FILTER_VER_CHROMA_SS 64, 64, ps, 7 - FILTER_VER_CHROMA_SS 64, 32, ps, 7 - FILTER_VER_CHROMA_SS 64, 16, ps, 7 - - FILTER_VER_CHROMA_SS 48, 64, sp, 8 - FILTER_VER_CHROMA_SS 64, 48, sp, 8 - FILTER_VER_CHROMA_SS 64, 64, sp, 8 - FILTER_VER_CHROMA_SS 64, 32, sp, 8 - FILTER_VER_CHROMA_SS 64, 16, sp, 8 - - FILTER_VER_CHROMA_SS 48, 64, pp, 8 - FILTER_VER_CHROMA_SS 64, 48, pp, 8 - FILTER_VER_CHROMA_SS 64, 64, pp, 8 - FILTER_VER_CHROMA_SS 64, 32, pp, 8 - FILTER_VER_CHROMA_SS 64, 16, pp, 8 - - -%macro 
PROCESS_CHROMA_SP_W2_4R 1 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - - lea r0, [r0 + 2 * r1] - movd m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - punpcklqdq m0, m1 ;m0=[0 1 1 2] - pmaddwd m0, [%1 + 0 *32] ;m0=[0+1 1+2] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m2, m1 ;m2=[2 3] - - lea r0, [r0 + 2 * r1] - movd m3, [r0] - punpcklwd m1, m3 ;m2=[3 4] - punpcklqdq m2, m1 ;m2=[2 3 3 4] - - pmaddwd m4, m2, [%1 + 1 * 32] ;m4=[2+3 3+4] Row 1-2 - pmaddwd m2, [%1 + 0 * 32] ;m2=[2+3 3+4] Row 3-4 - paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 - - movd m1, [r0 + r1] - punpcklwd m3, m1 ;m3=[4 5] - - movd m4, [r0 + 2 * r1] - punpcklwd m1, m4 ;m1=[5 6] - punpcklqdq m3, m1 ;m2=[4 5 5 6] - pmaddwd m3, [%1 + 1 * 32] ;m3=[4+5 5+6] Row 3-4 - paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 -%endmacro - -;--------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W2 3 -INIT_XMM sse4 -cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3 +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_PP 4, 4 + FILTER_VER_LUMA_PP 8, 8 + FILTER_VER_LUMA_PP 8, 4 + FILTER_VER_LUMA_PP 4, 8 + FILTER_VER_LUMA_PP 16, 16 + FILTER_VER_LUMA_PP 16, 8 + FILTER_VER_LUMA_PP 8, 16 + FILTER_VER_LUMA_PP 16, 12 + FILTER_VER_LUMA_PP 12, 16 + FILTER_VER_LUMA_PP 16, 4 + FILTER_VER_LUMA_PP 4, 16 + FILTER_VER_LUMA_PP 32, 32 + FILTER_VER_LUMA_PP 32, 16 + FILTER_VER_LUMA_PP 16, 32 + FILTER_VER_LUMA_PP 32, 24 + 
FILTER_VER_LUMA_PP 24, 32 + FILTER_VER_LUMA_PP 32, 8 + FILTER_VER_LUMA_PP 8, 32 + FILTER_VER_LUMA_PP 64, 64 + FILTER_VER_LUMA_PP 64, 32 + FILTER_VER_LUMA_PP 32, 64 + FILTER_VER_LUMA_PP 64, 48 + FILTER_VER_LUMA_PP 48, 64 + FILTER_VER_LUMA_PP 64, 16 + FILTER_VER_LUMA_PP 16, 64 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 +%macro FILTER_VER_LUMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [tab_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer + r4] %endif - mov r4d, (%1/4) -%ifnidn %2, ss - %ifnidn %2, ps - pxor m7, m7 - mova m6, [pw_pixel_max] - %ifidn %2, pp - mova m5, [INTERP_OFFSET_PP] - %else - mova m5, [INTERP_OFFSET_SP] - %endif - %else - mova m5, [INTERP_OFFSET_PS] - %endif -%endif + lea r4, [r1 * 3] + sub r0, r4 -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 -%ifidn %2, ss - psrad m0, 6 - psrad m2, 6 - packssdw m0, m2 -%elifidn %2, ps - paddd m0, m5 - paddd m2, m5 - psrad m0, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - packssdw m0, m2 +%ifidn %1,pp + vbroadcasti128 m6, [pd_32] +%elifidn %1, sp + vbroadcasti128 m6, [INTERP_OFFSET_SP] %else - paddd m0, m5 - paddd m2, m5 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - %endif - packusdw m0, m2 - CLIPW m0, m7, m6 + vbroadcasti128 m6, [INTERP_OFFSET_PS] %endif - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - RET -%endmacro - -FILTER_VER_CHROMA_W2 4, ss, 5 -FILTER_VER_CHROMA_W2 8, ss, 5 - -FILTER_VER_CHROMA_W2 4, pp, 8 -FILTER_VER_CHROMA_W2 8, pp, 8 - -FILTER_VER_CHROMA_W2 4, ps, 6 -FILTER_VER_CHROMA_W2 8, ps, 6 - -FILTER_VER_CHROMA_W2 4, sp, 8 -FILTER_VER_CHROMA_W2 8, sp, 8 - -FILTER_VER_CHROMA_W2 16, ss, 5 
-FILTER_VER_CHROMA_W2 16, pp, 8 -FILTER_VER_CHROMA_W2 16, ps, 6 -FILTER_VER_CHROMA_W2 16, sp, 8 - - -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W4 3 -INIT_XMM sse4 -cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m0, m5 + paddd m2, m4 + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + pmaddwd m1, [r5 + 2 * mmsize] + paddd m0, m5 + paddd m2, m1 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + 2 * r1] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] + pmaddwd m4, [r5 + 3 * mmsize] + paddd m2, m4 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 %else - lea r5, [tab_ChromaCoeffV + r4] -%endif - -%ifnidn %2, 2 - mov r4d, %1/2 + paddd m0, m6 + paddd m2, m6 +%ifidn %1,pp + psrad m0, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m0, INTERP_SHIFT_SP + psrad m2, 
INTERP_SHIFT_SP +%else + psrad m0, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS %endif - -%ifnidn %2, ss - %ifnidn %2, ps - pxor m6, m6 - mova m5, [pw_pixel_max] - %ifidn %2, pp - mova m4, [INTERP_OFFSET_PP] - %else - mova m4, [INTERP_OFFSET_SP] - %endif - %else - mova m4, [INTERP_OFFSET_PS] - %endif %endif -%ifnidn %2, 2 -.loop: + packssdw m0, m2 + pxor m1, m1 +%ifidn %1,pp + CLIPW m0, m1, [pw_pixel_max] +%elifidn %1, sp + CLIPW m0, m1, [pw_pixel_max] %endif - movh m0, [r0] - movh m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *32] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movh m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *32] ;m1=[1+2] Row2 - - movh m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 32] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - - movh m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 32] - paddd m1, m3 ;m1=[1+2+3+4] Row2 done - -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - packssdw m0, m1 -%elifidn %2, ps - paddd m0, m4 - paddd m1, m4 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - packssdw m0, m1 -%else - paddd m0, m4 - paddd m1, m4 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - %endif - packusdw m0, m1 - CLIPW m0, m6, m5 -%endif - - movh [r2], m0 - movhps [r2 + r3], m0 - -%ifnidn %2, 2 - lea r2, [r2 + r3 * 2] - dec r4d - jnz .loop -%endif - RET -%endmacro - -FILTER_VER_CHROMA_W4 2, ss, 4 -FILTER_VER_CHROMA_W4 2, pp, 7 -FILTER_VER_CHROMA_W4 2, ps, 5 -FILTER_VER_CHROMA_W4 2, sp, 7 - -FILTER_VER_CHROMA_W4 4, ss, 4 -FILTER_VER_CHROMA_W4 4, pp, 7 -FILTER_VER_CHROMA_W4 4, ps, 5 -FILTER_VER_CHROMA_W4 4, sp, 7 - -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_%1_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) 
-;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W6 3 -INIT_XMM sse4 -cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mov r4d, %1/4 - -%ifnidn %2, ss - %ifnidn %2, ps - mova m7, [pw_pixel_max] - %ifidn %2, pp - mova m6, [INTERP_OFFSET_PP] - %else - mova m6, [INTERP_OFFSET_SP] - %endif - %else - mova m6, [INTERP_OFFSET_PS] - %endif -%endif - -.loopH: - PROCESS_CHROMA_SP_W4_4R - -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 -%elifidn %2, ps - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 -%else - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - %endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, m7 -%endif - - movh [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movh [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - PROCESS_CHROMA_SP_W2_4R r6 - -%ifidn %2, ss - psrad m0, 6 - psrad m2, 6 - packssdw m0, m2 -%elifidn %2, ps - paddd m0, m6 - paddd m2, m6 - psrad m0, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - packssdw m0, m2 -%else - paddd m0, m6 - paddd m2, m6 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - %endif - packusdw m0, m2 - CLIPW m0, m5, m7 -%endif - - movd 
[r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 2 * 4] - - dec r4d - jnz .loopH - RET -%endmacro - -FILTER_VER_CHROMA_W6 8, ss, 6 -FILTER_VER_CHROMA_W6 8, ps, 7 -FILTER_VER_CHROMA_W6 8, sp, 8 -FILTER_VER_CHROMA_W6 8, pp, 8 - -FILTER_VER_CHROMA_W6 16, ss, 6 -FILTER_VER_CHROMA_W6 16, ps, 7 -FILTER_VER_CHROMA_W6 16, sp, 8 -FILTER_VER_CHROMA_W6 16, pp, 8 - -%macro PROCESS_CHROMA_SP_W8_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * 32] ;m0 = [0l+1l] Row1l - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * 32] ;m1 = [0h+1h] Row1h - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * 32] ;m2 = [1l+2l] Row2l - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * 32] ;m3 = [1h+2h] Row2h - - lea r0, [r0 + 2 * r1] - movu m5, [r0 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * 32] ;m6 = [2l+3l] Row1l - paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * 32] ;m6 = [2h+3h] Row1h - paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum - - movu m4, [r0 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * 32] ;m6 = [3l+4l] Row2l - paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * 32] ;m1 = [3h+4h] Row2h - paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum -%endmacro - -;---------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;---------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W8 4 -INIT_XMM sse2 -cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4 - - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - - mov r4d, %2/2 - 
-%ifidn %3, pp - mova m7, [INTERP_OFFSET_PP] -%elifidn %3, sp - mova m7, [INTERP_OFFSET_SP] -%elifidn %3, ps - mova m7, [INTERP_OFFSET_PS] -%endif - -.loopH: - PROCESS_CHROMA_SP_W8_2R - -%ifidn %3, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 -%elifidn %3, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 -%else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - %ifidn %3, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - %endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - mova m6, [pw_pixel_max] - CLIPW2 m0, m2, m5, m6 -%endif - - movu [r2], m0 - movu [r2 + r3], m2 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 RET %endmacro -FILTER_VER_CHROMA_W8 8, 2, ss, 7 -FILTER_VER_CHROMA_W8 8, 4, ss, 7 -FILTER_VER_CHROMA_W8 8, 6, ss, 7 -FILTER_VER_CHROMA_W8 8, 8, ss, 7 -FILTER_VER_CHROMA_W8 8, 16, ss, 7 -FILTER_VER_CHROMA_W8 8, 32, ss, 7 - -FILTER_VER_CHROMA_W8 8, 2, sp, 8 -FILTER_VER_CHROMA_W8 8, 4, sp, 8 -FILTER_VER_CHROMA_W8 8, 6, sp, 8 -FILTER_VER_CHROMA_W8 8, 8, sp, 8 -FILTER_VER_CHROMA_W8 8, 16, sp, 8 -FILTER_VER_CHROMA_W8 8, 32, sp, 8 - -FILTER_VER_CHROMA_W8 8, 2, ps, 8 -FILTER_VER_CHROMA_W8 8, 4, ps, 8 -FILTER_VER_CHROMA_W8 8, 6, ps, 8 -FILTER_VER_CHROMA_W8 8, 8, ps, 8 -FILTER_VER_CHROMA_W8 8, 16, ps, 8 -FILTER_VER_CHROMA_W8 8, 32, ps, 8 - -FILTER_VER_CHROMA_W8 8, 2, pp, 8 -FILTER_VER_CHROMA_W8 8, 4, pp, 8 -FILTER_VER_CHROMA_W8 8, 6, pp, 8 -FILTER_VER_CHROMA_W8 8, 8, pp, 8 -FILTER_VER_CHROMA_W8 8, 16, pp, 8 
-FILTER_VER_CHROMA_W8 8, 32, pp, 8 - -FILTER_VER_CHROMA_W8 8, 12, ss, 7 -FILTER_VER_CHROMA_W8 8, 64, ss, 7 -FILTER_VER_CHROMA_W8 8, 12, sp, 8 -FILTER_VER_CHROMA_W8 8, 64, sp, 8 -FILTER_VER_CHROMA_W8 8, 12, ps, 8 -FILTER_VER_CHROMA_W8 8, 64, ps, 8 -FILTER_VER_CHROMA_W8 8, 12, pp, 8 -FILTER_VER_CHROMA_W8 8, 64, pp, 8 - -%macro PROCESS_CHROMA_VERT_W16_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * 32] - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * 32] - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * 32] - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * 32] - - lea r0, [r0 + 2 * r1] - movu m5, [r0 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * 32] - paddd m0, m6 - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * 32] - paddd m1, m4 - - movu m4, [r0 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * 32] - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * 32] - paddd m3, m5 -%endmacro +FILTER_VER_LUMA_AVX2_4x4 pp +FILTER_VER_LUMA_AVX2_4x4 ps +FILTER_VER_LUMA_AVX2_4x4 sp +FILTER_VER_LUMA_AVX2_4x4 ss -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_AVX2_6xN 2 +%macro FILTER_VER_LUMA_AVX2_8x8 1 INIT_YMM avx2 -%if ARCH_X86_64 -cglobal interp_4tap_vert_%2_6x%1, 4, 7, 10 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 mov r4d, r4m add r1d, r1d add r3d, r3d - shl r4d, 6 + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffV] + lea r5, [tab_LumaCoeffVer] add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [tab_LumaCoeffVer + r4] %endif - sub r0, r1 - mov r6d, %1/4 + lea r4, [r1 * 3] + sub r0, r4 -%ifidn %2,pp - vbroadcasti128 m8, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 
m8, [INTERP_OFFSET_SP] +%ifidn %1,pp + vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + vbroadcasti128 m11, [INTERP_OFFSET_SP] %else - vbroadcasti128 m8, [INTERP_OFFSET_PS] + vbroadcasti128 m11, [INTERP_OFFSET_PS] %endif -.loopH: - movu xm0, [r0] - movu xm1, [r0 + r1] + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] - - movu xm2, [r0 + r1 * 2] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] - - lea r4, [r1 * 3] - movu xm3, [r0 + r4] + movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm4, [r0] + movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] pmaddwd m3, [r5] paddd m1, m5 - - movu xm5, [r0 + r1] + movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] - pmaddwd m4, [r5] paddd m2, m6 - - movu xm6, [r0 + r1 * 2] + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * 
mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + pmaddwd m7, [r5] + paddd m5, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + pmaddwd m8, [r5 + 1 * mmsize] + paddd m4, m10 + paddd m6, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + pmaddwd m9, [r5 + 1 * mmsize] + paddd m5, m8 + paddd m7, m9 + movu xm8, [r0 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + pmaddwd m10, [r5 + 2 * mmsize] + paddd m4, m9 + paddd m6, m10 + lea r4, [r3 * 3] -%ifidn %2,ss +%ifidn %1,ss psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 %else - paddd m0, m8 - paddd m1, m8 - paddd m2, m8 - paddd m3, m8 -%ifidn %2,pp + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 +%ifidn %1,pp psrad m0, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP psrad m2, INTERP_SHIFT_PP psrad m3, INTERP_SHIFT_PP -%elifidn %2, sp +%elifidn %1, sp psrad m0, INTERP_SHIFT_SP psrad m1, INTERP_SHIFT_SP psrad m2, INTERP_SHIFT_SP @@ -1267,1108 +724,633 @@ cglobal interp_4tap_vert_%2_6x%1, 4, 7, 10 packssdw m0, m1 packssdw m2, m3 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - pxor m5, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + pxor m10, m10 mova m9, [pw_pixel_max] -%ifidn %2,pp - CLIPW m0, m5, m9 - CLIPW m2, m5, m9 -%elifidn %2, sp - CLIPW m0, m5, m9 - CLIPW m2, m5, m9 +%ifidn %1,pp + CLIPW m0, m10, m9 + CLIPW m2, m10, m9 +%elifidn %1, sp + CLIPW m0, m10, m9 + CLIPW m2, m10, m9 %endif vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - movq [r2], xm0 - pextrd [r2 + 8], xm0, 2 - movq [r2 + r3], xm1 - pextrd [r2 + r3 + 8], xm1, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 
- movq [r2 + r4], xm3 - pextrd [r2 + r4 + 8], xm3, 2 - - lea r2, [r2 + r3 * 4] - dec r6d - jnz .loopH - RET -%endif -%endmacro -FILTER_VER_CHROMA_AVX2_6xN 8, pp -FILTER_VER_CHROMA_AVX2_6xN 8, ps -FILTER_VER_CHROMA_AVX2_6xN 8, ss -FILTER_VER_CHROMA_AVX2_6xN 8, sp -FILTER_VER_CHROMA_AVX2_6xN 16, pp -FILTER_VER_CHROMA_AVX2_6xN 16, ps -FILTER_VER_CHROMA_AVX2_6xN 16, ss -FILTER_VER_CHROMA_AVX2_6xN 16, sp + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_16xN_avx2 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%2_16x%1, 5, 6, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 12 + punpckhwd xm3, xm8, xm2 + punpcklwd xm8, xm2 + vinserti128 m8, m8, xm3, 1 + pmaddwd m3, m8, [r5 + 3 * mmsize] + pmaddwd m8, [r5 + 2 * mmsize] + paddd m5, m3 + paddd m7, m8 + movu xm3, [r0 + r1] ; m3 = row 13 + punpckhwd xm0, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m6, m2 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm3, xm0 + punpcklwd xm3, xm0 + vinserti128 m3, m3, xm1, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m7, m3 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] +%ifidn %1,ss + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%else + paddd m4, m11 + paddd m5, m11 + paddd m6, m11 + paddd m7, m11 +%ifidn %1,pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP + psrad m6, INTERP_SHIFT_PP + psrad m7, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m4, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP + psrad m6, INTERP_SHIFT_SP + psrad m7, INTERP_SHIFT_SP 
%else - lea r5, [tab_ChromaCoeffV + r4] + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS + psrad m6, INTERP_SHIFT_PS + psrad m7, INTERP_SHIFT_PS +%endif %endif - mov r4d, %1/2 - -%ifidn %2, pp - vbroadcasti128 m7, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %2, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] + packssdw m4, m5 + packssdw m6, m7 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m4, m10, m9 + CLIPW m6, m10, m9 +%elifidn %1, sp + CLIPW m4, m10, m9 + CLIPW m6, m10, m9 +%endif + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 + RET %endif +%endmacro -.loopH: - PROCESS_CHROMA_VERT_W16_2R -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 +FILTER_VER_LUMA_AVX2_8x8 pp +FILTER_VER_LUMA_AVX2_8x8 ps +FILTER_VER_LUMA_AVX2_8x8 sp +FILTER_VER_LUMA_AVX2_8x8 ss - packssdw m0, m1 - packssdw m2, m3 -%elifidn %2, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS +%macro PROCESS_LUMA_AVX2_W8_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + 
vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, 
xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] - packssdw m0, m1 - packssdw m2, m3 +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 %else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 +%ifidn %1,pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + psrad m4, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS %endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] %endif - movu [r2], m0 - movu [r2 + r3], m2 - lea r2, [r2 + 2 * r3] - dec r4d - jnz .loopH - RET -%endmacro - FILTER_VER_CHROMA_W16_16xN_avx2 4, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 8, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 12, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 24, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 16, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 32, pp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 64, pp, 8 - - FILTER_VER_CHROMA_W16_16xN_avx2 4, ps, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 8, ps, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 12, ps, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 24, ps, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 16, ps, 8 - 
FILTER_VER_CHROMA_W16_16xN_avx2 32, ps, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 64, ps, 8 - - FILTER_VER_CHROMA_W16_16xN_avx2 4, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 8, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 12, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 24, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 16, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 32, ss, 7 - FILTER_VER_CHROMA_W16_16xN_avx2 64, ss, 7 - - FILTER_VER_CHROMA_W16_16xN_avx2 4, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 8, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 12, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 24, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 16, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 32, sp, 8 - FILTER_VER_CHROMA_W16_16xN_avx2 64, sp, 8 - -%macro PROCESS_CHROMA_VERT_W32_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * mmsize] - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * mmsize] - - movu m9, [r0 + mmsize] - movu m11, [r0 + r1 + mmsize] - punpcklwd m8, m9, m11 - pmaddwd m8, [r5 + 0 * mmsize] - punpckhwd m9, m11 - pmaddwd m9, [r5 + 0 * mmsize] - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * mmsize] - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * mmsize] - - movu m12, [r0 + 2 * r1 + mmsize] - punpcklwd m10, m11, m12 - pmaddwd m10, [r5 + 0 * mmsize] - punpckhwd m11, m12 - pmaddwd m11, [r5 + 0 * mmsize] - - lea r6, [r0 + 2 * r1] - movu m5, [r6 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m0, m6 - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m1, m4 - - movu m13, [r6 + r1 + mmsize] - punpcklwd m14, m12, m13 - pmaddwd m14, [r5 + 1 * mmsize] - paddd m8, m14 - punpckhwd m12, m13 - pmaddwd m12, [r5 + 1 * mmsize] - paddd m9, m12 - - movu m4, [r6 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - - movu m12, [r6 + 2 * r1 + mmsize] - punpcklwd m14, m13, m12 - pmaddwd m14, [r5 + 1 * mmsize] - paddd m10, m14 - punpckhwd m13, m12 - 
pmaddwd m13, [r5 + 1 * mmsize] - paddd m11, m13 -%endmacro - -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_32xN_avx2 3 -INIT_YMM avx2 -%if ARCH_X86_64 -cglobal interp_4tap_vert_%2_32x%1, 5, 7, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - mov r4d, %1/2 - -%ifidn %2, pp - vbroadcasti128 m7, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %2, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + pxor m5, m5 + mova m3, [pw_pixel_max] +%ifidn %1,pp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%elifidn %1, sp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 %endif -.loopH: - PROCESS_CHROMA_VERT_W32_2R -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm1 + lea r8, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 -%elifidn %2, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - paddd m8, m7 - paddd m9, m7 - paddd m10, m7 - paddd m11, m7 - psrad m8, INTERP_SHIFT_PS - psrad m9, INTERP_SHIFT_PS - psrad m10, 
INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 %else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - paddd m8, m7 - paddd m9, m7 - paddd m10, m7 - paddd m11, m7 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - psrad m8, INTERP_SHIFT_PP - psrad m9, INTERP_SHIFT_PP - psrad m10, INTERP_SHIFT_PP - psrad m11, INTERP_SHIFT_PP + paddd m6, m14 + paddd m7, m14 +%ifidn %1,pp + psrad m6, INTERP_SHIFT_PP + psrad m7, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m6, INTERP_SHIFT_SP + psrad m7, INTERP_SHIFT_SP %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP + psrad m6, INTERP_SHIFT_PS + psrad m7, INTERP_SHIFT_PS %endif - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] - CLIPW2 m8, m10, m5, [pw_pixel_max] %endif - movu [r2], m0 - movu [r2 + r3], m2 - movu [r2 + mmsize], m8 - movu [r2 + r3 + mmsize], m10 - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loopH - RET + packssdw m6, m7 + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW 
m6, m5, m3 +%elifidn %1, sp + CLIPW m6, m5, m3 %endif -%endmacro - FILTER_VER_CHROMA_W16_32xN_avx2 8, pp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 16, pp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 24, pp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 32, pp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 48, pp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 64, pp, 15 - - FILTER_VER_CHROMA_W16_32xN_avx2 8, ps, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 16, ps, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 24, ps, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 32, ps, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 48, ps, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 64, ps, 15 - - FILTER_VER_CHROMA_W16_32xN_avx2 8, ss, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 16, ss, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 24, ss, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 32, ss, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 48, ss, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 64, ss, 15 - - FILTER_VER_CHROMA_W16_32xN_avx2 8, sp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 16, sp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 24, sp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 32, sp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 48, sp, 15 - FILTER_VER_CHROMA_W16_32xN_avx2 64, sp, 15 + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_64xN_avx2 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%2_64x%1, 5, 7, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, 
[r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm6, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm6, 1 + pmaddwd m6, m1, [r5 + 3 * mmsize] + paddd m9, m6 + pmaddwd m6, m1, [r5 + 2 * mmsize] + paddd m11, m6 + pmaddwd m6, m1, [r5 + 1 * mmsize] + paddd m13, m6 + pmaddwd m1, [r5] + movu xm6, [r7 + r1] ; m6 = row 17 + punpckhwd xm4, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m6, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + movu xm2, [r7 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhwd xm6, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm6, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 +%ifidn %1,pp + psrad m8, INTERP_SHIFT_PP + psrad m9, 
INTERP_SHIFT_PP + psrad m10, INTERP_SHIFT_PP + psrad m11, INTERP_SHIFT_PP + psrad m12, INTERP_SHIFT_PP + psrad m13, INTERP_SHIFT_PP + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP + psrad m12, INTERP_SHIFT_SP + psrad m13, INTERP_SHIFT_SP + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP %else - lea r5, [tab_ChromaCoeffV + r4] + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS + psrad m12, INTERP_SHIFT_PS + psrad m13, INTERP_SHIFT_PS + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS %endif - mov r4d, %1/2 - -%ifidn %2, pp - vbroadcasti128 m7, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %2, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] %endif -.loopH: -%assign x 0 -%rep 4 - movu m1, [r0 + x] - movu m3, [r0 + r1 + x] - movu m5, [r5 + 0 * mmsize] - punpcklwd m0, m1, m3 - pmaddwd m0, m5 - punpckhwd m1, m3 - pmaddwd m1, m5 - - movu m4, [r0 + 2 * r1 + x] - punpcklwd m2, m3, m4 - pmaddwd m2, m5 - punpckhwd m3, m4 - pmaddwd m3, m5 - - lea r6, [r0 + 2 * r1] - movu m5, [r6 + r1 + x] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m0, m6 - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m1, m4 - - movu m4, [r6 + 2 * r1 + x] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 -%elifidn %2, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 -%else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 -%ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad 
m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP -%else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP -%endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b +%ifidn %1,pp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%elifidn %1, sp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 %endif - - movu [r2 + x], m0 - movu [r2 + r3 + x], m2 -%assign x x+mmsize -%endrep - - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loopH - RET + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 %endmacro - FILTER_VER_CHROMA_W16_64xN_avx2 16, ss, 7 - FILTER_VER_CHROMA_W16_64xN_avx2 32, ss, 7 - FILTER_VER_CHROMA_W16_64xN_avx2 48, ss, 7 - FILTER_VER_CHROMA_W16_64xN_avx2 64, ss, 7 - FILTER_VER_CHROMA_W16_64xN_avx2 16, sp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 32, sp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 48, sp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 64, sp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 16, ps, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 32, ps, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 48, ps, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 64, ps, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 16, pp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 32, pp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 48, pp, 8 - FILTER_VER_CHROMA_W16_64xN_avx2 64, pp, 8 -;----------------------------------------------------------------------------------------------------------------- -; void 
interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_12xN_avx2 3 +%macro FILTER_VER_LUMA_AVX2_Nx16 2 INIT_YMM avx2 -cglobal interp_4tap_vert_%2_12x%1, 5, 8, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [tab_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] -%endif - mov r4d, %1/2 - -%ifidn %2, pp - vbroadcasti128 m7, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %2, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] + lea r5, [tab_LumaCoeffVer + r4] %endif -.loopH: - PROCESS_CHROMA_VERT_W16_2R -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 -%elifidn %2, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 -%else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + vbroadcasti128 m14, [INTERP_OFFSET_SP] %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP -%endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] + vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif - - movu [r2], xm0 - movu [r2 + r3], xm2 - vextracti128 xm0, m0, 1 - vextracti128 xm2, m2, 1 - movq [r2 + 16], xm0 - movq 
[r2 + r3 + 16], xm2 - lea r2, [r2 + 2 * r3] - dec r4d - jnz .loopH + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW RET -%endmacro - FILTER_VER_CHROMA_W16_12xN_avx2 16, ss, 7 - FILTER_VER_CHROMA_W16_12xN_avx2 16, sp, 8 - FILTER_VER_CHROMA_W16_12xN_avx2 16, ps, 8 - FILTER_VER_CHROMA_W16_12xN_avx2 16, pp, 8 - FILTER_VER_CHROMA_W16_12xN_avx2 32, ss, 7 - FILTER_VER_CHROMA_W16_12xN_avx2 32, sp, 8 - FILTER_VER_CHROMA_W16_12xN_avx2 32, ps, 8 - FILTER_VER_CHROMA_W16_12xN_avx2 32, pp, 8 - -%macro PROCESS_CHROMA_VERT_W24_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * mmsize] - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * mmsize] - - movu xm9, [r0 + mmsize] - movu xm11, [r0 + r1 + mmsize] - punpcklwd xm8, xm9, xm11 - pmaddwd xm8, [r5 + 0 * mmsize] - punpckhwd xm9, xm11 - pmaddwd xm9, [r5 + 0 * mmsize] - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * mmsize] - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * mmsize] - - movu xm12, [r0 + 2 * r1 + mmsize] - punpcklwd xm10, xm11, xm12 - pmaddwd xm10, [r5 + 0 * mmsize] - punpckhwd xm11, xm12 - pmaddwd xm11, [r5 + 0 * mmsize] - - lea r6, [r0 + 2 * r1] - movu m5, [r6 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m0, m6 - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m1, m4 - - movu xm13, [r6 + r1 + mmsize] - punpcklwd xm14, xm12, xm13 - pmaddwd xm14, [r5 + 1 * mmsize] - paddd xm8, xm14 - punpckhwd xm12, xm13 - pmaddwd xm12, [r5 + 1 * mmsize] - paddd xm9, xm12 - - movu m4, [r6 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - - movu xm12, [r6 + 2 * r1 + mmsize] - punpcklwd xm14, xm13, xm12 - pmaddwd xm14, [r5 + 1 * mmsize] - paddd xm10, xm14 - punpckhwd xm13, xm12 - pmaddwd xm13, [r5 + 1 * mmsize] - paddd xm11, xm13 +%endif %endmacro 
-;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_24xN_avx2 3 +FILTER_VER_LUMA_AVX2_Nx16 pp, 16 +FILTER_VER_LUMA_AVX2_Nx16 pp, 32 +FILTER_VER_LUMA_AVX2_Nx16 pp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ps, 16 +FILTER_VER_LUMA_AVX2_Nx16 ps, 32 +FILTER_VER_LUMA_AVX2_Nx16 ps, 64 +FILTER_VER_LUMA_AVX2_Nx16 sp, 16 +FILTER_VER_LUMA_AVX2_Nx16 sp, 32 +FILTER_VER_LUMA_AVX2_Nx16 sp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ss, 16 +FILTER_VER_LUMA_AVX2_Nx16 ss, 32 +FILTER_VER_LUMA_AVX2_Nx16 ss, 64 + +%macro FILTER_VER_LUMA_AVX2_NxN 3 INIT_YMM avx2 -%if ARCH_X86_64 -cglobal interp_4tap_vert_%2_24x%1, 5, 7, %3 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [tab_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] -%endif - mov r4d, %1/2 - -%ifidn %2, pp - vbroadcasti128 m7, [INTERP_OFFSET_PP] -%elifidn %2, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %2, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] + lea r5, [tab_LumaCoeffVer + r4] %endif -.loopH: - PROCESS_CHROMA_VERT_W24_2R -%ifidn %2, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 -%elifidn %2, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - paddd m8, m7 - paddd m9, m7 - paddd m10, m7 - paddd m11, m7 - psrad m8, INTERP_SHIFT_PS - psrad m9, 
INTERP_SHIFT_PS - psrad m10, INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS + lea r4, [r1 * 3] + sub r0, r4 - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 -%else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - paddd m8, m7 - paddd m9, m7 - paddd m10, m7 - paddd m11, m7 - %ifidn %2, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - psrad m8, INTERP_SHIFT_PP - psrad m9, INTERP_SHIFT_PP - psrad m10, INTERP_SHIFT_PP - psrad m11, INTERP_SHIFT_PP +%ifidn %3,pp + vbroadcasti128 m14, [pd_32] +%elifidn %3, sp + vbroadcasti128 m14, [INTERP_OFFSET_SP] %else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP -%endif - packssdw m0, m1 - packssdw m2, m3 - packssdw m8, m9 - packssdw m10, m11 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] - CLIPW2 m8, m10, m5, [pw_pixel_max] + vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif - movu [r2], m0 - movu [r2 + r3], m2 - movu [r2 + mmsize], xm8 - movu [r2 + r3 + mmsize], xm10 - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loopH + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %3 + add r2, 16 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 2 * %1 + 16] + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] + dec r9d + jnz .loopH RET %endif %endmacro - FILTER_VER_CHROMA_W16_24xN_avx2 32, ss, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 32, sp, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 32, ps, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 32, pp, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 64, ss, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 64, sp, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 64, ps, 15 - FILTER_VER_CHROMA_W16_24xN_avx2 64, pp, 15 - 
-;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_W16_48x64_avx2 2 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_48x64, 5, 7, %2 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 6 -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] -%else - lea r5, [tab_ChromaCoeffV + r4] -%endif - mov r4d, 32 +FILTER_VER_LUMA_AVX2_NxN 16, 32, pp +FILTER_VER_LUMA_AVX2_NxN 16, 64, pp +FILTER_VER_LUMA_AVX2_NxN 24, 32, pp +FILTER_VER_LUMA_AVX2_NxN 32, 32, pp +FILTER_VER_LUMA_AVX2_NxN 32, 64, pp +FILTER_VER_LUMA_AVX2_NxN 48, 64, pp +FILTER_VER_LUMA_AVX2_NxN 64, 32, pp +FILTER_VER_LUMA_AVX2_NxN 64, 48, pp +FILTER_VER_LUMA_AVX2_NxN 64, 64, pp +FILTER_VER_LUMA_AVX2_NxN 16, 32, ps +FILTER_VER_LUMA_AVX2_NxN 16, 64, ps +FILTER_VER_LUMA_AVX2_NxN 24, 32, ps +FILTER_VER_LUMA_AVX2_NxN 32, 32, ps +FILTER_VER_LUMA_AVX2_NxN 32, 64, ps +FILTER_VER_LUMA_AVX2_NxN 48, 64, ps +FILTER_VER_LUMA_AVX2_NxN 64, 32, ps +FILTER_VER_LUMA_AVX2_NxN 64, 48, ps +FILTER_VER_LUMA_AVX2_NxN 64, 64, ps +FILTER_VER_LUMA_AVX2_NxN 16, 32, sp +FILTER_VER_LUMA_AVX2_NxN 16, 64, sp +FILTER_VER_LUMA_AVX2_NxN 24, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 64, sp +FILTER_VER_LUMA_AVX2_NxN 48, 64, sp +FILTER_VER_LUMA_AVX2_NxN 64, 32, sp +FILTER_VER_LUMA_AVX2_NxN 64, 48, sp +FILTER_VER_LUMA_AVX2_NxN 64, 64, sp +FILTER_VER_LUMA_AVX2_NxN 16, 32, ss +FILTER_VER_LUMA_AVX2_NxN 16, 64, ss +FILTER_VER_LUMA_AVX2_NxN 24, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 64, ss +FILTER_VER_LUMA_AVX2_NxN 48, 64, ss +FILTER_VER_LUMA_AVX2_NxN 64, 32, ss +FILTER_VER_LUMA_AVX2_NxN 64, 48, ss +FILTER_VER_LUMA_AVX2_NxN 64, 64, ss -%ifidn %1, pp - vbroadcasti128 m7, 
[INTERP_OFFSET_PP] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%elifidn %1, ps - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - -.loopH: -%assign x 0 -%rep 3 - movu m1, [r0 + x] - movu m3, [r0 + r1 + x] - movu m5, [r5 + 0 * mmsize] - punpcklwd m0, m1, m3 - pmaddwd m0, m5 - punpckhwd m1, m3 - pmaddwd m1, m5 - - movu m4, [r0 + 2 * r1 + x] - punpcklwd m2, m3, m4 - pmaddwd m2, m5 - punpckhwd m3, m4 - pmaddwd m3, m5 - - lea r6, [r0 + 2 * r1] - movu m5, [r6 + r1 + x] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m0, m6 - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m1, m4 - - movu m4, [r6 + 2 * r1 + x] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m2, m6 - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - -%ifidn %1, ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 -%elifidn %1, ps - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 -%else - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 -%ifidn %1, pp - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP -%else - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP -%endif - packssdw m0, m1 - packssdw m2, m3 - pxor m5, m5 - CLIPW2 m0, m2, m5, [pw_pixel_max] -%endif - - movu [r2 + x], m0 - movu [r2 + r3 + x], m2 -%assign x x+mmsize -%endrep - - lea r2, [r2 + 2 * r3] - lea r0, [r0 + 2 * r1] - dec r4d - jnz .loopH - RET -%endmacro - - FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8 - FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8 - FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7 - FILTER_VER_CHROMA_W16_48x64_avx2 sp, 8 - -INIT_XMM sse2 -cglobal chroma_p2s, 3, 7, 3 - ; load width and height - mov r3d, r3m - mov r4d, r4m - add r1, r1 - - ; load 
constant - mova m2, [tab_c_n8192] - -.loopH: - - xor r5d, r5d -.loopW: - lea r6, [r0 + r5 * 2] - - movu m0, [r6] - psllw m0, (14 - BIT_DEPTH) - paddw m0, m2 - - movu m1, [r6 + r1] - psllw m1, (14 - BIT_DEPTH) - paddw m1, m2 - - add r5d, 8 - cmp r5d, r3d - lea r6, [r2 + r5 * 2] - jg .width4 - movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 - je .nextH - jmp .loopW - -.width4: - test r3d, 4 - jz .width2 - test r3d, 2 - movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 - lea r6, [r6 + 8] - pshufd m0, m0, 2 - pshufd m1, m1, 2 - jz .nextH - -.width2: - movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 - -.nextH: - lea r0, [r0 + r1 * 2] - add r2, FENC_STRIDE / 2 * 4 - - sub r4d, 2 - jnz .loopH - RET - -%macro PROCESS_LUMA_VER_W4_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end - - lea r0, 
[r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end - - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_PP 2 -INIT_XMM sse4 -cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize - - add r1d, r1d - add r3d, r3d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mova m7, [INTERP_OFFSET_PP] - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_VER_W4_4R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP - - packssdw m0, m1 - packssdw m2, m3 - - pxor m1, m1 - CLIPW2 m0, m2, m1, [pw_pixel_max] - - movh [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movh [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] - - dec dword [rsp] - jnz .loopH - RET -%endmacro - -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t 
dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_PP 4, 4 - FILTER_VER_LUMA_PP 8, 8 - FILTER_VER_LUMA_PP 8, 4 - FILTER_VER_LUMA_PP 4, 8 - FILTER_VER_LUMA_PP 16, 16 - FILTER_VER_LUMA_PP 16, 8 - FILTER_VER_LUMA_PP 8, 16 - FILTER_VER_LUMA_PP 16, 12 - FILTER_VER_LUMA_PP 12, 16 - FILTER_VER_LUMA_PP 16, 4 - FILTER_VER_LUMA_PP 4, 16 - FILTER_VER_LUMA_PP 32, 32 - FILTER_VER_LUMA_PP 32, 16 - FILTER_VER_LUMA_PP 16, 32 - FILTER_VER_LUMA_PP 32, 24 - FILTER_VER_LUMA_PP 24, 32 - FILTER_VER_LUMA_PP 32, 8 - FILTER_VER_LUMA_PP 8, 32 - FILTER_VER_LUMA_PP 64, 64 - FILTER_VER_LUMA_PP 64, 32 - FILTER_VER_LUMA_PP 32, 64 - FILTER_VER_LUMA_PP 64, 48 - FILTER_VER_LUMA_PP 48, 64 - FILTER_VER_LUMA_PP 64, 16 - FILTER_VER_LUMA_PP 16, 64 - -%macro FILTER_VER_LUMA_AVX2_4x4 1 +%macro FILTER_VER_LUMA_AVX2_8xN 2 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 mov r4d, r4m + shl r4d, 7 add r1d, r1d add r3d, r3d - shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer] @@ -2379,151 +1361,43 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 lea r4, [r1 * 3] sub r0, r4 - %ifidn %1,pp - vbroadcasti128 m6, [pd_32] + vbroadcasti128 m14, [pd_32] %elifidn %1, sp - vbroadcasti128 m6, [INTERP_OFFSET_SP] + vbroadcasti128 m14, [INTERP_OFFSET_SP] %else - vbroadcasti128 m6, [INTERP_OFFSET_PS] + vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif - - movq xm0, [r0] - movq xm1, [r0 + r1] + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 +.loopH: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, 
m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - pmaddwd m4, [r5 + 1 * mmsize] - paddd m0, m5 - paddd m2, m4 - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 3 * mmsize] - pmaddwd m1, [r5 + 2 * mmsize] - paddd m0, m5 - paddd m2, m1 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + 2 * r1] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] - pmaddwd m4, [r5 + 3 * mmsize] - paddd m2, m4 - -%ifidn %1,ss - psrad m0, 6 - psrad m2, 6 -%else - paddd m0, m6 - paddd m2, m6 -%ifidn %1,pp - psrad m0, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m0, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP -%else - psrad m0, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS -%endif -%endif - - packssdw m0, m2 - pxor m1, m1 -%ifidn %1,pp - CLIPW m0, m1, [pw_pixel_max] -%elifidn %1, sp - CLIPW m0, m1, [pw_pixel_max] -%endif - - vextracti128 xm2, m0, 1 - lea r4, [r3 * 3] - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 - RET -%endmacro - -FILTER_VER_LUMA_AVX2_4x4 pp -FILTER_VER_LUMA_AVX2_4x4 ps -FILTER_VER_LUMA_AVX2_4x4 sp -FILTER_VER_LUMA_AVX2_4x4 ss - -%macro FILTER_VER_LUMA_AVX2_8x8 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 - mov r4d, r4m - add r1d, r1d - add r3d, r3d - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 - -%ifidn %1,pp - vbroadcasti128 m11, [pd_32] -%elifidn %1, sp - vbroadcasti128 m11, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m11, [INTERP_OFFSET_PS] -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 
- punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] paddd m0, m4 + pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] paddd m1, m5 + pmaddwd m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 @@ -2540,8 +1414,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] paddd m3, m7 + pmaddwd m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 @@ -2551,8 +1425,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] paddd m4, m8 + pmaddwd m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhwd xm9, xm7, xm8 @@ -2563,8 +1437,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] - pmaddwd m7, [r5] paddd m5, m9 + pmaddwd m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 @@ -2572,156 +1446,336 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] - pmaddwd m8, [r5 + 1 * mmsize] paddd m4, m10 - paddd m6, m8 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] movu xm10, [r0 + r1 * 2] ; 
m10 = row 10 - punpckhwd xm8, xm9, xm10 + punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 - vinserti128 m9, m9, xm8, 1 - pmaddwd m8, m9, [r5 + 3 * mmsize] - paddd m3, m8 - pmaddwd m8, m9, [r5 + 2 * mmsize] - pmaddwd m9, [r5 + 1 * mmsize] - paddd m5, m8 - paddd m7, m9 - movu xm8, [r0 + r4] ; m8 = row 11 - punpckhwd xm9, xm10, xm8 - punpcklwd xm10, xm8 - vinserti128 m10, m10, xm9, 1 - pmaddwd m9, m10, [r5 + 3 * mmsize] - pmaddwd m10, [r5 + 2 * mmsize] - paddd m4, m9 - paddd m6, m10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] - lea r4, [r3 * 3] %ifidn %1,ss psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 + psrad m4, 6 + psrad m5, 6 %else - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 %ifidn %1,pp psrad m0, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP psrad m2, INTERP_SHIFT_PP psrad m3, INTERP_SHIFT_PP + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP %elifidn %1, sp psrad m0, INTERP_SHIFT_SP psrad m1, INTERP_SHIFT_SP psrad m2, INTERP_SHIFT_SP psrad m3, INTERP_SHIFT_SP + psrad m4, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP %else psrad m0, INTERP_SHIFT_PS psrad m1, INTERP_SHIFT_PS 
psrad m2, INTERP_SHIFT_PS psrad m3, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS %endif %endif packssdw m0, m1 packssdw m2, m3 + packssdw m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - pxor m10, m10 - mova m9, [pw_pixel_max] + vpermq m4, m4, 11011000b + pxor m5, m5 + mova m3, [pw_pixel_max] %ifidn %1,pp - CLIPW m0, m10, m9 - CLIPW m2, m10, m9 + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 %elifidn %1, sp - CLIPW m0, m10, m9 - CLIPW m2, m10, m9 + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 %endif vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm1 - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 12 - punpckhwd xm3, xm8, xm2 - punpcklwd xm8, xm2 - vinserti128 m8, m8, xm3, 1 - pmaddwd m3, m8, [r5 + 3 * mmsize] - pmaddwd m8, [r5 + 2 * mmsize] - paddd m5, m3 - paddd m7, m8 - movu xm3, [r0 + r1] ; m3 = row 13 - punpckhwd xm0, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm0, 1 - pmaddwd m2, [r5 + 3 * mmsize] - paddd m6, m2 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm3, xm0 - punpcklwd xm3, xm0 - vinserti128 m3, m3, xm1, 1 - pmaddwd m3, [r5 + 3 * mmsize] - paddd m7, m3 - -%ifidn %1,ss - psrad m4, 6 - psrad m5, 6 - psrad m6, 6 - psrad m7, 6 -%else - paddd m4, m11 - paddd m5, m11 - paddd m6, m11 - paddd m7, m11 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, 
m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else + paddd m6, m14 + paddd m7, m14 %ifidn %1,pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP psrad m6, INTERP_SHIFT_PP psrad m7, INTERP_SHIFT_PP %elifidn %1, sp - psrad m4, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP psrad m6, INTERP_SHIFT_SP psrad m7, INTERP_SHIFT_SP %else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS psrad m6, INTERP_SHIFT_PS psrad m7, INTERP_SHIFT_PS %endif %endif - packssdw m4, m5 packssdw m6, m7 - vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b %ifidn %1,pp - CLIPW m4, m10, m9 - CLIPW m6, m10, m9 + CLIPW m6, m5, m3 %elifidn %1, sp - CLIPW m4, m10, m9 - CLIPW m6, m10, m9 + CLIPW m6, m5, m3 %endif - vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm7 + movu [r2 + r6], xm7 + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhwd xm6, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm6, 1 + pmaddwd m6, m1, [r5 + 3 * mmsize] + paddd m9, m6 + pmaddwd m6, m1, [r5 + 2 * mmsize] + paddd m11, m6 + pmaddwd m6, m1, [r5 + 1 * mmsize] + paddd m13, m6 + pmaddwd m1, [r5] + movu xm6, [r0 + r1] ; m6 = row 17 + punpckhwd xm4, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m6, [r5 
+ 2 * mmsize] + paddd m13, m2 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + movu xm2, [r0 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhwd xm6, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm6, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 + +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 +%ifidn %1,pp + psrad m8, INTERP_SHIFT_PP + psrad m9, INTERP_SHIFT_PP + psrad m10, INTERP_SHIFT_PP + psrad m11, INTERP_SHIFT_PP + psrad m12, INTERP_SHIFT_PP + psrad m13, INTERP_SHIFT_PP + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP + psrad m12, INTERP_SHIFT_SP + psrad m13, INTERP_SHIFT_SP + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP +%else + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS + psrad m12, INTERP_SHIFT_PS + psrad m13, INTERP_SHIFT_PS + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS +%endif +%endif + + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, 
m12, 11011000b + vpermq m0, m0, 11011000b +%ifidn %1,pp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%elifidn %1, sp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%endif + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loopH RET %endif %endmacro -FILTER_VER_LUMA_AVX2_8x8 pp -FILTER_VER_LUMA_AVX2_8x8 ps -FILTER_VER_LUMA_AVX2_8x8 sp -FILTER_VER_LUMA_AVX2_8x8 ss +FILTER_VER_LUMA_AVX2_8xN pp, 16 +FILTER_VER_LUMA_AVX2_8xN pp, 32 +FILTER_VER_LUMA_AVX2_8xN ps, 16 +FILTER_VER_LUMA_AVX2_8xN ps, 32 +FILTER_VER_LUMA_AVX2_8xN sp, 16 +FILTER_VER_LUMA_AVX2_8xN sp, 32 +FILTER_VER_LUMA_AVX2_8xN ss, 16 +FILTER_VER_LUMA_AVX2_8xN ss, 32 -%macro PROCESS_LUMA_AVX2_W8_16R 1 +%macro PROCESS_LUMA_AVX2_W8_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -2797,43 +1851,35 @@ FILTER_VER_LUMA_AVX2_8x8 ss paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] + pmaddwd m8, [r5 + 1 * mmsize] + paddd m6, m8 movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 + punpckhwd xm8, xm9, xm10 punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 3 * mmsize] - paddd m3, m11 - pmaddwd m11, m9, [r5 + 2 * mmsize] - paddd m5, m11 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 3 * mmsize] - paddd m4, m12 - pmaddwd m12, m10, [r5 + 2 * mmsize] - paddd m6, m12 - pmaddwd m12, m10, [r5 + 
1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + paddd m5, m8 + pmaddwd m9, [r5 + 1 * mmsize] + paddd m7, m9 + movu xm8, [r7 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + paddd m4, m9 + pmaddwd m10, [r5 + 2 * mmsize] + paddd m6, m10 lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 3 * mmsize] - paddd m5, m13 - pmaddwd m13, m11, [r5 + 2 * mmsize] - paddd m7, m13 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] + movu xm9, [r7] ; m9 = row 12 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m5, m10 + pmaddwd m8, [r5 + 2 * mmsize] + paddd m7, m8 %ifidn %1,ss psrad m0, 6 @@ -2843,12 +1889,12 @@ FILTER_VER_LUMA_AVX2_8x8 ss psrad m4, 6 psrad m5, 6 %else - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + paddd m4, m11 + paddd m5, m11 %ifidn %1,pp psrad m0, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP @@ -2879,58 +1925,47 @@ FILTER_VER_LUMA_AVX2_8x8 ss vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b - pxor m5, m5 - mova m3, [pw_pixel_max] + pxor m8, m8 %ifidn %1,pp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 + CLIPW m0, m8, m12 + CLIPW m2, m8, m12 + CLIPW m4, m8, m12 %elifidn %1, sp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 + CLIPW m0, m8, m12 + CLIPW m2, m8, m12 + CLIPW m4, m8, m12 %endif vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 - vextracti128 xm1, m2, 1 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm1 + movu [r2 + r6], xm3 lea 
r8, [r2 + r3 * 4] - vextracti128 xm1, m4, 1 movu [r8], xm4 - movu [r8 + r3], xm1 + movu [r8 + r3], xm5 - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m0, m12, [r5 + 3 * mmsize] - paddd m6, m0 - pmaddwd m0, m12, [r5 + 2 * mmsize] - paddd m8, m0 - pmaddwd m0, m12, [r5 + 1 * mmsize] - paddd m10, m0 - pmaddwd m12, [r5] + movu xm10, [r7 + r1] ; m10 = row 13 + punpckhwd xm0, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m1, m13, [r5 + 3 * mmsize] - paddd m7, m1 - pmaddwd m1, m13, [r5 + 2 * mmsize] - paddd m9, m1 - pmaddwd m1, m13, [r5 + 1 * mmsize] - paddd m11, m1 - pmaddwd m13, [r5] + punpckhwd xm1, xm10, xm0 + punpcklwd xm10, xm0 + vinserti128 m10, m10, xm1, 1 + pmaddwd m10, [r5 + 3 * mmsize] + paddd m7, m10 %ifidn %1,ss psrad m6, 6 psrad m7, 6 %else - paddd m6, m14 - paddd m7, m14 + paddd m6, m11 + paddd m7, m11 %ifidn %1,pp psrad m6, INTERP_SHIFT_PP psrad m7, INTERP_SHIFT_PP @@ -2946,174 +1981,19 @@ FILTER_VER_LUMA_AVX2_8x8 ss packssdw m6, m7 vpermq m6, m6, 11011000b %ifidn %1,pp - CLIPW m6, m5, m3 + CLIPW m6, m8, m12 %elifidn %1, sp - CLIPW m6, m5, m3 + CLIPW m6, m8, m12 %endif vextracti128 xm7, m6, 1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 - - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m8, m2 - pmaddwd m2, m0, [r5 + 2 * mmsize] - paddd m10, m2 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m12, m2 - pmaddwd m0, [r5] - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhwd xm6, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm6, 1 - pmaddwd m6, m1, [r5 + 3 * mmsize] - paddd m9, m6 - pmaddwd m6, m1, [r5 + 2 * mmsize] - paddd m11, m6 - pmaddwd m6, m1, [r5 + 1 * mmsize] - 
paddd m13, m6 - pmaddwd m1, [r5] - movu xm6, [r7 + r1] ; m6 = row 17 - punpckhwd xm4, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m10, m4 - pmaddwd m4, m2, [r5 + 2 * mmsize] - paddd m12, m4 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm2, 1 - pmaddwd m2, m6, [r5 + 3 * mmsize] - paddd m11, m2 - pmaddwd m2, m6, [r5 + 2 * mmsize] - paddd m13, m2 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - movu xm2, [r7 + r4] ; m2 = row 19 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 3 * mmsize] - paddd m12, m6 - pmaddwd m4, [r5 + 2 * mmsize] - paddd m0, m4 - lea r7, [r7 + r1 * 4] - movu xm6, [r7] ; m6 = row 20 - punpckhwd xm7, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 3 * mmsize] - paddd m13, m7 - pmaddwd m2, [r5 + 2 * mmsize] - paddd m1, m2 - movu xm7, [r7 + r1] ; m7 = row 21 - punpckhwd xm2, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddwd m6, [r5 + 3 * mmsize] - paddd m0, m6 - movu xm2, [r7 + r1 * 2] ; m2 = row 22 - punpckhwd xm6, xm7, xm2 - punpcklwd xm7, xm2 - vinserti128 m7, m7, xm6, 1 - pmaddwd m7, [r5 + 3 * mmsize] - paddd m1, m7 - -%ifidn %1,ss - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 - psrad m0, 6 - psrad m1, 6 -%else - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 - paddd m12, m14 - paddd m13, m14 - paddd m0, m14 - paddd m1, m14 -%ifidn %1,pp - psrad m8, INTERP_SHIFT_PP - psrad m9, INTERP_SHIFT_PP - psrad m10, INTERP_SHIFT_PP - psrad m11, INTERP_SHIFT_PP - psrad m12, INTERP_SHIFT_PP - psrad m13, INTERP_SHIFT_PP - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP - psrad m12, INTERP_SHIFT_SP 
- psrad m13, INTERP_SHIFT_SP - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP -%else - psrad m8, INTERP_SHIFT_PS - psrad m9, INTERP_SHIFT_PS - psrad m10, INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS - psrad m12, INTERP_SHIFT_PS - psrad m13, INTERP_SHIFT_PS - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS -%endif -%endif - - packssdw m8, m9 - packssdw m10, m11 - packssdw m12, m13 - packssdw m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b -%ifidn %1,pp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%elifidn %1, sp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%endif - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - lea r8, [r8 + r3 * 4] - movu [r8], xm12 - movu [r8 + r3], xm13 - movu [r8 + r3 * 2], xm0 - movu [r8 + r6], xm1 %endmacro -%macro FILTER_VER_LUMA_AVX2_Nx16 2 +%macro FILTER_VER_LUMA_AVX2_Nx8 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 +cglobal interp_8tap_vert_%1_%2x8, 4, 10, 13 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -3129,16 +2009,17 @@ cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp - vbroadcasti128 m14, [pd_32] + vbroadcasti128 m11, [pd_32] %elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] + vbroadcasti128 m11, [INTERP_OFFSET_SP] %else - vbroadcasti128 m14, [INTERP_OFFSET_PS] + vbroadcasti128 m11, [INTERP_OFFSET_PS] %endif + mova m12, [pw_pixel_max] lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: - PROCESS_LUMA_AVX2_W8_16R %1 + PROCESS_LUMA_AVX2_W8_8R %1 add r2, 16 add r0, 16 dec r9d @@ -3147,23 +2028,19 @@ cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 %endif %endmacro -FILTER_VER_LUMA_AVX2_Nx16 pp, 16 -FILTER_VER_LUMA_AVX2_Nx16 pp, 32 -FILTER_VER_LUMA_AVX2_Nx16 
pp, 64 -FILTER_VER_LUMA_AVX2_Nx16 ps, 16 -FILTER_VER_LUMA_AVX2_Nx16 ps, 32 -FILTER_VER_LUMA_AVX2_Nx16 ps, 64 -FILTER_VER_LUMA_AVX2_Nx16 sp, 16 -FILTER_VER_LUMA_AVX2_Nx16 sp, 32 -FILTER_VER_LUMA_AVX2_Nx16 sp, 64 -FILTER_VER_LUMA_AVX2_Nx16 ss, 16 -FILTER_VER_LUMA_AVX2_Nx16 ss, 32 -FILTER_VER_LUMA_AVX2_Nx16 ss, 64 +FILTER_VER_LUMA_AVX2_Nx8 pp, 32 +FILTER_VER_LUMA_AVX2_Nx8 pp, 16 +FILTER_VER_LUMA_AVX2_Nx8 ps, 32 +FILTER_VER_LUMA_AVX2_Nx8 ps, 16 +FILTER_VER_LUMA_AVX2_Nx8 sp, 32 +FILTER_VER_LUMA_AVX2_Nx8 sp, 16 +FILTER_VER_LUMA_AVX2_Nx8 ss, 32 +FILTER_VER_LUMA_AVX2_Nx8 ss, 16 -%macro FILTER_VER_LUMA_AVX2_NxN 3 +%macro FILTER_VER_LUMA_AVX2_32x24 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -3178,101 +2055,44 @@ cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 lea r4, [r1 * 3] sub r0, r4 - -%ifidn %3,pp +%ifidn %1,pp vbroadcasti128 m14, [pd_32] -%elifidn %3, sp +%elifidn %1, sp vbroadcasti128 m14, [INTERP_OFFSET_SP] %else vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 8 + mov r9d, 4 .loopW: - PROCESS_LUMA_AVX2_W8_16R %3 + PROCESS_LUMA_AVX2_W8_16R %1 add r2, 16 add r0, 16 - dec r10d + dec r9d jnz .loopW - sub r7, r11 - lea r0, [r7 - 2 * %1 + 16] - lea r2, [r8 + r3 * 4 - 2 * %1 + 16] + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 48] + lea r2, [r8 + r3 * 4 - 48] + mova m11, m14 + mova m12, m3 + mov r9d, 4 +.loop: + PROCESS_LUMA_AVX2_W8_8R %1 + add r2, 16 + add r0, 16 dec r9d - jnz .loopH + jnz .loop RET %endif %endmacro -FILTER_VER_LUMA_AVX2_NxN 16, 32, pp -FILTER_VER_LUMA_AVX2_NxN 16, 64, pp -FILTER_VER_LUMA_AVX2_NxN 24, 32, pp -FILTER_VER_LUMA_AVX2_NxN 32, 32, pp -FILTER_VER_LUMA_AVX2_NxN 32, 64, pp -FILTER_VER_LUMA_AVX2_NxN 48, 64, pp -FILTER_VER_LUMA_AVX2_NxN 64, 32, pp -FILTER_VER_LUMA_AVX2_NxN 64, 48, pp -FILTER_VER_LUMA_AVX2_NxN 64, 64, pp -FILTER_VER_LUMA_AVX2_NxN 
16, 32, ps -FILTER_VER_LUMA_AVX2_NxN 16, 64, ps -FILTER_VER_LUMA_AVX2_NxN 24, 32, ps -FILTER_VER_LUMA_AVX2_NxN 32, 32, ps -FILTER_VER_LUMA_AVX2_NxN 32, 64, ps -FILTER_VER_LUMA_AVX2_NxN 48, 64, ps -FILTER_VER_LUMA_AVX2_NxN 64, 32, ps -FILTER_VER_LUMA_AVX2_NxN 64, 48, ps -FILTER_VER_LUMA_AVX2_NxN 64, 64, ps -FILTER_VER_LUMA_AVX2_NxN 16, 32, sp -FILTER_VER_LUMA_AVX2_NxN 16, 64, sp -FILTER_VER_LUMA_AVX2_NxN 24, 32, sp -FILTER_VER_LUMA_AVX2_NxN 32, 32, sp -FILTER_VER_LUMA_AVX2_NxN 32, 64, sp -FILTER_VER_LUMA_AVX2_NxN 48, 64, sp -FILTER_VER_LUMA_AVX2_NxN 64, 32, sp -FILTER_VER_LUMA_AVX2_NxN 64, 48, sp -FILTER_VER_LUMA_AVX2_NxN 64, 64, sp -FILTER_VER_LUMA_AVX2_NxN 16, 32, ss -FILTER_VER_LUMA_AVX2_NxN 16, 64, ss -FILTER_VER_LUMA_AVX2_NxN 24, 32, ss -FILTER_VER_LUMA_AVX2_NxN 32, 32, ss -FILTER_VER_LUMA_AVX2_NxN 32, 64, ss -FILTER_VER_LUMA_AVX2_NxN 48, 64, ss -FILTER_VER_LUMA_AVX2_NxN 64, 32, ss -FILTER_VER_LUMA_AVX2_NxN 64, 48, ss -FILTER_VER_LUMA_AVX2_NxN 64, 64, ss - -%macro FILTER_VER_LUMA_AVX2_8xN 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer + r4] -%endif +FILTER_VER_LUMA_AVX2_32x24 pp +FILTER_VER_LUMA_AVX2_32x24 ps +FILTER_VER_LUMA_AVX2_32x24 sp +FILTER_VER_LUMA_AVX2_32x24 ss - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - vbroadcasti128 m14, [pd_32] -%elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m14, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - lea r7, [r1 * 4] - mov r8d, %2 / 16 -.loopH: +%macro PROCESS_LUMA_AVX2_W8_4R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 @@ -3305,409 +2125,236 @@ cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] + pmaddwd m4, [r5 + 1 * mmsize] + 
paddd m2, m4 movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 + punpckhwd xm4, xm5, xm6 punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 2 * mmsize] - paddd m1, m7 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 3 * mmsize] - paddd m0, m8 - pmaddwd m8, m6, [r5 + 2 * mmsize] - paddd m2, m8 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 3 * mmsize] - paddd m1, m9 - pmaddwd m9, m7, [r5 + 2 * mmsize] - paddd m3, m9 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m2, m10 - pmaddwd m10, m8, [r5 + 2 * mmsize] - paddd m4, m10 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 3 * mmsize] - paddd m3, m11 - pmaddwd m11, m9, [r5 + 2 * mmsize] - paddd m5, m11 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 3 * mmsize] - paddd m4, m12 - pmaddwd m12, m10, [r5 + 2 * mmsize] - paddd m6, m12 - pmaddwd m12, m10, [r5 + 1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] + vinserti128 m5, m5, xm4, 1 + pmaddwd m4, m5, [r5 + 2 * mmsize] + paddd m1, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + movu xm4, [r0 + r4] ; m4 = row 7 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd 
m5, m6, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m6, [r5 + 2 * mmsize] + paddd m2, m6 lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 3 * mmsize] - paddd m5, m13 - pmaddwd m13, m11, [r5 + 2 * mmsize] - paddd m7, m13 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] + movu xm5, [r0] ; m5 = row 8 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m3, m4 + movu xm6, [r0 + r1] ; m6 = row 9 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 3 * mmsize] + paddd m2, m5 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 %ifidn %1,ss psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 - psrad m4, 6 - psrad m5, 6 %else - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 %ifidn %1,pp psrad m0, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP psrad m2, INTERP_SHIFT_PP psrad m3, INTERP_SHIFT_PP - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP %elifidn %1, sp psrad m0, INTERP_SHIFT_SP psrad m1, INTERP_SHIFT_SP psrad m2, INTERP_SHIFT_SP psrad m3, INTERP_SHIFT_SP - psrad m4, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP %else psrad m0, INTERP_SHIFT_PS psrad m1, INTERP_SHIFT_PS psrad m2, INTERP_SHIFT_PS psrad m3, INTERP_SHIFT_PS - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS %endif %endif packssdw m0, m1 packssdw m2, m3 - packssdw m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - pxor m5, m5 - mova m3, [pw_pixel_max] + pxor m4, m4 %ifidn %1,pp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 + CLIPW m0, m4, [pw_pixel_max] + 
CLIPW m2, m4, [pw_pixel_max] %elifidn %1, sp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 + CLIPW m0, m4, [pw_pixel_max] + CLIPW m2, m4, [pw_pixel_max] %endif vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endmacro + +%macro FILTER_VER_LUMA_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + mov dword [rsp], 2 +.loopW: + PROCESS_LUMA_AVX2_W8_4R %1 movu [r2], xm0 movu [r2 + r3], xm1 - vextracti128 xm1, m2, 1 movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm1 - lea r2, [r2 + r3 * 4] - vextracti128 xm1, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm1 + lea r6, [r3 * 3] + movu [r2 + r6], xm3 + add r2, 16 + lea r6, [8 * r1 - 16] + sub r0, r6 + dec dword [rsp] + jnz .loopW + RET +%endmacro - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m0, m12, [r5 + 3 * mmsize] - paddd m6, m0 - pmaddwd m0, m12, [r5 + 2 * mmsize] - paddd m8, m0 - pmaddwd m0, m12, [r5 + 1 * mmsize] - paddd m10, m0 - pmaddwd m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m1, m13, [r5 + 3 * mmsize] - paddd m7, m1 - pmaddwd m1, m13, [r5 + 2 * mmsize] - paddd m9, m1 - pmaddwd m1, m13, [r5 + 1 * mmsize] - paddd m11, m1 - pmaddwd m13, [r5] +FILTER_VER_LUMA_AVX2_16x4 pp +FILTER_VER_LUMA_AVX2_16x4 ps +FILTER_VER_LUMA_AVX2_16x4 sp +FILTER_VER_LUMA_AVX2_16x4 ss -%ifidn %1,ss - psrad m6, 6 - psrad m7, 6 +%macro FILTER_VER_LUMA_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + 
+%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 %else - paddd m6, m14 - paddd m7, m14 + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 %ifidn %1,pp - psrad m6, INTERP_SHIFT_PP - psrad m7, INTERP_SHIFT_PP + vbroadcasti128 m7, [pd_32] %elifidn %1, sp - psrad m6, INTERP_SHIFT_SP - psrad m7, INTERP_SHIFT_SP + vbroadcasti128 m7, [INTERP_OFFSET_SP] %else - psrad m6, INTERP_SHIFT_PS - psrad m7, INTERP_SHIFT_PS + vbroadcasti128 m7, [INTERP_OFFSET_PS] %endif + + PROCESS_LUMA_AVX2_W8_4R %1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 + RET +%endmacro + +FILTER_VER_LUMA_AVX2_8x4 pp +FILTER_VER_LUMA_AVX2_8x4 ps +FILTER_VER_LUMA_AVX2_8x4 sp +FILTER_VER_LUMA_AVX2_8x4 ss + +%macro FILTER_VER_LUMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] %endif - packssdw m6, m7 - vpermq m6, m6, 11011000b + lea r4, [r1 * 3] + sub r0, r4 %ifidn %1,pp - CLIPW m6, m5, m3 + vbroadcasti128 m14, [pd_32] %elifidn %1, sp - CLIPW m6, m5, m3 + vbroadcasti128 m14, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - - movu xm1, [r0 + r4] ; m1 = row 15 + mova m13, [pw_pixel_max] + pxor m12, m12 + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m8, m2 - pmaddwd m2, m0, [r5 + 2 * mmsize] - paddd m10, m2 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m12, m2 pmaddwd m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhwd xm6, xm1, xm2 + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 - vinserti128 m1, 
m1, xm6, 1 - pmaddwd m6, m1, [r5 + 3 * mmsize] - paddd m9, m6 - pmaddwd m6, m1, [r5 + 2 * mmsize] - paddd m11, m6 - pmaddwd m6, m1, [r5 + 1 * mmsize] - paddd m13, m6 + vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] - movu xm6, [r0 + r1] ; m6 = row 17 - punpckhwd xm4, xm2, xm6 - punpcklwd xm2, xm6 + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m10, m4 - pmaddwd m4, m2, [r5 + 2 * mmsize] - paddd m12, m4 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm2, 1 - pmaddwd m2, m6, [r5 + 3 * mmsize] - paddd m11, m2 - pmaddwd m2, m6, [r5 + 2 * mmsize] - paddd m13, m2 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - movu xm2, [r0 + r4] ; m2 = row 19 - punpckhwd xm6, xm4, xm2 - punpcklwd xm4, xm2 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 3 * mmsize] - paddd m12, m6 - pmaddwd m4, [r5 + 2 * mmsize] + pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhwd xm7, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm7, 1 - pmaddwd m7, m2, [r5 + 3 * mmsize] - paddd m13, m7 - pmaddwd m2, [r5 + 2 * mmsize] - paddd m1, m2 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhwd xm2, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddwd m6, [r5 + 3 * mmsize] - paddd m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhwd xm6, xm7, xm2 - punpcklwd xm7, xm2 - vinserti128 m7, m7, xm6, 1 - pmaddwd m7, [r5 + 3 * mmsize] - paddd m1, m7 - -%ifidn %1,ss - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 - psrad m0, 6 - psrad m1, 6 -%else - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 - paddd m12, m14 - paddd m13, m14 - paddd m0, m14 - paddd m1, m14 -%ifidn %1,pp - psrad m8, INTERP_SHIFT_PP - psrad m9, INTERP_SHIFT_PP - psrad m10, INTERP_SHIFT_PP - psrad 
m11, INTERP_SHIFT_PP - psrad m12, INTERP_SHIFT_PP - psrad m13, INTERP_SHIFT_PP - psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP - psrad m12, INTERP_SHIFT_SP - psrad m13, INTERP_SHIFT_SP - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP -%else - psrad m8, INTERP_SHIFT_PS - psrad m9, INTERP_SHIFT_PS - psrad m10, INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS - psrad m12, INTERP_SHIFT_PS - psrad m13, INTERP_SHIFT_PS - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS -%endif -%endif - - packssdw m8, m9 - packssdw m10, m11 - packssdw m12, m13 - packssdw m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b -%ifidn %1,pp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%elifidn %1, sp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%endif - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - lea r2, [r2 + r3 * 4] - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 - lea r2, [r2 + r3 * 4] - sub r0, r7 - dec r8d - jnz .loopH - RET -%endif -%endmacro - -FILTER_VER_LUMA_AVX2_8xN pp, 16 -FILTER_VER_LUMA_AVX2_8xN pp, 32 -FILTER_VER_LUMA_AVX2_8xN ps, 16 -FILTER_VER_LUMA_AVX2_8xN ps, 32 -FILTER_VER_LUMA_AVX2_8xN sp, 16 -FILTER_VER_LUMA_AVX2_8xN sp, 32 -FILTER_VER_LUMA_AVX2_8xN ss, 16 -FILTER_VER_LUMA_AVX2_8xN ss, 32 - -%macro PROCESS_LUMA_AVX2_W8_8R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 
m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 @@ -3752,175 +2399,235 @@ FILTER_VER_LUMA_AVX2_8xN ss, 32 paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 - pmaddwd m8, [r5 + 1 * mmsize] - paddd m6, m8 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhwd xm8, xm9, xm10 + punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 - vinserti128 m9, m9, xm8, 1 - pmaddwd m8, m9, [r5 + 3 * mmsize] - paddd m3, m8 - pmaddwd m8, m9, [r5 + 2 * mmsize] - paddd m5, m8 - pmaddwd m9, [r5 + 1 * mmsize] - paddd m7, m9 - movu xm8, [r7 + r4] ; m8 = row 11 - punpckhwd xm9, xm10, xm8 - punpcklwd xm10, xm8 - vinserti128 m10, m10, xm9, 1 - pmaddwd m9, m10, [r5 + 3 * mmsize] - paddd m4, m9 - pmaddwd m10, [r5 + 2 * mmsize] - paddd m6, m10 - lea r7, [r7 + r1 * 4] - movu xm9, [r7] ; m9 = row 12 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m5, m10 - pmaddwd m8, [r5 + 2 * mmsize] - paddd m7, m8 + vinserti128 m9, m9, xm11, 
1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] %ifidn %1,ss psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 - psrad m4, 6 - psrad m5, 6 %else - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 - paddd m4, m11 - paddd m5, m11 + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 %ifidn %1,pp psrad m0, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP psrad m2, INTERP_SHIFT_PP psrad m3, INTERP_SHIFT_PP - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP %elifidn %1, sp psrad m0, INTERP_SHIFT_SP psrad m1, INTERP_SHIFT_SP psrad m2, INTERP_SHIFT_SP psrad m3, INTERP_SHIFT_SP - psrad m4, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP %else psrad m0, INTERP_SHIFT_PS psrad m1, INTERP_SHIFT_PS psrad m2, INTERP_SHIFT_PS psrad m3, INTERP_SHIFT_PS - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS %endif %endif packssdw m0, m1 packssdw m2, m3 - packssdw m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - pxor m8, m8 %ifidn %1,pp - CLIPW m0, m8, m12 - CLIPW m2, m8, m12 - CLIPW m4, m8, m12 + CLIPW m0, m12, m13 + CLIPW m2, m12, m13 %elifidn %1, sp - CLIPW m0, m8, m12 - CLIPW m2, m8, m12 - CLIPW m4, m8, m12 + CLIPW m0, m12, m13 + CLIPW m2, m12, m13 %endif vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 - movu xm10, [r7 + r1] ; m10 = row 13 - punpckhwd xm0, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm0, 1 - pmaddwd m9, [r5 + 3 * mmsize] - paddd m6, m9 + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm0, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm0, 1 + pmaddwd m0, m10, [r5 + 3 * mmsize] + paddd m4, m0 + pmaddwd m0, m10, [r5 + 2 * mmsize] + paddd m6, m0 + pmaddwd m0, m10, [r5 + 1 * mmsize] + paddd m8, m0 
+ pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 12 + punpckhwd xm1, xm11, xm0 + punpcklwd xm11, xm0 + vinserti128 m11, m11, xm1, 1 + pmaddwd m1, m11, [r5 + 3 * mmsize] + paddd m5, m1 + pmaddwd m1, m11, [r5 + 2 * mmsize] + paddd m7, m1 + pmaddwd m1, m11, [r5 + 1 * mmsize] + paddd m9, m1 + pmaddwd m11, [r5] + movu xm2, [r7 + r1] ; m2 = row 13 + punpckhwd xm1, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm1, 1 + pmaddwd m1, m0, [r5 + 3 * mmsize] + paddd m6, m1 + pmaddwd m1, m0, [r5 + 2 * mmsize] + paddd m8, m1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m10, m0 movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm10, xm0 - punpcklwd xm10, xm0 - vinserti128 m10, m10, xm1, 1 - pmaddwd m10, [r5 + 3 * mmsize] - paddd m7, m10 + punpckhwd xm1, xm2, xm0 + punpcklwd xm2, xm0 + vinserti128 m2, m2, xm1, 1 + pmaddwd m1, m2, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m2, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m11, m2 %ifidn %1,ss + psrad m4, 6 + psrad m5, 6 psrad m6, 6 psrad m7, 6 %else - paddd m6, m11 - paddd m7, m11 + paddd m4, m14 + paddd m5, m14 + paddd m6, m14 + paddd m7, m14 %ifidn %1,pp + psrad m4, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP psrad m6, INTERP_SHIFT_PP psrad m7, INTERP_SHIFT_PP %elifidn %1, sp + psrad m4, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP psrad m6, INTERP_SHIFT_SP psrad m7, INTERP_SHIFT_SP %else + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS psrad m6, INTERP_SHIFT_PS psrad m7, INTERP_SHIFT_PS %endif %endif + packssdw m4, m5 packssdw m6, m7 + vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b %ifidn %1,pp - CLIPW m6, m8, m12 + CLIPW m4, m12, m13 + CLIPW m6, m12, m13 %elifidn %1, sp - CLIPW m6, m8, m12 + CLIPW m4, m12, m13 + CLIPW m6, m12, m13 %endif + lea r8, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 -%endmacro -%macro FILTER_VER_LUMA_AVX2_Nx8 2 -INIT_YMM avx2 -%if 
ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_%2x8, 4, 10, 13 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m0, [r5 + 2 * mmsize] + paddd m10, m0 + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m1, [r5 + 2 * mmsize] + paddd m11, m1 + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m10, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m11, m3 -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 %else - lea r5, [tab_LumaCoeffVer + r4] + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 +%ifidn %1,pp + psrad m8, INTERP_SHIFT_PP + psrad m9, INTERP_SHIFT_PP + psrad m10, INTERP_SHIFT_PP + psrad m11, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP +%else + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS +%endif %endif - lea r4, [r1 * 3] - sub r0, r4 + packssdw m8, m9 + packssdw m10, m11 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b %ifidn %1,pp - vbroadcasti128 m11, [pd_32] + CLIPW m8, m12, m13 + CLIPW m10, m12, m13 %elifidn %1, sp - vbroadcasti128 m11, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m11, [INTERP_OFFSET_PS] + CLIPW m8, m12, m13 + CLIPW m10, m12, m13 %endif - mova m12, [pw_pixel_max] - lea r6, [r3 * 3] - mov r9d, %2 / 8 -.loopW: - PROCESS_LUMA_AVX2_W8_8R %1 + vextracti128 
xm9, m8, 1 + vextracti128 xm11, m10, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 add r2, 16 add r0, 16 dec r9d @@ -3929,19 +2636,14 @@ cglobal interp_8tap_vert_%1_%2x8, 4, 10, 13 %endif %endmacro -FILTER_VER_LUMA_AVX2_Nx8 pp, 32 -FILTER_VER_LUMA_AVX2_Nx8 pp, 16 -FILTER_VER_LUMA_AVX2_Nx8 ps, 32 -FILTER_VER_LUMA_AVX2_Nx8 ps, 16 -FILTER_VER_LUMA_AVX2_Nx8 sp, 32 -FILTER_VER_LUMA_AVX2_Nx8 sp, 16 -FILTER_VER_LUMA_AVX2_Nx8 ss, 32 -FILTER_VER_LUMA_AVX2_Nx8 ss, 16 +FILTER_VER_LUMA_AVX2_16x12 pp +FILTER_VER_LUMA_AVX2_16x12 ps +FILTER_VER_LUMA_AVX2_16x12 sp +FILTER_VER_LUMA_AVX2_16x12 ss -%macro FILTER_VER_LUMA_AVX2_32x24 1 +%macro FILTER_VER_LUMA_AVX2_4x8 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 +cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -3956,595 +2658,413 @@ cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 lea r4, [r1 * 3] sub r0, r4 + %ifidn %1,pp - vbroadcasti128 m14, [pd_32] + vbroadcasti128 m7, [pd_32] %elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] + vbroadcasti128 m7, [INTERP_OFFSET_SP] %else - vbroadcasti128 m14, [INTERP_OFFSET_PS] + vbroadcasti128 m7, [INTERP_OFFSET_PS] %endif lea r6, [r3 * 3] - mov r9d, 4 -.loopW: - PROCESS_LUMA_AVX2_W8_16R %1 - add r2, 16 - add r0, 16 - dec r9d - jnz .loopW - lea r9, [r1 * 4] - sub r7, r9 - lea r0, [r7 - 48] - lea r2, [r8 + r3 * 4 - 48] - mova m11, m14 - mova m12, m3 - mov r9d, 4 -.loop: - PROCESS_LUMA_AVX2_W8_8R %1 - add r2, 16 - add r0, 16 - dec r9d - jnz .loop - RET -%endif -%endmacro - -FILTER_VER_LUMA_AVX2_32x24 pp -FILTER_VER_LUMA_AVX2_32x24 ps -FILTER_VER_LUMA_AVX2_32x24 sp -FILTER_VER_LUMA_AVX2_32x24 ss -%macro PROCESS_LUMA_AVX2_W8_4R 1 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 + movq xm0, [r0] + movq xm1, [r0 + r1] punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 
2 - punpckhwd xm3, xm1, xm2 + movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m4, m5, [r5 + 2 * mmsize] - paddd m1, m4 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 - movu xm4, [r0 + r4] ; m4 = row 7 - punpckhwd xm5, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm5, 1 - pmaddwd m5, m6, [r5 + 3 * mmsize] + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] paddd m0, m5 - pmaddwd m6, [r5 + 2 * mmsize] - paddd m2, m6 - lea r0, [r0 + r1 * 4] - movu xm5, [r0] ; m5 = row 8 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 3 * mmsize] - paddd m1, m6 - pmaddwd m4, [r5 + 2 * mmsize] - paddd m3, m4 - movu xm6, [r0 + r1] ; m6 = row 9 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m5, [r5 + 3 * mmsize] + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] paddd m2, m5 - movu xm4, [r0 + r1 * 2] ; m4 = row 
10 - punpckhwd xm5, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 m6, m6, xm5, 1 - pmaddwd m6, [r5 + 3 * mmsize] - paddd m3, m6 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 %ifidn %1,ss psrad m0, 6 - psrad m1, 6 psrad m2, 6 - psrad m3, 6 %else paddd m0, m7 - paddd m1, m7 paddd m2, m7 - paddd m3, m7 %ifidn %1,pp psrad m0, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP %elifidn %1, sp psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP %else psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS %endif %endif - packssdw m0, m1 - packssdw m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - pxor m4, m4 + packssdw m0, m2 + pxor m6, m6 + mova m3, [pw_pixel_max] %ifidn %1,pp - CLIPW m0, m4, [pw_pixel_max] - CLIPW m2, m4, [pw_pixel_max] + CLIPW m0, m6, m3 %elifidn %1, sp - CLIPW m0, m4, [pw_pixel_max] - CLIPW m2, m4, [pw_pixel_max] + CLIPW m0, m6, m3 %endif - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 -%endmacro + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 -%macro FILTER_VER_LUMA_AVX2_16x4 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0-gprsize - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 
+ lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [C B B A] + pmaddwd m2, m5, [r5 + 3 * mmsize] + paddd m4, m2 + pmaddwd m5, [r5 + 2 * mmsize] + paddd m1, m5 + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm5, [r0 + 2 * r1] + punpcklwd xm2, xm5 + vinserti128 m0, m0, xm2, 1 ; m0 = [E D D C] + pmaddwd m0, [r5 + 3 * mmsize] + paddd m1, m0 -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 +%ifidn %1,ss + psrad m4, 6 + psrad m1, 6 %else - lea r5, [tab_LumaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 + paddd m4, m7 + paddd m1, m7 %ifidn %1,pp - vbroadcasti128 m7, [pd_32] + psrad m4, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP %elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] + psrad m4, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP %else - vbroadcasti128 m7, [INTERP_OFFSET_PS] + psrad m4, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS %endif - mov dword [rsp], 2 -.loopW: - PROCESS_LUMA_AVX2_W8_4R %1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - lea r6, [r3 * 3] - movu [r2 + r6], xm3 - add r2, 16 - lea r6, [8 * r1 - 16] - sub r0, r6 - dec dword [rsp] - jnz .loopW - RET -%endmacro - -FILTER_VER_LUMA_AVX2_16x4 pp -FILTER_VER_LUMA_AVX2_16x4 ps -FILTER_VER_LUMA_AVX2_16x4 sp -FILTER_VER_LUMA_AVX2_16x4 ss - -%macro FILTER_VER_LUMA_AVX2_8x4 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer + r4] %endif - lea r4, [r1 * 3] - sub r0, r4 + packssdw m4, m1 %ifidn %1,pp - vbroadcasti128 m7, [pd_32] + CLIPW m4, m6, m3 %elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] + CLIPW m4, m6, m3 %endif - PROCESS_LUMA_AVX2_W8_4R %1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - lea r4, [r3 * 3] - movu [r2 + r4], xm3 + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 
+ movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 RET %endmacro -FILTER_VER_LUMA_AVX2_8x4 pp -FILTER_VER_LUMA_AVX2_8x4 ps -FILTER_VER_LUMA_AVX2_8x4 sp -FILTER_VER_LUMA_AVX2_8x4 ss +FILTER_VER_LUMA_AVX2_4x8 pp +FILTER_VER_LUMA_AVX2_4x8 ps +FILTER_VER_LUMA_AVX2_4x8 sp +FILTER_VER_LUMA_AVX2_4x8 ss -%macro FILTER_VER_LUMA_AVX2_16x12 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d +%macro PROCESS_LUMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 %else - lea r5, [tab_LumaCoeffVer + r4] + paddd m0, m7 + paddd m2, m7 +%ifidn %1,pp + 
psrad m0, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP +%elifidn %1, sp + psrad m0, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP +%else + psrad m0, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS +%endif %endif - lea r4, [r1 * 3] - sub r0, r4 + packssdw m0, m2 + pxor m3, m3 %ifidn %1,pp - vbroadcasti128 m14, [pd_32] + CLIPW m0, m3, [pw_pixel_max] %elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m14, [INTERP_OFFSET_PS] + CLIPW m0, m3, [pw_pixel_max] %endif - mova m13, [pw_pixel_max] - pxor m12, m12 - lea r6, [r3 * 3] - mov r9d, 2 -.loopW: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 2 * mmsize] - paddd m0, m6 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 2 * mmsize] - paddd m1, m7 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 + + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 3 * 
mmsize] + paddd m4, m2 + pmaddwd m2, m5, [r5 + 2 * mmsize] + paddd m1, m2 + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 pmaddwd m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 3 * mmsize] - paddd m0, m8 - pmaddwd m8, m6, [r5 + 2 * mmsize] - paddd m2, m8 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 3 * mmsize] - paddd m1, m9 - pmaddwd m9, m7, [r5 + 2 * mmsize] - paddd m3, m9 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 3 * mmsize] - paddd m2, m10 - pmaddwd m10, m8, [r5 + 2 * mmsize] - paddd m4, m10 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 3 * mmsize] - paddd m3, m11 - pmaddwd m11, m9, [r5 + 2 * mmsize] - paddd m5, m11 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m1, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m6, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] %ifidn %1,ss - psrad m0, 6 + psrad m4, 6 psrad m1, 6 - psrad m2, 6 - psrad m3, 6 %else - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 + paddd m4, m7 + paddd m1, m7 %ifidn %1,pp - psrad m0, INTERP_SHIFT_PP + psrad m4, INTERP_SHIFT_PP psrad m1, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP %elifidn %1, sp - psrad m0, 
INTERP_SHIFT_SP + psrad m4, INTERP_SHIFT_SP psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP %else - psrad m0, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS %endif %endif - packssdw m0, m1 - packssdw m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b + packssdw m4, m1 + pxor m2, m2 %ifidn %1,pp - CLIPW m0, m12, m13 - CLIPW m2, m12, m13 + CLIPW m4, m2, [pw_pixel_max] %elifidn %1, sp - CLIPW m0, m12, m13 - CLIPW m2, m12, m13 + CLIPW m4, m2, [pw_pixel_max] %endif - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhwd xm0, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm0, 1 - pmaddwd m0, m10, [r5 + 3 * mmsize] - paddd m4, m0 - pmaddwd m0, m10, [r5 + 2 * mmsize] - paddd m6, m0 - pmaddwd m0, m10, [r5 + 1 * mmsize] - paddd m8, m0 - pmaddwd m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm0, [r7] ; m0 = row 12 - punpckhwd xm1, xm11, xm0 - punpcklwd xm11, xm0 - vinserti128 m11, m11, xm1, 1 - pmaddwd m1, m11, [r5 + 3 * mmsize] - paddd m5, m1 - pmaddwd m1, m11, [r5 + 2 * mmsize] - paddd m7, m1 - pmaddwd m1, m11, [r5 + 1 * mmsize] - paddd m9, m1 - pmaddwd m11, [r5] - movu xm2, [r7 + r1] ; m2 = row 13 - punpckhwd xm1, xm0, xm2 - punpcklwd xm0, xm2 - vinserti128 m0, m0, xm1, 1 - pmaddwd m1, m0, [r5 + 3 * mmsize] - paddd m6, m1 - pmaddwd m1, m0, [r5 + 2 * mmsize] - paddd m8, m1 - pmaddwd m0, [r5 + 1 * mmsize] - paddd m10, m0 - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm2, xm0 - punpcklwd xm2, xm0 - vinserti128 m2, m2, xm1, 1 - pmaddwd m1, m2, [r5 + 3 * mmsize] - paddd m7, m1 - pmaddwd m1, m2, [r5 + 2 * mmsize] - paddd m9, m1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m11, 
m2 + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 3 * mmsize] + paddd m6, m4 + pmaddwd m4, m3, [r5 + 2 * mmsize] + paddd m5, m4 + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m4, m1, [r5 + 3 * mmsize] + paddd m5, m4 + pmaddwd m4, m1, [r5 + 2 * mmsize] + paddd m0, m4 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 %ifidn %1,ss - psrad m4, 6 - psrad m5, 6 psrad m6, 6 - psrad m7, 6 + psrad m5, 6 %else - paddd m4, m14 - paddd m5, m14 - paddd m6, m14 - paddd m7, m14 + paddd m6, m7 + paddd m5, m7 %ifidn %1,pp - psrad m4, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP psrad m6, INTERP_SHIFT_PP - psrad m7, INTERP_SHIFT_PP + psrad m5, INTERP_SHIFT_PP %elifidn %1, sp - psrad m4, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP psrad m6, INTERP_SHIFT_SP - psrad m7, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP %else - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS psrad m6, INTERP_SHIFT_PS - psrad m7, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS %endif %endif - packssdw m4, m5 - packssdw m6, m7 - vpermq m4, m4, 11011000b - vpermq m6, m6, 11011000b + packssdw m6, m5 + pxor m1, m1 %ifidn %1,pp - CLIPW m4, m12, m13 - CLIPW m6, m12, m13 + CLIPW m6, m1, [pw_pixel_max] %elifidn %1, sp - CLIPW m4, m12, m13 - CLIPW m6, m12, m13 + CLIPW m6, m1, [pw_pixel_max] %endif - lea r8, [r2 + r3 * 4] - vextracti128 xm1, m4, 1 - vextracti128 xm7, m6, 1 - movu [r8], xm4 - movu [r8 + r3], xm1 - movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m8, m2 - pmaddwd m0, [r5 + 2 * mmsize] - paddd m10, m0 - lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - 
punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m3, m1, [r5 + 3 * mmsize] - paddd m9, m3 - pmaddwd m1, [r5 + 2 * mmsize] - paddd m11, m1 - movu xm3, [r7 + r1] ; m3 = row 17 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m2, [r5 + 3 * mmsize] - paddd m10, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm2, 1 - pmaddwd m3, [r5 + 3 * mmsize] - paddd m11, m3 + vextracti128 xm5, m6, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + + movq xm4, [r0 + r4] + punpcklwd xm2, xm4 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm4, xm6 + vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m3, m2 + movq xm4, [r0 + r1] + punpcklwd xm6, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m6, m6, xm4, 1 ; m6 = [22 21 21 20] + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 %ifidn %1,ss - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 + psrad m0, 6 + psrad m3, 6 %else - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 + paddd m0, m7 + paddd m3, m7 %ifidn %1,pp - psrad m8, INTERP_SHIFT_PP - psrad m9, INTERP_SHIFT_PP - psrad m10, INTERP_SHIFT_PP - psrad m11, INTERP_SHIFT_PP + psrad m0, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP %elifidn %1, sp - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP + psrad m0, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP %else - psrad m8, INTERP_SHIFT_PS - psrad m9, INTERP_SHIFT_PS - psrad m10, INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS + psrad m0, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS %endif %endif - packssdw m8, m9 - packssdw m10, m11 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b + packssdw m0, m3 %ifidn %1,pp - CLIPW m8, m12, m13 - CLIPW m10, 
m12, m13 + CLIPW m0, m1, [pw_pixel_max] %elifidn %1, sp - CLIPW m8, m12, m13 - CLIPW m10, m12, m13 -%endif - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - lea r8, [r8 + r3 * 4] - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - add r2, 16 - add r0, 16 - dec r9d - jnz .loopW - RET + CLIPW m0, m1, [pw_pixel_max] %endif -%endmacro -FILTER_VER_LUMA_AVX2_16x12 pp -FILTER_VER_LUMA_AVX2_16x12 ps -FILTER_VER_LUMA_AVX2_16x12 sp -FILTER_VER_LUMA_AVX2_16x12 ss + vextracti128 xm3, m0, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 +%endmacro -%macro FILTER_VER_LUMA_AVX2_4x8 1 +%macro FILTER_VER_LUMA_AVX2_4x16 1 INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 +cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d @@ -4559,7 +3079,6 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 lea r4, [r1 * 3] sub r0, r4 - %ifidn %1,pp vbroadcasti128 m7, [pd_32] %elifidn %1, sp @@ -4568,698 +3087,278 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 vbroadcasti128 m7, [INTERP_OFFSET_PS] %endif lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endmacro - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - paddd m0, m5 - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd 
m5, m1, [r5 + 3 * mmsize] - paddd m0, m5 - pmaddwd m5, m1, [r5 + 2 * mmsize] - paddd m2, m5 - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] - pmaddwd m3, m6, [r5 + 3 * mmsize] - paddd m2, m3 - pmaddwd m3, m6, [r5 + 2 * mmsize] - paddd m4, m3 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 +FILTER_VER_LUMA_AVX2_4x16 pp +FILTER_VER_LUMA_AVX2_4x16 ps +FILTER_VER_LUMA_AVX2_4x16 sp +FILTER_VER_LUMA_AVX2_4x16 ss -%ifidn %1,ss - psrad m0, 6 - psrad m2, 6 +%macro FILTER_VER_LUMA_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 %else - paddd m0, m7 - paddd m2, m7 + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 %ifidn %1,pp - psrad m0, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP + vbroadcasti128 m14, [pd_32] %elifidn %1, sp - psrad m0, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP + vbroadcasti128 m14, [INTERP_OFFSET_SP] %else - psrad m0, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS + vbroadcasti128 m14, [INTERP_OFFSET_PS] %endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + mova m7, m14 + PROCESS_LUMA_AVX2_W4_16R %1 + RET %endif +%endmacro - packssdw m0, m2 - pxor m6, m6 - mova m3, [pw_pixel_max] -%ifidn %1,pp - CLIPW m0, m6, m3 -%elifidn %1, sp - CLIPW m0, m6, m3 -%endif +FILTER_VER_LUMA_AVX2_12x16 pp +FILTER_VER_LUMA_AVX2_12x16 ps +FILTER_VER_LUMA_AVX2_12x16 sp +FILTER_VER_LUMA_AVX2_12x16 ss - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 +;--------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t 
dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_PS 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize - movq xm2, [r0 + r4] - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [C B B A] - pmaddwd m2, m5, [r5 + 3 * mmsize] - paddd m4, m2 - pmaddwd m5, [r5 + 2 * mmsize] - paddd m1, m5 - movq xm2, [r0 + r1] - punpcklwd xm0, xm2 - movq xm5, [r0 + 2 * r1] - punpcklwd xm2, xm5 - vinserti128 m0, m0, xm2, 1 ; m0 = [E D D C] - pmaddwd m0, [r5 + 3 * mmsize] - paddd m1, m0 + add r1d, r1d + add r3d, r3d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 -%ifidn %1,ss - psrad m4, 6 - psrad m1, 6 -%else - paddd m4, m7 - paddd m1, m7 -%ifidn %1,pp - psrad m4, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m4, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] %else - psrad m4, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS -%endif + lea r6, [tab_LumaCoeffV + r4] %endif - packssdw m4, m1 -%ifidn %1,pp - CLIPW m4, m6, m3 -%elifidn %1, sp - CLIPW m4, m6, m3 -%endif + mova m7, [INTERP_OFFSET_PS] - vextracti128 xm1, m4, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH RET 
%endmacro -FILTER_VER_LUMA_AVX2_4x8 pp -FILTER_VER_LUMA_AVX2_4x8 ps -FILTER_VER_LUMA_AVX2_4x8 sp -FILTER_VER_LUMA_AVX2_4x8 ss +;--------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_PS 4, 4 + FILTER_VER_LUMA_PS 8, 8 + FILTER_VER_LUMA_PS 8, 4 + FILTER_VER_LUMA_PS 4, 8 + FILTER_VER_LUMA_PS 16, 16 + FILTER_VER_LUMA_PS 16, 8 + FILTER_VER_LUMA_PS 8, 16 + FILTER_VER_LUMA_PS 16, 12 + FILTER_VER_LUMA_PS 12, 16 + FILTER_VER_LUMA_PS 16, 4 + FILTER_VER_LUMA_PS 4, 16 + FILTER_VER_LUMA_PS 32, 32 + FILTER_VER_LUMA_PS 32, 16 + FILTER_VER_LUMA_PS 16, 32 + FILTER_VER_LUMA_PS 32, 24 + FILTER_VER_LUMA_PS 24, 32 + FILTER_VER_LUMA_PS 32, 8 + FILTER_VER_LUMA_PS 8, 32 + FILTER_VER_LUMA_PS 64, 64 + FILTER_VER_LUMA_PS 64, 32 + FILTER_VER_LUMA_PS 32, 64 + FILTER_VER_LUMA_PS 64, 48 + FILTER_VER_LUMA_PS 48, 64 + FILTER_VER_LUMA_PS 64, 16 + FILTER_VER_LUMA_PS 16, 64 -%macro PROCESS_LUMA_AVX2_W4_16R 1 - movq xm0, [r0] - movq xm1, [r0 + r1] - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 2 * mmsize] - paddd m0, m5 - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd 
m5, m1, [r5 + 3 * mmsize] - paddd m0, m5 - pmaddwd m5, m1, [r5 + 2 * mmsize] - paddd m2, m5 - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 + r1] - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] - pmaddwd m3, m6, [r5 + 3 * mmsize] - paddd m2, m3 - pmaddwd m3, m6, [r5 + 2 * mmsize] - paddd m4, m3 - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m1, m3 - pmaddwd m6, [r5] +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize -%ifidn %1,ss - psrad m0, 6 - psrad m2, 6 -%else - paddd m0, m7 - paddd m2, m7 -%ifidn %1,pp - psrad m0, INTERP_SHIFT_PP - psrad m2, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m0, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP + add r1d, r1d + add r3d, r3d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] %else - psrad m0, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS -%endif + lea r6, [tab_LumaCoeffV + r4] %endif - packssdw m0, m2 - pxor m3, m3 -%ifidn %1,pp - CLIPW m0, m3, [pw_pixel_max] -%elifidn %1, sp - CLIPW m0, m3, [pw_pixel_max] -%endif + mova m7, [INTERP_OFFSET_SP] - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R - movq xm2, [r0 + r4] - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] - pmaddwd m2, m5, [r5 + 3 * mmsize] - paddd m4, m2 - pmaddwd m2, m5, [r5 + 2 * mmsize] - paddd 
m1, m2 - pmaddwd m2, m5, [r5 + 1 * mmsize] - paddd m6, m2 - pmaddwd m5, [r5] - movq xm2, [r0 + r1] - punpcklwd xm0, xm2 - movq xm3, [r0 + 2 * r1] - punpcklwd xm2, xm3 - vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] - pmaddwd m2, m0, [r5 + 3 * mmsize] - paddd m1, m2 - pmaddwd m2, m0, [r5 + 2 * mmsize] - paddd m6, m2 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m0, [r5] - -%ifidn %1,ss - psrad m4, 6 - psrad m1, 6 -%else - paddd m4, m7 - paddd m1, m7 -%ifidn %1,pp - psrad m4, INTERP_SHIFT_PP - psrad m1, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m4, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP -%else - psrad m4, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS -%endif -%endif + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 - packssdw m4, m1 - pxor m2, m2 -%ifidn %1,pp - CLIPW m4, m2, [pw_pixel_max] -%elifidn %1, sp - CLIPW m4, m2, [pw_pixel_max] -%endif + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP - vextracti128 xm1, m4, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 + packssdw m0, m1 + packssdw m2, m3 - movq xm4, [r0 + r4] - punpcklwd xm3, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] - punpcklwd xm4, xm1 - vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] - pmaddwd m4, m3, [r5 + 3 * mmsize] - paddd m6, m4 - pmaddwd m4, m3, [r5 + 2 * mmsize] - paddd m5, m4 - pmaddwd m4, m3, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m3, [r5] - movq xm4, [r0 + r1] - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] - pmaddwd m4, m1, [r5 + 3 * mmsize] - paddd m5, m4 - pmaddwd m4, m1, [r5 + 2 * mmsize] - paddd m0, m4 - pmaddwd m1, [r5 + 1 * mmsize] - paddd m3, m1 + pxor m1, m1 + CLIPW2 m0, m2, m1, [pw_pixel_max] -%ifidn %1,ss - psrad m6, 6 - psrad m5, 6 -%else - paddd m6, m7 - paddd m5, m7 -%ifidn %1,pp - psrad m6, INTERP_SHIFT_PP - psrad m5, INTERP_SHIFT_PP -%elifidn 
%1, sp - psrad m6, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP -%else - psrad m6, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS -%endif -%endif + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 - packssdw m6, m5 - pxor m1, m1 -%ifidn %1,pp - CLIPW m6, m1, [pw_pixel_max] -%elifidn %1, sp - CLIPW m6, m1, [pw_pixel_max] -%endif + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 - vextracti128 xm5, m6, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm6 - movq [r2 + r3], xm5 - movhps [r2 + r3 * 2], xm6 - movhps [r2 + r6], xm5 + dec r4d + jnz .loopW - movq xm4, [r0 + r4] - punpcklwd xm2, xm4 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] - punpcklwd xm4, xm6 - vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] - pmaddwd m4, m2, [r5 + 3 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5 + 2 * mmsize] - paddd m3, m2 - movq xm4, [r0 + r1] - punpcklwd xm6, xm4 - movq xm2, [r0 + 2 * r1] - punpcklwd xm4, xm2 - vinserti128 m6, m6, xm4, 1 ; m6 = [22 21 21 20] - pmaddwd m6, [r5 + 3 * mmsize] - paddd m3, m6 + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] -%ifidn %1,ss - psrad m0, 6 - psrad m3, 6 -%else - paddd m0, m7 - paddd m3, m7 -%ifidn %1,pp - psrad m0, INTERP_SHIFT_PP - psrad m3, INTERP_SHIFT_PP -%elifidn %1, sp - psrad m0, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP -%else - psrad m0, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS -%endif -%endif + dec dword [rsp] + jnz .loopH + RET +%endmacro - packssdw m0, m3 -%ifidn %1,pp - CLIPW m0, m1, [pw_pixel_max] -%elifidn %1, sp - CLIPW m0, m1, [pw_pixel_max] -%endif +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + 
FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 - vextracti128 xm3, m0, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm0 - movq [r2 + r3], xm3 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm3 -%endmacro +;----------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SS 2 +INIT_XMM sse2 +cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize -%macro FILTER_VER_LUMA_AVX2_4x16 1 -INIT_YMM avx2 -cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d + add r1d, r1d + add r3d, r3d + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 %ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] %else - lea r5, [tab_LumaCoeffVer + r4] + lea r6, [tab_LumaCoeffV + r4] %endif - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - vbroadcasti128 m7, [pd_32] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - PROCESS_LUMA_AVX2_W4_16R %1 - RET -%endmacro + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_VER_W4_4R -FILTER_VER_LUMA_AVX2_4x16 pp 
-FILTER_VER_LUMA_AVX2_4x16 ps -FILTER_VER_LUMA_AVX2_4x16 sp -FILTER_VER_LUMA_AVX2_4x16 ss + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + movlps [r2], m0 + movhps [r2 + r3], m0 -%macro FILTER_VER_LUMA_AVX2_12x16 1 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 - mov r4d, r4m - shl r4d, 7 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r4 -%ifidn %1,pp - vbroadcasti128 m14, [pd_32] -%elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m14, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - PROCESS_LUMA_AVX2_W8_16R %1 - add r2, 16 - add r0, 16 - mova m7, m14 - PROCESS_LUMA_AVX2_W4_16R %1 - RET -%endif -%endmacro - -FILTER_VER_LUMA_AVX2_12x16 pp -FILTER_VER_LUMA_AVX2_12x16 ps -FILTER_VER_LUMA_AVX2_12x16 sp -FILTER_VER_LUMA_AVX2_12x16 ss - -;--------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_PS 2 -INIT_XMM sse4 -cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize - - add r1d, r1d - add r3d, r3d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mova m7, [INTERP_OFFSET_PS] - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_VER_W4_4R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - - packssdw m0, m1 - packssdw m2, m3 - - movh [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movh [r5], m2 - movhps [r5 + r3], m2 - - lea 
r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] - - dec dword [rsp] - jnz .loopH - RET -%endmacro - -;--------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_PS 4, 4 - FILTER_VER_LUMA_PS 8, 8 - FILTER_VER_LUMA_PS 8, 4 - FILTER_VER_LUMA_PS 4, 8 - FILTER_VER_LUMA_PS 16, 16 - FILTER_VER_LUMA_PS 16, 8 - FILTER_VER_LUMA_PS 8, 16 - FILTER_VER_LUMA_PS 16, 12 - FILTER_VER_LUMA_PS 12, 16 - FILTER_VER_LUMA_PS 16, 4 - FILTER_VER_LUMA_PS 4, 16 - FILTER_VER_LUMA_PS 32, 32 - FILTER_VER_LUMA_PS 32, 16 - FILTER_VER_LUMA_PS 16, 32 - FILTER_VER_LUMA_PS 32, 24 - FILTER_VER_LUMA_PS 24, 32 - FILTER_VER_LUMA_PS 32, 8 - FILTER_VER_LUMA_PS 8, 32 - FILTER_VER_LUMA_PS 64, 64 - FILTER_VER_LUMA_PS 64, 32 - FILTER_VER_LUMA_PS 32, 64 - FILTER_VER_LUMA_PS 64, 48 - FILTER_VER_LUMA_PS 48, 64 - FILTER_VER_LUMA_PS 64, 16 - FILTER_VER_LUMA_PS 16, 64 - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SP 2 -INIT_XMM sse4 -cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize - - add r1d, r1d - add r3d, r3d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mova m7, [INTERP_OFFSET_SP] - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_VER_W4_4R - - paddd m0, m7 - paddd 
m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - - packssdw m0, m1 - packssdw m2, m3 - - pxor m1, m1 - CLIPW2 m0, m2, m1, [pw_pixel_max] - - movh [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movh [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] - - dec dword [rsp] - jnz .loopH - RET -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_SP 4, 4 - FILTER_VER_LUMA_SP 8, 8 - FILTER_VER_LUMA_SP 8, 4 - FILTER_VER_LUMA_SP 4, 8 - FILTER_VER_LUMA_SP 16, 16 - FILTER_VER_LUMA_SP 16, 8 - FILTER_VER_LUMA_SP 8, 16 - FILTER_VER_LUMA_SP 16, 12 - FILTER_VER_LUMA_SP 12, 16 - FILTER_VER_LUMA_SP 16, 4 - FILTER_VER_LUMA_SP 4, 16 - FILTER_VER_LUMA_SP 32, 32 - FILTER_VER_LUMA_SP 32, 16 - FILTER_VER_LUMA_SP 16, 32 - FILTER_VER_LUMA_SP 32, 24 - FILTER_VER_LUMA_SP 24, 32 - FILTER_VER_LUMA_SP 32, 8 - FILTER_VER_LUMA_SP 8, 32 - FILTER_VER_LUMA_SP 64, 64 - FILTER_VER_LUMA_SP 64, 32 - FILTER_VER_LUMA_SP 32, 64 - FILTER_VER_LUMA_SP 64, 48 - FILTER_VER_LUMA_SP 48, 64 - FILTER_VER_LUMA_SP 64, 16 - FILTER_VER_LUMA_SP 16, 64 - -;----------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SS 2 -INIT_XMM sse2 -cglobal 
interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize - - add r1d, r1d - add r3d, r3d - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffV + r4] -%endif - - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_VER_W4_4R - - psrad m0, 6 - psrad m1, 6 - packssdw m0, m1 - movlps [r2], m0 - movhps [r2 + r3], m0 - - psrad m2, 6 - psrad m3, 6 - packssdw m2, m3 - movlps [r2 + 2 * r3], m2 - lea r5, [3 * r3] - movhps [r2 + r5], m2 + psrad m2, 6 + psrad m3, 6 + packssdw m2, m3 + movlps [r2 + 2 * r3], m2 + lea r5, [3 * r3] + movhps [r2 + r5], m2 lea r5, [8 * r1 - 2 * 4] sub r0, r5 @@ -6512,1570 +4611,3 @@ cglobal filterPixelToShort_48x64, 3, 7, 5 jnz .loop RET -;----------------------------------------------------------------------------- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) -;----------------------------------------------------------------------------- -INIT_YMM avx2 -cglobal filterPixelToShort_48x64, 3, 7, 4 - add r1d, r1d - mov r3d, r3m - add r3d, r3d - lea r4, [r3 * 3] - lea r5, [r1 * 3] - - ; load height - mov r6d, 16 - - ; load constant - mova m3, [pw_2000] - -.loop: - movu m0, [r0] - movu m1, [r0 + 32] - movu m2, [r0 + 64] - psllw m0, (14 - BIT_DEPTH) - psllw m1, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 * 0], m0 - movu [r2 + r3 * 0 + 32], m1 - movu [r2 + r3 * 0 + 64], m2 - - movu m0, [r0 + r1] - movu m1, [r0 + r1 + 32] - movu m2, [r0 + r1 + 64] - psllw m0, (14 - BIT_DEPTH) - psllw m1, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r3 * 1], m0 - movu [r2 + r3 * 1 + 32], m1 - movu [r2 + r3 * 1 + 64], m2 - - movu m0, [r0 + r1 * 2] - movu m1, [r0 + r1 * 2 + 32] - movu m2, [r0 + r1 * 2 + 64] - psllw m0, (14 - BIT_DEPTH) - psllw m1, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m3 - psubw 
m1, m3 - psubw m2, m3 - movu [r2 + r3 * 2], m0 - movu [r2 + r3 * 2 + 32], m1 - movu [r2 + r3 * 2 + 64], m2 - - movu m0, [r0 + r5] - movu m1, [r0 + r5 + 32] - movu m2, [r0 + r5 + 64] - psllw m0, (14 - BIT_DEPTH) - psllw m1, (14 - BIT_DEPTH) - psllw m2, (14 - BIT_DEPTH) - psubw m0, m3 - psubw m1, m3 - psubw m2, m3 - movu [r2 + r4], m0 - movu [r2 + r4 + 32], m1 - movu [r2 + r4 + 64], m2 - - lea r0, [r0 + r1 * 4] - lea r2, [r2 + r3 * 4] - - dec r6d - jnz .loop - RET - - -%macro FILTER_VER_CHROMA_AVX2_8xN 2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x%2, 4, 9, 15 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - vbroadcasti128 m14, [pd_32] -%elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m14, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - lea r7, [r1 * 4] - mov r8d, %2 / 16 -.loopH: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - 
pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] - - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - - - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - - - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] - - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] - -%ifidn %1,ss - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - psrad m4, 6 - psrad m5, 6 -%else - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 -%ifidn %1,pp - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - psrad m4, 6 - psrad m5, 6 -%elifidn %1, sp - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP - psrad m2, INTERP_SHIFT_SP - psrad m3, INTERP_SHIFT_SP - psrad m4, INTERP_SHIFT_SP - psrad m5, INTERP_SHIFT_SP -%else - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS - psrad m2, INTERP_SHIFT_PS - psrad m3, INTERP_SHIFT_PS - psrad m4, INTERP_SHIFT_PS - psrad m5, INTERP_SHIFT_PS -%endif -%endif - - packssdw m0, m1 - packssdw m2, m3 - packssdw m4, m5 - vpermq 
m0, m0, q3120 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - pxor m5, m5 - mova m3, [pw_pixel_max] -%ifidn %1,pp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 -%elifidn %1, sp - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 -%endif - - vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm1 - lea r2, [r2 + r3 * 4] - vextracti128 xm1, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm1 - - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m0, m12, [r5 + 1 * mmsize] - paddd m10, m0 - pmaddwd m12, [r5] - - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m1, m13, [r5 + 1 * mmsize] - paddd m11, m1 - pmaddwd m13, [r5] - -%ifidn %1,ss - psrad m6, 6 - psrad m7, 6 -%else - paddd m6, m14 - paddd m7, m14 -%ifidn %1,pp - psrad m6, 6 - psrad m7, 6 -%elifidn %1, sp - psrad m6, INTERP_SHIFT_SP - psrad m7, INTERP_SHIFT_SP -%else - psrad m6, INTERP_SHIFT_PS - psrad m7, INTERP_SHIFT_PS -%endif -%endif - - packssdw m6, m7 - vpermq m6, m6, q3120 -%ifidn %1,pp - CLIPW m6, m5, m3 -%elifidn %1, sp - CLIPW m6, m5, m3 -%endif - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m12, m2 - pmaddwd m0, [r5] - - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhwd xm6, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm6, 1 - pmaddwd m6, m1, [r5 + 1 * mmsize] - paddd m13, m6 - pmaddwd m1, [r5] - - movu xm6, [r0 + r1] ; m6 = row 17 - punpckhwd xm4, xm2, xm6 - punpcklwd xm2, xm6 - vinserti128 m2, m2, xm4, 1 - pmaddwd m2, [r5 + 1 * mmsize] - paddd m0, m2 - - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhwd xm2, xm6, xm4 - punpcklwd xm6, xm4 - vinserti128 
m6, m6, xm2, 1 - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 - -%ifidn %1,ss - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 - psrad m0, 6 - psrad m1, 6 -%else - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 - paddd m12, m14 - paddd m13, m14 - paddd m0, m14 - paddd m1, m14 -%ifidn %1,pp - psrad m8, 6 - psrad m9, 6 - psrad m10, 6 - psrad m11, 6 - psrad m12, 6 - psrad m13, 6 - psrad m0, 6 - psrad m1, 6 -%elifidn %1, sp - psrad m8, INTERP_SHIFT_SP - psrad m9, INTERP_SHIFT_SP - psrad m10, INTERP_SHIFT_SP - psrad m11, INTERP_SHIFT_SP - psrad m12, INTERP_SHIFT_SP - psrad m13, INTERP_SHIFT_SP - psrad m0, INTERP_SHIFT_SP - psrad m1, INTERP_SHIFT_SP -%else - psrad m8, INTERP_SHIFT_PS - psrad m9, INTERP_SHIFT_PS - psrad m10, INTERP_SHIFT_PS - psrad m11, INTERP_SHIFT_PS - psrad m12, INTERP_SHIFT_PS - psrad m13, INTERP_SHIFT_PS - psrad m0, INTERP_SHIFT_PS - psrad m1, INTERP_SHIFT_PS -%endif -%endif - - packssdw m8, m9 - packssdw m10, m11 - packssdw m12, m13 - packssdw m0, m1 - vpermq m8, m8, q3120 - vpermq m10, m10, q3120 - vpermq m12, m12, q3120 - vpermq m0, m0, q3120 -%ifidn %1,pp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%elifidn %1, sp - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 - CLIPW m12, m5, m3 - CLIPW m0, m5, m3 -%endif - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - lea r2, [r2 + r3 * 4] - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 - lea r2, [r2 + r3 * 4] - dec r8d - jnz .loopH - RET -%endif -%endmacro - -FILTER_VER_CHROMA_AVX2_8xN pp, 16 -FILTER_VER_CHROMA_AVX2_8xN ps, 16 -FILTER_VER_CHROMA_AVX2_8xN ss, 16 -FILTER_VER_CHROMA_AVX2_8xN sp, 16 -FILTER_VER_CHROMA_AVX2_8xN pp, 32 -FILTER_VER_CHROMA_AVX2_8xN ps, 32 -FILTER_VER_CHROMA_AVX2_8xN sp, 32 
-FILTER_VER_CHROMA_AVX2_8xN ss, 32 -FILTER_VER_CHROMA_AVX2_8xN pp, 64 -FILTER_VER_CHROMA_AVX2_8xN ps, 64 -FILTER_VER_CHROMA_AVX2_8xN sp, 64 -FILTER_VER_CHROMA_AVX2_8xN ss, 64 - -%macro PROCESS_CHROMA_AVX2_8x2 3 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m2, m2, [r5 + 1 * mmsize] - paddd m0, m2 - - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m3, m3, [r5 + 1 * mmsize] - paddd m1, m3 - -%ifnidn %1,ss - paddd m0, m7 - paddd m1, m7 -%endif - psrad m0, %3 - psrad m1, %3 - - packssdw m0, m1 - vpermq m0, m0, q3120 - pxor m4, m4 - -%if %2 - CLIPW m0, m4, [pw_pixel_max] -%endif - vextracti128 xm1, m0, 1 -%endmacro - - -%macro FILTER_VER_CHROMA_AVX2_8x2 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x2, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - vbroadcasti128 m7, [pd_32] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - - PROCESS_CHROMA_AVX2_8x2 %1, %2, %3 - movu [r2], xm0 - movu [r2 + r3], xm1 - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_8x2 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x2 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x2 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_8x2 ss, 0, 6 - -%macro FILTER_VER_CHROMA_AVX2_4x2 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x2, 4, 6, 7 - mov r4d, r4m - add r1d, r1d - add r3d, r3d - shl r4d, 6 - -%ifdef PIC - lea r5, 
[tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - -%ifidn %1,pp - vbroadcasti128 m6, [pd_32] -%elifidn %1, sp - vbroadcasti128 m6, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m6, [INTERP_OFFSET_PS] -%endif - - movq xm0, [r0] ; row 0 - movq xm1, [r0 + r1] ; row 1 - punpcklwd xm0, xm1 - - movq xm2, [r0 + r1 * 2] ; row 2 - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - - movq xm3, [r0 + r4] ; row 3 - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] ; row 4 - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - paddd m0, m5 - -%ifnidn %1, ss - paddd m0, m6 -%endif - psrad m0, %3 - packssdw m0, m0 - pxor m1, m1 - -%if %2 - CLIPW m0, m1, [pw_pixel_max] -%endif - - vextracti128 xm2, m0, 1 - lea r4, [r3 * 3] - movq [r2], xm0 - movq [r2 + r3], xm2 - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_4x2 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_4x2 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_4x2 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_4x2 ss, 0, 6 - -%macro FILTER_VER_CHROMA_AVX2_4x4 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 - mov r4d, r4m - add r1d, r1d - add r3d, r3d - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - -%ifidn %1,pp - vbroadcasti128 m6, [pd_32] -%elifidn %1, sp - vbroadcasti128 m6, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m6, [INTERP_OFFSET_PS] -%endif - movq xm0, [r0] ; row 0 - movq xm1, [r0 + r1] ; row 1 - punpcklwd xm0, xm1 - - movq xm2, [r0 + r1 * 2] ; row 2 - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - - movq xm3, [r0 + r4] ; row 3 - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] ; row 4 - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd 
m0, m5 - - movq xm3, [r0 + r1] ; row 5 - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] ; row 6 - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - -%ifnidn %1,ss - paddd m0, m6 - paddd m2, m6 -%endif - psrad m0, %3 - psrad m2, %3 - - packssdw m0, m2 - pxor m1, m1 -%if %2 - CLIPW m0, m1, [pw_pixel_max] -%endif - - vextracti128 xm2, m0, 1 - lea r4, [r3 * 3] - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r4], xm2 - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_4x4 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_4x4 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6 - - -%macro FILTER_VER_CHROMA_AVX2_4x8 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x8, 4, 7, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - -%ifidn %1,pp - vbroadcasti128 m7, [pd_32] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - - movq xm0, [r0] ; row 0 - movq xm1, [r0 + r1] ; row 1 - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] ; row 2 - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - - movq xm3, [r0 + r4] ; row 3 - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] ; row 4 - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - - movq xm3, [r0 + r1] ; row 5 - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] ; row 6 - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - - movq xm3, [r0 + r4] ; row 7 - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] ; row 8 - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 
= [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - - movq xm3, [r0 + r1] ; row 9 - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] ; row 10 - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] - pmaddwd m6, [r5 + 1 * mmsize] - paddd m1, m6 -%ifnidn %1,ss - paddd m0, m7 - paddd m2, m7 -%endif - psrad m0, %3 - psrad m2, %3 - packssdw m0, m2 - pxor m6, m6 - mova m3, [pw_pixel_max] -%if %2 - CLIPW m0, m6, m3 -%endif - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 -%ifnidn %1,ss - paddd m4, m7 - paddd m1, m7 -%endif - psrad m4, %3 - psrad m1, %3 - packssdw m4, m1 -%if %2 - CLIPW m4, m6, m3 -%endif - vextracti128 xm1, m4, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_4x8 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_4x8 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_4x8 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_4x8 ss, 0 , 6 - -%macro PROCESS_LUMA_AVX2_W4_16R_4TAP 3 - movq xm0, [r0] ; row 0 - movq xm1, [r0 + r1] ; row 1 - punpcklwd xm0, xm1 - movq xm2, [r0 + r1 * 2] ; row 2 - punpcklwd xm1, xm2 - vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] - pmaddwd m0, [r5] - movq xm3, [r0 + r4] ; row 3 - punpcklwd xm2, xm3 - lea r0, [r0 + 4 * r1] - movq xm4, [r0] ; row 4 - punpcklwd xm3, xm4 - vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] - pmaddwd m5, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m5 - movq xm3, [r0 + r1] ; row 5 - punpcklwd xm4, xm3 - movq xm1, [r0 + r1 * 2] ; row 6 - punpcklwd xm3, xm1 - vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] - pmaddwd m5, m4, [r5 + 1 * mmsize] - paddd m2, m5 - pmaddwd m4, [r5] - movq xm3, [r0 + r4] ; row 7 - punpcklwd xm1, xm3 - lea r0, [r0 + 4 * r1] - movq xm6, [r0] ; row 8 - punpcklwd xm3, xm6 - vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] - pmaddwd m5, m1, [r5 + 1 * mmsize] - paddd m4, m5 - pmaddwd m1, [r5] - movq xm3, [r0 
+ r1] ; row 9 - punpcklwd xm6, xm3 - movq xm5, [r0 + 2 * r1] ; row 10 - punpcklwd xm3, xm5 - vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] - pmaddwd m3, m6, [r5 + 1 * mmsize] - paddd m1, m3 - pmaddwd m6, [r5] -%ifnidn %1,ss - paddd m0, m7 - paddd m2, m7 -%endif - psrad m0, %3 - psrad m2, %3 - packssdw m0, m2 - pxor m3, m3 -%if %2 - CLIPW m0, m3, [pw_pixel_max] -%endif - vextracti128 xm2, m0, 1 - movq [r2], xm0 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm2 - movq xm2, [r0 + r4] ;row 11 - punpcklwd xm5, xm2 - lea r0, [r0 + 4 * r1] - movq xm0, [r0] ; row 12 - punpcklwd xm2, xm0 - vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] - pmaddwd m2, m5, [r5 + 1 * mmsize] - paddd m6, m2 - pmaddwd m5, [r5] - movq xm2, [r0 + r1] ; row 13 - punpcklwd xm0, xm2 - movq xm3, [r0 + 2 * r1] ; row 14 - punpcklwd xm2, xm3 - vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] - pmaddwd m2, m0, [r5 + 1 * mmsize] - paddd m5, m2 - pmaddwd m0, [r5] -%ifnidn %1,ss - paddd m4, m7 - paddd m1, m7 -%endif - psrad m4, %3 - psrad m1, %3 - packssdw m4, m1 - pxor m2, m2 -%if %2 - CLIPW m4, m2, [pw_pixel_max] -%endif - - vextracti128 xm1, m4, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - movq [r2 + r3], xm1 - movhps [r2 + r3 * 2], xm4 - movhps [r2 + r6], xm1 - movq xm4, [r0 + r4] ; row 15 - punpcklwd xm3, xm4 - lea r0, [r0 + 4 * r1] - movq xm1, [r0] ; row 16 - punpcklwd xm4, xm1 - vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] - pmaddwd m4, m3, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m3, [r5] - movq xm4, [r0 + r1] ; row 17 - punpcklwd xm1, xm4 - movq xm2, [r0 + 2 * r1] ; row 18 - punpcklwd xm4, xm2 - vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] - pmaddwd m1, [r5 + 1 * mmsize] - paddd m3, m1 - -%ifnidn %1,ss - paddd m6, m7 - paddd m5, m7 -%endif - psrad m6, %3 - psrad m5, %3 - packssdw m6, m5 - pxor m1, m1 -%if %2 - CLIPW m6, m1, [pw_pixel_max] -%endif - vextracti128 xm5, m6, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm6 - movq [r2 + r3], xm5 - movhps [r2 + r3 * 2], xm6 - movhps 
[r2 + r6], xm5 -%ifnidn %1,ss - paddd m0, m7 - paddd m3, m7 -%endif - psrad m0, %3 - psrad m3, %3 - packssdw m0, m3 -%if %2 - CLIPW m0, m1, [pw_pixel_max] -%endif - vextracti128 xm3, m0, 1 - lea r2, [r2 + r3 * 4] - movq [r2], xm0 - movq [r2 + r3], xm3 - movhps [r2 + r3 * 2], xm0 - movhps [r2 + r6], xm3 -%endmacro - - -%macro FILTER_VER_CHROMA_AVX2_4xN 4 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_4x%2, 4, 8, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - mov r7d, %2 / 16 -%ifidn %1,pp - vbroadcasti128 m7, [pd_32] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] -.loopH: - PROCESS_LUMA_AVX2_W4_16R_4TAP %1, %3, %4 - lea r2, [r2 + r3 * 4] - dec r7d - jnz .loopH - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6 -FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_4xN ss, 16, 0, 6 -FILTER_VER_CHROMA_AVX2_4xN pp, 32, 1, 6 -FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6 - -%macro FILTER_VER_CHROMA_AVX2_8x8 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x8, 4, 6, 12 - mov r4d, r4m - add r1d, r1d - add r3d, r3d - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - -%ifidn %1,pp - vbroadcasti128 m11, [pd_32] -%elifidn %1, sp - vbroadcasti128 m11, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m11, [INTERP_OFFSET_PS] -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - 
punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 ; res row0 done(0,1,2,3) - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - pmaddwd m3, [r5] - paddd m1, m5 ;res row1 done(1, 2, 3, 4) - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - pmaddwd m4, [r5] - paddd m2, m6 ;res row2 done(2,3,4,5) - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m7 ;res row3 done(3,4,5,6) - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - pmaddwd m6, [r5] - paddd m4, m8 ;res row4 done(4,5,6,7) - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 1 * mmsize] - pmaddwd m7, [r5] - paddd m5, m9 ;res row5 done(5,6,7,8) - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m8, [r5 + 1 * mmsize] - paddd m6, m8 ;res row6 done(6,7,8,9) - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm8, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm8, 1 - pmaddwd m9, [r5 + 1 * mmsize] - paddd m7, m9 ;res row7 done 7,8,9,10 - lea r4, [r3 * 3] -%ifnidn %1,ss - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 -%endif - psrad m0, %3 - psrad m1, %3 - psrad m2, %3 - psrad m3, %3 - packssdw m0, m1 - packssdw m2, m3 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - pxor 
m1, m1 - mova m3, [pw_pixel_max] -%if %2 - CLIPW m0, m1, m3 - CLIPW m2, m1, m3 -%endif - vextracti128 xm9, m0, 1 - vextracti128 xm8, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm8 -%ifnidn %1,ss - paddd m4, m11 - paddd m5, m11 - paddd m6, m11 - paddd m7, m11 -%endif - psrad m4, %3 - psrad m5, %3 - psrad m6, %3 - psrad m7, %3 - packssdw m4, m5 - packssdw m6, m7 - vpermq m4, m4, q3120 - vpermq m6, m6, q3120 -%if %2 - CLIPW m4, m1, m3 - CLIPW m6, m1, m3 -%endif - vextracti128 xm5, m4, 1 - vextracti128 xm7, m6, 1 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 - movu [r2 + r3 * 2], xm6 - movu [r2 + r4], xm7 - RET -%endif -%endmacro - -FILTER_VER_CHROMA_AVX2_8x8 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x8 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x8 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_8x8 ss, 0, 6 - -%macro FILTER_VER_CHROMA_AVX2_8x6 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x6, 4, 6, 12 - mov r4d, r4m - add r1d, r1d - add r3d, r3d - shl r4d, 6 - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 - -%ifidn %1,pp - vbroadcasti128 m11, [pd_32] -%elifidn %1, sp - vbroadcasti128 m11, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m11, [INTERP_OFFSET_PS] -%endif - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - pmaddwd m2, [r5] - paddd m0, m4 ; r0 done(0,1,2,3) - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, 
[r5 + 1 * mmsize] - pmaddwd m3, [r5] - paddd m1, m5 ;r1 done(1, 2, 3, 4) - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - pmaddwd m4, [r5] - paddd m2, m6 ;r2 done(2,3,4,5) - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - pmaddwd m5, [r5] - paddd m3, m7 ;r3 done(3,4,5,6) - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 ;r4 done(4,5,6,7) - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m7, m7, [r5 + 1 * mmsize] - paddd m5, m7 ;r5 done(5,6,7,8) - lea r4, [r3 * 3] -%ifnidn %1,ss - paddd m0, m11 - paddd m1, m11 - paddd m2, m11 - paddd m3, m11 -%endif - psrad m0, %3 - psrad m1, %3 - psrad m2, %3 - psrad m3, %3 - packssdw m0, m1 - packssdw m2, m3 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - pxor m10, m10 - mova m9, [pw_pixel_max] -%if %2 - CLIPW m0, m10, m9 - CLIPW m2, m10, m9 -%endif - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r4], xm3 -%ifnidn %1,ss - paddd m4, m11 - paddd m5, m11 -%endif - psrad m4, %3 - psrad m5, %3 - packssdw m4, m5 - vpermq m4, m4, 11011000b -%if %2 - CLIPW m4, m10, m9 -%endif - vextracti128 xm5, m4, 1 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 - RET -%endif -%endmacro - -FILTER_VER_CHROMA_AVX2_8x6 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x6 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x6 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_8x6 ss, 0, 6 - -%macro PROCESS_CHROMA_AVX2 3 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu 
xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m4, [r5 + 1 * mmsize] - paddd m2, m4 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm4, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm4, 1 - pmaddwd m5, [r5 + 1 * mmsize] - paddd m3, m5 -%ifnidn %1,ss - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 -%endif - psrad m0, %3 - psrad m1, %3 - psrad m2, %3 - psrad m3, %3 - packssdw m0, m1 - packssdw m2, m3 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - pxor m4, m4 -%if %2 - CLIPW m0, m4, [pw_pixel_max] - CLIPW m2, m4, [pw_pixel_max] -%endif - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 -%endmacro - - -%macro FILTER_VER_CHROMA_AVX2_8x4 3 -INIT_YMM avx2 -cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - vbroadcasti128 m7, [pd_32] -%elifidn %1, sp - vbroadcasti128 m7, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m7, [INTERP_OFFSET_PS] -%endif - PROCESS_CHROMA_AVX2 %1, %2, %3 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - lea r4, [r3 * 3] - movu [r2 + r4], xm3 - RET -%endmacro - -FILTER_VER_CHROMA_AVX2_8x4 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x4 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x4 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_8x4 ss, 0, 6 - 
-%macro FILTER_VER_CHROMA_AVX2_8x12 3 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_%1_8x12, 4, 7, 15 - mov r4d, r4m - shl r4d, 6 - add r1d, r1d - add r3d, r3d - -%ifdef PIC - lea r5, [tab_ChromaCoeffVer] - add r5, r4 -%else - lea r5, [tab_ChromaCoeffVer + r4] -%endif - - lea r4, [r1 * 3] - sub r0, r1 -%ifidn %1,pp - vbroadcasti128 m14, [pd_32] -%elifidn %1, sp - vbroadcasti128 m14, [INTERP_OFFSET_SP] -%else - vbroadcasti128 m14, [INTERP_OFFSET_PS] -%endif - lea r6, [r3 * 3] - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhwd xm2, xm0, xm1 - punpcklwd xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddwd m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhwd xm3, xm1, xm2 - punpcklwd xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddwd m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhwd xm4, xm2, xm3 - punpcklwd xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddwd m4, m2, [r5 + 1 * mmsize] - paddd m0, m4 - pmaddwd m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhwd xm5, xm3, xm4 - punpcklwd xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddwd m5, m3, [r5 + 1 * mmsize] - paddd m1, m5 - pmaddwd m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhwd xm6, xm4, xm5 - punpcklwd xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddwd m6, m4, [r5 + 1 * mmsize] - paddd m2, m6 - pmaddwd m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhwd xm7, xm5, xm6 - punpcklwd xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddwd m7, m5, [r5 + 1 * mmsize] - paddd m3, m7 - pmaddwd m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhwd xm8, xm6, xm7 - punpcklwd xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddwd m8, m6, [r5 + 1 * mmsize] - paddd m4, m8 - pmaddwd m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhwd xm9, xm7, xm8 - punpcklwd xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddwd m9, m7, [r5 + 1 * mmsize] - paddd m5, m9 - pmaddwd m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhwd xm10, xm8, xm9 - punpcklwd xm8, 
xm9 - vinserti128 m8, m8, xm10, 1 - pmaddwd m10, m8, [r5 + 1 * mmsize] - paddd m6, m10 - pmaddwd m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhwd xm11, xm9, xm10 - punpcklwd xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddwd m11, m9, [r5 + 1 * mmsize] - paddd m7, m11 - pmaddwd m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhwd xm12, xm10, xm11 - punpcklwd xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddwd m12, m10, [r5 + 1 * mmsize] - paddd m8, m12 - pmaddwd m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhwd xm13, xm11, xm12 - punpcklwd xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddwd m13, m11, [r5 + 1 * mmsize] - paddd m9, m13 - pmaddwd m11, [r5] -%ifnidn %1,ss - paddd m0, m14 - paddd m1, m14 - paddd m2, m14 - paddd m3, m14 - paddd m4, m14 - paddd m5, m14 -%endif - psrad m0, %3 - psrad m1, %3 - psrad m2, %3 - psrad m3, %3 - psrad m4, %3 - psrad m5, %3 - packssdw m0, m1 - packssdw m2, m3 - packssdw m4, m5 - vpermq m0, m0, q3120 - vpermq m2, m2, q3120 - vpermq m4, m4, q3120 - pxor m5, m5 - mova m3, [pw_pixel_max] -%if %2 - CLIPW m0, m5, m3 - CLIPW m2, m5, m3 - CLIPW m4, m5, m3 -%endif - vextracti128 xm1, m0, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - vextracti128 xm1, m2, 1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm1 - lea r2, [r2 + r3 * 4] - vextracti128 xm1, m4, 1 - movu [r2], xm4 - movu [r2 + r3], xm1 - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhwd xm0, xm12, xm13 - punpcklwd xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddwd m12, m12, [r5 + 1 * mmsize] - paddd m10, m12 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhwd xm1, xm13, xm0 - punpcklwd xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddwd m13, m13, [r5 + 1 * mmsize] - paddd m11, m13 -%ifnidn %1,ss - paddd m6, m14 - paddd m7, m14 - paddd m8, m14 - paddd m9, m14 - paddd m10, m14 - paddd m11, m14 -%endif - psrad m6, %3 - psrad m7, %3 - psrad m8, %3 - psrad m9, %3 - psrad m10, %3 - psrad m11, %3 - packssdw m6, m7 - packssdw m8, m9 - packssdw m10, 
m11 - vpermq m6, m6, q3120 - vpermq m8, m8, q3120 - vpermq m10, m10, q3120 -%if %2 - CLIPW m6, m5, m3 - CLIPW m8, m5, m3 - CLIPW m10, m5, m3 -%endif - vextracti128 xm7, m6, 1 - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - lea r2, [r2 + r3 * 4] - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - RET -%endif -%endmacro - -FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6 -FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS -FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP -FILTER_VER_CHROMA_AVX2_8x12 ss, 0, 6 diff --git a/source/common/x86/v4-ipfilter16.asm b/source/common/x86/v4-ipfilter16.asm new file mode 100644 index 0000000000..8c51872bc3 --- /dev/null +++ b/source/common/x86/v4-ipfilter16.asm @@ -0,0 +1,3529 @@ +;***************************************************************************** +;* Copyright (C) 2013-2017 MulticoreWare, Inc +;* +;* Authors: Nabajit Deka +;* Murugan Vairavel +;* Min Chen +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;* +;* This program is also available under a commercial proprietary license. +;* For more information, contact us at license @ x265.com. 
+;*****************************************************************************/ + +%include "x86inc.asm" +%include "x86util.asm" + + +%define INTERP_OFFSET_PP pd_32 +%define INTERP_SHIFT_PP 6 + +%if BIT_DEPTH == 10 + %define INTERP_SHIFT_PS 2 + %define INTERP_OFFSET_PS pd_n32768 + %define INTERP_SHIFT_SP 10 + %define INTERP_OFFSET_SP v4_pd_524800 +%elif BIT_DEPTH == 12 + %define INTERP_SHIFT_PS 4 + %define INTERP_OFFSET_PS pd_n131072 + %define INTERP_SHIFT_SP 8 + %define INTERP_OFFSET_SP pd_524416 +%else + %error Unsupport bit depth! +%endif + + +SECTION_RODATA 32 + +v4_pd_524800: times 8 dd 524800 +tab_c_n8192: times 8 dw -8192 + +const tab_ChromaCoeffV, times 8 dw 0, 64 + times 8 dw 0, 0 + + times 8 dw -2, 58 + times 8 dw 10, -2 + + times 8 dw -4, 54 + times 8 dw 16, -2 + + times 8 dw -6, 46 + times 8 dw 28, -4 + + times 8 dw -4, 36 + times 8 dw 36, -4 + + times 8 dw -4, 28 + times 8 dw 46, -6 + + times 8 dw -2, 16 + times 8 dw 54, -4 + + times 8 dw -2, 10 + times 8 dw 58, -2 + +tab_ChromaCoeffVer: times 8 dw 0, 64 + times 8 dw 0, 0 + + times 8 dw -2, 58 + times 8 dw 10, -2 + + times 8 dw -4, 54 + times 8 dw 16, -2 + + times 8 dw -6, 46 + times 8 dw 28, -4 + + times 8 dw -4, 36 + times 8 dw 36, -4 + + times 8 dw -4, 28 + times 8 dw 46, -6 + + times 8 dw -2, 16 + times 8 dw 54, -4 + + times 8 dw -2, 10 + times 8 dw 58, -2 + +SECTION .text +cextern pd_8 +cextern pd_32 +cextern pw_pixel_max +cextern pd_524416 +cextern pd_n32768 +cextern pd_n131072 +cextern pw_2000 +cextern idct8_shuf2 + +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *32] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *32] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *32] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 32] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 
4] + pmaddwd m3, m5, [r6 + 0 *32] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 32] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 32] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 32] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 4 +INIT_XMM sse2 +cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 + +%ifnidn %3, ss + %ifnidn %3, ps + mova m7, [pw_pixel_max] + %ifidn %3, pp + mova m6, [INTERP_OFFSET_PP] + %else + mova m6, [INTERP_OFFSET_SP] + %endif + %else + mova m6, [INTERP_OFFSET_PS] + %endif +%endif + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + +%ifidn %3, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %3, ps + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + %ifidn %3, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP + %else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 
m0, m2, m5, m7 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS 4, 4, ss, 6 + FILTER_VER_CHROMA_SS 4, 8, ss, 6 + FILTER_VER_CHROMA_SS 16, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 8, ss, 6 + FILTER_VER_CHROMA_SS 16, 12, ss, 6 + FILTER_VER_CHROMA_SS 12, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 4, ss, 6 + FILTER_VER_CHROMA_SS 4, 16, ss, 6 + FILTER_VER_CHROMA_SS 32, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 16, ss, 6 + FILTER_VER_CHROMA_SS 16, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 24, ss, 6 + FILTER_VER_CHROMA_SS 24, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 8, ss, 6 + + FILTER_VER_CHROMA_SS 4, 4, ps, 7 + FILTER_VER_CHROMA_SS 4, 8, ps, 7 + FILTER_VER_CHROMA_SS 16, 16, ps, 7 + FILTER_VER_CHROMA_SS 16, 8, ps, 7 + FILTER_VER_CHROMA_SS 16, 12, ps, 7 + FILTER_VER_CHROMA_SS 12, 16, ps, 7 + FILTER_VER_CHROMA_SS 16, 4, ps, 7 + FILTER_VER_CHROMA_SS 4, 16, ps, 7 + FILTER_VER_CHROMA_SS 32, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 16, ps, 7 + FILTER_VER_CHROMA_SS 16, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 24, ps, 7 + FILTER_VER_CHROMA_SS 24, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 8, ps, 7 + + FILTER_VER_CHROMA_SS 4, 4, sp, 8 + FILTER_VER_CHROMA_SS 4, 8, sp, 8 + FILTER_VER_CHROMA_SS 16, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 8, sp, 8 + FILTER_VER_CHROMA_SS 16, 12, sp, 8 + FILTER_VER_CHROMA_SS 12, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 4, sp, 8 + FILTER_VER_CHROMA_SS 4, 16, sp, 8 + FILTER_VER_CHROMA_SS 32, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 16, sp, 8 + FILTER_VER_CHROMA_SS 16, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 24, sp, 8 + FILTER_VER_CHROMA_SS 24, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 8, sp, 8 + + FILTER_VER_CHROMA_SS 4, 4, pp, 8 + FILTER_VER_CHROMA_SS 4, 8, pp, 8 + FILTER_VER_CHROMA_SS 16, 16, pp, 8 + FILTER_VER_CHROMA_SS 
16, 8, pp, 8 + FILTER_VER_CHROMA_SS 16, 12, pp, 8 + FILTER_VER_CHROMA_SS 12, 16, pp, 8 + FILTER_VER_CHROMA_SS 16, 4, pp, 8 + FILTER_VER_CHROMA_SS 4, 16, pp, 8 + FILTER_VER_CHROMA_SS 32, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 16, pp, 8 + FILTER_VER_CHROMA_SS 16, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 24, pp, 8 + FILTER_VER_CHROMA_SS 24, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 8, pp, 8 + + + FILTER_VER_CHROMA_SS 16, 24, ss, 6 + FILTER_VER_CHROMA_SS 12, 32, ss, 6 + FILTER_VER_CHROMA_SS 4, 32, ss, 6 + FILTER_VER_CHROMA_SS 32, 64, ss, 6 + FILTER_VER_CHROMA_SS 16, 64, ss, 6 + FILTER_VER_CHROMA_SS 32, 48, ss, 6 + FILTER_VER_CHROMA_SS 24, 64, ss, 6 + + FILTER_VER_CHROMA_SS 16, 24, ps, 7 + FILTER_VER_CHROMA_SS 12, 32, ps, 7 + FILTER_VER_CHROMA_SS 4, 32, ps, 7 + FILTER_VER_CHROMA_SS 32, 64, ps, 7 + FILTER_VER_CHROMA_SS 16, 64, ps, 7 + FILTER_VER_CHROMA_SS 32, 48, ps, 7 + FILTER_VER_CHROMA_SS 24, 64, ps, 7 + + FILTER_VER_CHROMA_SS 16, 24, sp, 8 + FILTER_VER_CHROMA_SS 12, 32, sp, 8 + FILTER_VER_CHROMA_SS 4, 32, sp, 8 + FILTER_VER_CHROMA_SS 32, 64, sp, 8 + FILTER_VER_CHROMA_SS 16, 64, sp, 8 + FILTER_VER_CHROMA_SS 32, 48, sp, 8 + FILTER_VER_CHROMA_SS 24, 64, sp, 8 + + FILTER_VER_CHROMA_SS 16, 24, pp, 8 + FILTER_VER_CHROMA_SS 12, 32, pp, 8 + FILTER_VER_CHROMA_SS 4, 32, pp, 8 + FILTER_VER_CHROMA_SS 32, 64, pp, 8 + FILTER_VER_CHROMA_SS 16, 64, pp, 8 + FILTER_VER_CHROMA_SS 32, 48, pp, 8 + FILTER_VER_CHROMA_SS 24, 64, pp, 8 + + + FILTER_VER_CHROMA_SS 48, 64, ss, 6 + FILTER_VER_CHROMA_SS 64, 48, ss, 6 + FILTER_VER_CHROMA_SS 64, 64, ss, 6 + FILTER_VER_CHROMA_SS 64, 32, ss, 6 + FILTER_VER_CHROMA_SS 64, 16, ss, 6 + + FILTER_VER_CHROMA_SS 48, 64, ps, 7 + FILTER_VER_CHROMA_SS 64, 48, ps, 7 + FILTER_VER_CHROMA_SS 64, 64, ps, 7 + FILTER_VER_CHROMA_SS 64, 32, ps, 7 + FILTER_VER_CHROMA_SS 64, 16, ps, 7 + + FILTER_VER_CHROMA_SS 48, 64, sp, 8 + FILTER_VER_CHROMA_SS 64, 48, sp, 8 + FILTER_VER_CHROMA_SS 64, 64, sp, 8 + FILTER_VER_CHROMA_SS 64, 32, sp, 8 + FILTER_VER_CHROMA_SS 64, 16, sp, 8 + + 
FILTER_VER_CHROMA_SS 48, 64, pp, 8 + FILTER_VER_CHROMA_SS 64, 48, pp, 8 + FILTER_VER_CHROMA_SS 64, 64, pp, 8 + FILTER_VER_CHROMA_SS 64, 32, pp, 8 + FILTER_VER_CHROMA_SS 64, 16, pp, 8 + + +%macro PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *32] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 32] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 32] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 32] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W2 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, (%1/4) +%ifnidn %2, ss + %ifnidn %2, ps + pxor m7, m7 + mova m6, [pw_pixel_max] + %ifidn %2, pp + mova m5, [INTERP_OFFSET_PP] + %else + mova m5, [INTERP_OFFSET_SP] + %endif + %else + mova m5, [INTERP_OFFSET_PS] + %endif +%endif + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 +%ifidn %2, ss + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 +%elifidn %2, ps + paddd m0, m5 + paddd 
m2, m5 + psrad m0, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + packssdw m0, m2 +%else + paddd m0, m5 + paddd m2, m5 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + %else + psrad m0, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + %endif + packusdw m0, m2 + CLIPW m0, m7, m6 +%endif + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + RET +%endmacro + +FILTER_VER_CHROMA_W2 4, ss, 5 +FILTER_VER_CHROMA_W2 8, ss, 5 + +FILTER_VER_CHROMA_W2 4, pp, 8 +FILTER_VER_CHROMA_W2 8, pp, 8 + +FILTER_VER_CHROMA_W2 4, ps, 6 +FILTER_VER_CHROMA_W2 8, ps, 6 + +FILTER_VER_CHROMA_W2 4, sp, 8 +FILTER_VER_CHROMA_W2 8, sp, 8 + +FILTER_VER_CHROMA_W2 16, ss, 5 +FILTER_VER_CHROMA_W2 16, pp, 8 +FILTER_VER_CHROMA_W2 16, ps, 6 +FILTER_VER_CHROMA_W2 16, sp, 8 + + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W4 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + +%ifnidn %2, 2 + mov r4d, %1/2 +%endif + +%ifnidn %2, ss + %ifnidn %2, ps + pxor m6, m6 + mova m5, [pw_pixel_max] + %ifidn %2, pp + mova m4, [INTERP_OFFSET_PP] + %else + mova m4, [INTERP_OFFSET_SP] + %endif + %else + mova m4, [INTERP_OFFSET_PS] + %endif +%endif + +%ifnidn %2, 2 +.loop: +%endif + + movh m0, [r0] + movh m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *32] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movh m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *32] ;m1=[1+2] 
Row2 + + movh m3, [r0 + r1] + punpcklwd m2, m3 ;m2=[2 3] + pmaddwd m2, [r5 + 1 * 32] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + + movh m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m3=[3 4] + pmaddwd m3, [r5 + 1 * 32] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 +%elifidn %2, ps + paddd m0, m4 + paddd m1, m4 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + packssdw m0, m1 +%else + paddd m0, m4 + paddd m1, m4 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + %else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + %endif + packusdw m0, m1 + CLIPW m0, m6, m5 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + +%ifnidn %2, 2 + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop +%endif + RET +%endmacro + +FILTER_VER_CHROMA_W4 2, ss, 4 +FILTER_VER_CHROMA_W4 2, pp, 7 +FILTER_VER_CHROMA_W4 2, ps, 5 +FILTER_VER_CHROMA_W4 2, sp, 7 + +FILTER_VER_CHROMA_W4 4, ss, 4 +FILTER_VER_CHROMA_W4 4, pp, 7 +FILTER_VER_CHROMA_W4 4, ps, 5 +FILTER_VER_CHROMA_W4 4, sp, 7 + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%2_6x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W6 3 +INIT_XMM sse4 +cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %1/4 + +%ifnidn %2, ss + %ifnidn %2, ps + mova m7, [pw_pixel_max] + %ifidn %2, pp + mova m6, [INTERP_OFFSET_PP] + %else + mova m6, [INTERP_OFFSET_SP] + %endif + %else + mova m6, [INTERP_OFFSET_PS] + %endif +%endif + +.loopH: + PROCESS_CHROMA_SP_W4_4R + +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + +
packssdw m0, m1 + packssdw m2, m3 +%elifidn %2, ps + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP + %else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, m7 +%endif + + movh [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movh [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + PROCESS_CHROMA_SP_W2_4R r6 + +%ifidn %2, ss + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 +%elifidn %2, ps + paddd m0, m6 + paddd m2, m6 + psrad m0, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + packssdw m0, m2 +%else + paddd m0, m6 + paddd m2, m6 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + %else + psrad m0, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + %endif + packusdw m0, m2 + CLIPW m0, m5, m7 +%endif + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH + RET +%endmacro + +FILTER_VER_CHROMA_W6 8, ss, 6 +FILTER_VER_CHROMA_W6 8, ps, 7 +FILTER_VER_CHROMA_W6 8, sp, 8 +FILTER_VER_CHROMA_W6 8, pp, 8 + +FILTER_VER_CHROMA_W6 16, ss, 6 +FILTER_VER_CHROMA_W6 16, ps, 7 +FILTER_VER_CHROMA_W6 16, sp, 8 +FILTER_VER_CHROMA_W6 16, pp, 8 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 32] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 32] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + 
pmaddwd m2, [r5 + 0 * 32] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 32] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 32] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 32] ;m6 = [2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 32] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 32] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W8 4 +INIT_XMM sse2 +cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %2/2 + +%ifidn %3, pp + mova m7, [INTERP_OFFSET_PP] +%elifidn %3, sp + mova m7, [INTERP_OFFSET_SP] +%elifidn %3, ps + mova m7, [INTERP_OFFSET_PS] +%endif + +.loopH: + PROCESS_CHROMA_SP_W8_2R + +%ifidn %3, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %3, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + %ifidn %3, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP 
+ %else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + %endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + mova m6, [pw_pixel_max] + CLIPW2 m0, m2, m5, m6 +%endif + + movu [r2], m0 + movu [r2 + r3], m2 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + RET +%endmacro + +FILTER_VER_CHROMA_W8 8, 2, ss, 7 +FILTER_VER_CHROMA_W8 8, 4, ss, 7 +FILTER_VER_CHROMA_W8 8, 6, ss, 7 +FILTER_VER_CHROMA_W8 8, 8, ss, 7 +FILTER_VER_CHROMA_W8 8, 16, ss, 7 +FILTER_VER_CHROMA_W8 8, 32, ss, 7 + +FILTER_VER_CHROMA_W8 8, 2, sp, 8 +FILTER_VER_CHROMA_W8 8, 4, sp, 8 +FILTER_VER_CHROMA_W8 8, 6, sp, 8 +FILTER_VER_CHROMA_W8 8, 8, sp, 8 +FILTER_VER_CHROMA_W8 8, 16, sp, 8 +FILTER_VER_CHROMA_W8 8, 32, sp, 8 + +FILTER_VER_CHROMA_W8 8, 2, ps, 8 +FILTER_VER_CHROMA_W8 8, 4, ps, 8 +FILTER_VER_CHROMA_W8 8, 6, ps, 8 +FILTER_VER_CHROMA_W8 8, 8, ps, 8 +FILTER_VER_CHROMA_W8 8, 16, ps, 8 +FILTER_VER_CHROMA_W8 8, 32, ps, 8 + +FILTER_VER_CHROMA_W8 8, 2, pp, 8 +FILTER_VER_CHROMA_W8 8, 4, pp, 8 +FILTER_VER_CHROMA_W8 8, 6, pp, 8 +FILTER_VER_CHROMA_W8 8, 8, pp, 8 +FILTER_VER_CHROMA_W8 8, 16, pp, 8 +FILTER_VER_CHROMA_W8 8, 32, pp, 8 + +FILTER_VER_CHROMA_W8 8, 12, ss, 7 +FILTER_VER_CHROMA_W8 8, 64, ss, 7 +FILTER_VER_CHROMA_W8 8, 12, sp, 8 +FILTER_VER_CHROMA_W8 8, 64, sp, 8 +FILTER_VER_CHROMA_W8 8, 12, ps, 8 +FILTER_VER_CHROMA_W8 8, 64, ps, 8 +FILTER_VER_CHROMA_W8 8, 12, pp, 8 +FILTER_VER_CHROMA_W8 8, 64, pp, 8 + +%macro PROCESS_CHROMA_VERT_W16_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 32] + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 32] + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 32] + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 32] + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 32] + paddd m0, m6 + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 32] + paddd m1, m4 + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 
+ 1 * 32] + paddd m2, m6 + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 32] + paddd m3, m5 +%endmacro + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_AVX2_6xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 +cglobal interp_4tap_vert_%2_6x%1, 4, 7, 10 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + sub r0, r1 + mov r6d, %1/4 + +%ifidn %2,pp + vbroadcasti128 m8, [INTERP_OFFSET_PP] +%elifidn %2, sp + vbroadcasti128 m8, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m8, [INTERP_OFFSET_PS] +%endif + +.loopH: + movu xm0, [r0] + movu xm1, [r0 + r1] + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + + movu xm2, [r0 + r1 * 2] + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + + lea r4, [r1 * 3] + movu xm3, [r0 + r4] + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + + lea r0, [r0 + r1 * 4] + movu xm4, [r0] + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 + + movu xm5, [r0 + r1] + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + pmaddwd m4, [r5] + paddd m2, m6 + + movu xm6, [r0 + r1 * 2] + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 + lea r4, [r3 * 3] +%ifidn %2,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%else 
+ paddd m0, m8 + paddd m1, m8 + paddd m2, m8 + paddd m3, m8 +%ifidn %2,pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP +%elifidn %2, sp + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP +%else + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + pxor m5, m5 + mova m9, [pw_pixel_max] +%ifidn %2,pp + CLIPW m0, m5, m9 + CLIPW m2, m5, m9 +%elifidn %2, sp + CLIPW m0, m5, m9 + CLIPW m2, m5, m9 +%endif + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movq [r2], xm0 + pextrd [r2 + 8], xm0, 2 + movq [r2 + r3], xm1 + pextrd [r2 + r3 + 8], xm1, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r4], xm3 + pextrd [r2 + r4 + 8], xm3, 2 + + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopH + RET +%endif +%endmacro +FILTER_VER_CHROMA_AVX2_6xN 8, pp +FILTER_VER_CHROMA_AVX2_6xN 8, ps +FILTER_VER_CHROMA_AVX2_6xN 8, ss +FILTER_VER_CHROMA_AVX2_6xN 8, sp +FILTER_VER_CHROMA_AVX2_6xN 16, pp +FILTER_VER_CHROMA_AVX2_6xN 16, ps +FILTER_VER_CHROMA_AVX2_6xN 16, ss +FILTER_VER_CHROMA_AVX2_6xN 16, sp + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_16xN_avx2 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%2_16x%1, 5, 6, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mov r4d, %1/2 + +%ifidn %2, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn 
%2, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %2, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: + PROCESS_CHROMA_VERT_W16_2R +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %2, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] +%endif + + movu [r2], m0 + movu [r2 + r3], m2 + lea r2, [r2 + 2 * r3] + dec r4d + jnz .loopH + RET +%endmacro + FILTER_VER_CHROMA_W16_16xN_avx2 4, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 8, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 12, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 24, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 16, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 32, pp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 64, pp, 8 + + FILTER_VER_CHROMA_W16_16xN_avx2 4, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 8, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 12, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 24, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 16, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 32, ps, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 64, ps, 8 + + FILTER_VER_CHROMA_W16_16xN_avx2 4, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 8, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 12, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 24, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 16, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 32, ss, 7 + FILTER_VER_CHROMA_W16_16xN_avx2 64, ss, 7 + + FILTER_VER_CHROMA_W16_16xN_avx2 4, sp, 8 + 
FILTER_VER_CHROMA_W16_16xN_avx2 8, sp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 12, sp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 24, sp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 16, sp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 32, sp, 8 + FILTER_VER_CHROMA_W16_16xN_avx2 64, sp, 8 + +%macro PROCESS_CHROMA_VERT_W32_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * mmsize] + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * mmsize] + + movu m9, [r0 + mmsize] + movu m11, [r0 + r1 + mmsize] + punpcklwd m8, m9, m11 + pmaddwd m8, [r5 + 0 * mmsize] + punpckhwd m9, m11 + pmaddwd m9, [r5 + 0 * mmsize] + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * mmsize] + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * mmsize] + + movu m12, [r0 + 2 * r1 + mmsize] + punpcklwd m10, m11, m12 + pmaddwd m10, [r5 + 0 * mmsize] + punpckhwd m11, m12 + pmaddwd m11, [r5 + 0 * mmsize] + + lea r6, [r0 + 2 * r1] + movu m5, [r6 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m0, m6 + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m1, m4 + + movu m13, [r6 + r1 + mmsize] + punpcklwd m14, m12, m13 + pmaddwd m14, [r5 + 1 * mmsize] + paddd m8, m14 + punpckhwd m12, m13 + pmaddwd m12, [r5 + 1 * mmsize] + paddd m9, m12 + + movu m4, [r6 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m2, m6 + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + + movu m12, [r6 + 2 * r1 + mmsize] + punpcklwd m14, m13, m12 + pmaddwd m14, [r5 + 1 * mmsize] + paddd m10, m14 + punpckhwd m13, m12 + pmaddwd m13, [r5 + 1 * mmsize] + paddd m11, m13 +%endmacro + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_32xN_avx2 3 
+INIT_YMM avx2 +%if ARCH_X86_64 +cglobal interp_4tap_vert_%2_32x%1, 5, 7, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + mov r4d, %1/2 + +%ifidn %2, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn %2, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %2, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: + PROCESS_CHROMA_VERT_W32_2R +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + packssdw m10, m11 +%elifidn %2, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + paddd m8, m7 + paddd m9, m7 + paddd m10, m7 + paddd m11, m7 + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + packssdw m10, m11 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + paddd m8, m7 + paddd m9, m7 + paddd m10, m7 + paddd m11, m7 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP + psrad m8, INTERP_SHIFT_PP + psrad m9, INTERP_SHIFT_PP + psrad m10, INTERP_SHIFT_PP + psrad m11, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + packssdw m10, m11 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] + CLIPW2 m8, m10, m5, [pw_pixel_max] +%endif + + movu [r2], m0 + movu [r2 + r3], m2 + movu [r2 + mmsize], m8 + movu [r2 + r3 + mmsize], 
m10 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + dec r4d + jnz .loopH + RET +%endif +%endmacro + FILTER_VER_CHROMA_W16_32xN_avx2 8, pp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 16, pp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 24, pp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 32, pp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 48, pp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 64, pp, 15 + + FILTER_VER_CHROMA_W16_32xN_avx2 8, ps, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 16, ps, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 24, ps, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 32, ps, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 48, ps, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 64, ps, 15 + + FILTER_VER_CHROMA_W16_32xN_avx2 8, ss, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 16, ss, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 24, ss, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 32, ss, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 48, ss, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 64, ss, 15 + + FILTER_VER_CHROMA_W16_32xN_avx2 8, sp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 16, sp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 24, sp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 32, sp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 48, sp, 15 + FILTER_VER_CHROMA_W16_32xN_avx2 64, sp, 15 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_64xN_avx2 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%2_64x%1, 5, 7, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + mov r4d, %1/2 + +%ifidn %2, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn %2, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %2, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: 
+%assign x 0 +%rep 4 + movu m1, [r0 + x] + movu m3, [r0 + r1 + x] + movu m5, [r5 + 0 * mmsize] + punpcklwd m0, m1, m3 + pmaddwd m0, m5 + punpckhwd m1, m3 + pmaddwd m1, m5 + + movu m4, [r0 + 2 * r1 + x] + punpcklwd m2, m3, m4 + pmaddwd m2, m5 + punpckhwd m3, m4 + pmaddwd m3, m5 + + lea r6, [r0 + 2 * r1] + movu m5, [r6 + r1 + x] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m0, m6 + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m1, m4 + + movu m4, [r6 + 2 * r1 + x] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m2, m6 + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %2, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 +%ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] +%endif + + movu [r2 + x], m0 + movu [r2 + r3 + x], m2 +%assign x x+mmsize +%endrep + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + dec r4d + jnz .loopH + RET +%endmacro + FILTER_VER_CHROMA_W16_64xN_avx2 16, ss, 7 + FILTER_VER_CHROMA_W16_64xN_avx2 32, ss, 7 + FILTER_VER_CHROMA_W16_64xN_avx2 48, ss, 7 + FILTER_VER_CHROMA_W16_64xN_avx2 64, ss, 7 + FILTER_VER_CHROMA_W16_64xN_avx2 16, sp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 32, sp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 48, sp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 64, sp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 16, ps, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 32, ps, 8 + 
FILTER_VER_CHROMA_W16_64xN_avx2 48, ps, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 64, ps, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 16, pp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 32, pp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 48, pp, 8 + FILTER_VER_CHROMA_W16_64xN_avx2 64, pp, 8 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_12xN_avx2 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%2_12x%1, 5, 8, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + mov r4d, %1/2 + +%ifidn %2, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn %2, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %2, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: + PROCESS_CHROMA_VERT_W16_2R +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %2, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] +%endif + + movu [r2], xm0 + movu [r2 + r3], xm2 + vextracti128 xm0, m0, 1 + vextracti128 xm2, m2, 1 + movq [r2 + 16], xm0 + movq [r2 + r3 + 16], 
xm2 + lea r2, [r2 + 2 * r3] + dec r4d + jnz .loopH + RET +%endmacro + FILTER_VER_CHROMA_W16_12xN_avx2 16, ss, 7 + FILTER_VER_CHROMA_W16_12xN_avx2 16, sp, 8 + FILTER_VER_CHROMA_W16_12xN_avx2 16, ps, 8 + FILTER_VER_CHROMA_W16_12xN_avx2 16, pp, 8 + FILTER_VER_CHROMA_W16_12xN_avx2 32, ss, 7 + FILTER_VER_CHROMA_W16_12xN_avx2 32, sp, 8 + FILTER_VER_CHROMA_W16_12xN_avx2 32, ps, 8 + FILTER_VER_CHROMA_W16_12xN_avx2 32, pp, 8 + +%macro PROCESS_CHROMA_VERT_W24_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * mmsize] + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * mmsize] + + movu xm9, [r0 + mmsize] + movu xm11, [r0 + r1 + mmsize] + punpcklwd xm8, xm9, xm11 + pmaddwd xm8, [r5 + 0 * mmsize] + punpckhwd xm9, xm11 + pmaddwd xm9, [r5 + 0 * mmsize] + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * mmsize] + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * mmsize] + + movu xm12, [r0 + 2 * r1 + mmsize] + punpcklwd xm10, xm11, xm12 + pmaddwd xm10, [r5 + 0 * mmsize] + punpckhwd xm11, xm12 + pmaddwd xm11, [r5 + 0 * mmsize] + + lea r6, [r0 + 2 * r1] + movu m5, [r6 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m0, m6 + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m1, m4 + + movu xm13, [r6 + r1 + mmsize] + punpcklwd xm14, xm12, xm13 + pmaddwd xm14, [r5 + 1 * mmsize] + paddd xm8, xm14 + punpckhwd xm12, xm13 + pmaddwd xm12, [r5 + 1 * mmsize] + paddd xm9, xm12 + + movu m4, [r6 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m2, m6 + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + + movu xm12, [r6 + 2 * r1 + mmsize] + punpcklwd xm14, xm13, xm12 + pmaddwd xm14, [r5 + 1 * mmsize] + paddd xm10, xm14 + punpckhwd xm13, xm12 + pmaddwd xm13, [r5 + 1 * mmsize] + paddd xm11, xm13 +%endmacro + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t 
*dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_24xN_avx2 3 +INIT_YMM avx2 +%if ARCH_X86_64 +cglobal interp_4tap_vert_%2_24x%1, 5, 7, %3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + mov r4d, %1/2 + +%ifidn %2, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn %2, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %2, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: + PROCESS_CHROMA_VERT_W24_2R +%ifidn %2, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + packssdw m10, m11 +%elifidn %2, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + paddd m8, m7 + paddd m9, m7 + paddd m10, m7 + paddd m11, m7 + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + packssdw m10, m11 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + paddd m8, m7 + paddd m9, m7 + paddd m10, m7 + paddd m11, m7 + %ifidn %2, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP + psrad m8, INTERP_SHIFT_PP + psrad m9, INTERP_SHIFT_PP + psrad m10, INTERP_SHIFT_PP + psrad m11, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m8, m9 + 
packssdw m10, m11 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] + CLIPW2 m8, m10, m5, [pw_pixel_max] +%endif + + movu [r2], m0 + movu [r2 + r3], m2 + movu [r2 + mmsize], xm8 + movu [r2 + r3 + mmsize], xm10 + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + dec r4d + jnz .loopH + RET +%endif +%endmacro + FILTER_VER_CHROMA_W16_24xN_avx2 32, ss, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 32, sp, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 32, ps, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 32, pp, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 64, ss, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 64, sp, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 64, ps, 15 + FILTER_VER_CHROMA_W16_24xN_avx2 64, pp, 15 + + +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_W16_48x64_avx2 2 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_48x64, 5, 7, %2 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + mov r4d, 32 + +%ifidn %1, pp + vbroadcasti128 m7, [INTERP_OFFSET_PP] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%elifidn %1, ps + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + +.loopH: +%assign x 0 +%rep 3 + movu m1, [r0 + x] + movu m3, [r0 + r1 + x] + movu m5, [r5 + 0 * mmsize] + punpcklwd m0, m1, m3 + pmaddwd m0, m5 + punpckhwd m1, m3 + pmaddwd m1, m5 + + movu m4, [r0 + 2 * r1 + x] + punpcklwd m2, m3, m4 + pmaddwd m2, m5 + punpckhwd m3, m4 + pmaddwd m3, m5 + + lea r6, [r0 + 2 * r1] + movu m5, [r6 + r1 + x] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m0, m6 + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m1, m4 + + movu m4, [r6 + 2 * r1 + x] + punpcklwd m6, m5, 
m4 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m2, m6 + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + +%ifidn %1, ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 +%elifidn %1, ps + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + + packssdw m0, m1 + packssdw m2, m3 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 +%ifidn %1, pp + psrad m0, INTERP_SHIFT_PP + psrad m1, INTERP_SHIFT_PP + psrad m2, INTERP_SHIFT_PP + psrad m3, INTERP_SHIFT_PP +%else + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP +%endif + packssdw m0, m1 + packssdw m2, m3 + pxor m5, m5 + CLIPW2 m0, m2, m5, [pw_pixel_max] +%endif + + movu [r2 + x], m0 + movu [r2 + r3 + x], m2 +%assign x x+mmsize +%endrep + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + dec r4d + jnz .loopH + RET +%endmacro + + FILTER_VER_CHROMA_W16_48x64_avx2 pp, 8 + FILTER_VER_CHROMA_W16_48x64_avx2 ps, 8 + FILTER_VER_CHROMA_W16_48x64_avx2 ss, 7 + FILTER_VER_CHROMA_W16_48x64_avx2 sp, 8 + +INIT_XMM sse2 +cglobal chroma_p2s, 3, 7, 3 + ; load width and height + mov r3d, r3m + mov r4d, r4m + add r1, r1 + + ; load constant + mova m2, [tab_c_n8192] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5 * 2] + + movu m0, [r6] + psllw m0, (14 - BIT_DEPTH) + paddw m0, m2 + + movu m1, [r6 + r1] + psllw m1, (14 - BIT_DEPTH) + paddw m1, m2 + + add r5d, 8 + cmp r5d, r3d + lea r6, [r2 + r5 * 2] + jg .width4 + movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + je .nextH + jmp .loopW + +.width4: + test r3d, 4 + jz .width2 + test r3d, 2 + movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + lea r6, [r6 + 8] + pshufd m0, m0, 2 + pshufd m1, m1, 2 + jz .nextH + +.width2: + movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movd 
[r6 + FENC_STRIDE / 2 * 2 - 16], m1 + +.nextH: + lea r0, [r0 + r1 * 2] + add r2, FENC_STRIDE / 2 * 4 + + sub r4d, 2 + jnz .loopH + RET + +;----------------------------------------------------------------------------- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal filterPixelToShort_48x64, 3, 7, 4 + add r1d, r1d + mov r3d, r3m + add r3d, r3d + lea r4, [r3 * 3] + lea r5, [r1 * 3] + + ; load height + mov r6d, 16 + + ; load constant + mova m3, [pw_2000] + +.loop: + movu m0, [r0] + movu m1, [r0 + 32] + movu m2, [r0 + 64] + psllw m0, (14 - BIT_DEPTH) + psllw m1, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r3 * 0], m0 + movu [r2 + r3 * 0 + 32], m1 + movu [r2 + r3 * 0 + 64], m2 + + movu m0, [r0 + r1] + movu m1, [r0 + r1 + 32] + movu m2, [r0 + r1 + 64] + psllw m0, (14 - BIT_DEPTH) + psllw m1, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r3 * 1], m0 + movu [r2 + r3 * 1 + 32], m1 + movu [r2 + r3 * 1 + 64], m2 + + movu m0, [r0 + r1 * 2] + movu m1, [r0 + r1 * 2 + 32] + movu m2, [r0 + r1 * 2 + 64] + psllw m0, (14 - BIT_DEPTH) + psllw m1, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + 32], m1 + movu [r2 + r3 * 2 + 64], m2 + + movu m0, [r0 + r5] + movu m1, [r0 + r5 + 32] + movu m2, [r0 + r5 + 64] + psllw m0, (14 - BIT_DEPTH) + psllw m1, (14 - BIT_DEPTH) + psllw m2, (14 - BIT_DEPTH) + psubw m0, m3 + psubw m1, m3 + psubw m2, m3 + movu [r2 + r4], m0 + movu [r2 + r4 + 32], m1 + movu [r2 + r4 + 64], m2 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + + dec r6d + jnz .loop + RET + + %macro FILTER_VER_CHROMA_AVX2_8xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 6 + add 
r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + vbroadcasti128 m14, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m14, [INTERP_OFFSET_PS] +%endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 +.loopH: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + + + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + 
pmaddwd m8, [r5] + + + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%else + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%elifidn %1, sp + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP + psrad m2, INTERP_SHIFT_SP + psrad m3, INTERP_SHIFT_SP + psrad m4, INTERP_SHIFT_SP + psrad m5, INTERP_SHIFT_SP +%else + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS + psrad m2, INTERP_SHIFT_PS + psrad m3, INTERP_SHIFT_PS + psrad m4, INTERP_SHIFT_PS + psrad m5, INTERP_SHIFT_PS +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + pxor m5, m5 + mova m3, [pw_pixel_max] +%ifidn %1,pp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%elifidn %1, sp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%endif + + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm1 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 
+ 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else + paddd m6, m14 + paddd m7, m14 +%ifidn %1,pp + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m6, INTERP_SHIFT_SP + psrad m7, INTERP_SHIFT_SP +%else + psrad m6, INTERP_SHIFT_PS + psrad m7, INTERP_SHIFT_PS +%endif +%endif + + packssdw m6, m7 + vpermq m6, m6, q3120 +%ifidn %1,pp + CLIPW m6, m5, m3 +%elifidn %1, sp + CLIPW m6, m5, m3 +%endif + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhwd xm6, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm6, 1 + pmaddwd m6, m1, [r5 + 1 * mmsize] + paddd m13, m6 + pmaddwd m1, [r5] + + movu xm6, [r0 + r1] ; m6 = row 17 + punpckhwd xm4, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 +%ifidn %1,pp + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%elifidn %1, sp + psrad m8, INTERP_SHIFT_SP + psrad m9, INTERP_SHIFT_SP + psrad m10, INTERP_SHIFT_SP + psrad m11, INTERP_SHIFT_SP + psrad m12, INTERP_SHIFT_SP + psrad 
m13, INTERP_SHIFT_SP + psrad m0, INTERP_SHIFT_SP + psrad m1, INTERP_SHIFT_SP +%else + psrad m8, INTERP_SHIFT_PS + psrad m9, INTERP_SHIFT_PS + psrad m10, INTERP_SHIFT_PS + psrad m11, INTERP_SHIFT_PS + psrad m12, INTERP_SHIFT_PS + psrad m13, INTERP_SHIFT_PS + psrad m0, INTERP_SHIFT_PS + psrad m1, INTERP_SHIFT_PS +%endif +%endif + + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + vpermq m8, m8, q3120 + vpermq m10, m10, q3120 + vpermq m12, m12, q3120 + vpermq m0, m0, q3120 +%ifidn %1,pp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%elifidn %1, sp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%endif + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + dec r8d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_8xN pp, 16 +FILTER_VER_CHROMA_AVX2_8xN ps, 16 +FILTER_VER_CHROMA_AVX2_8xN ss, 16 +FILTER_VER_CHROMA_AVX2_8xN sp, 16 +FILTER_VER_CHROMA_AVX2_8xN pp, 32 +FILTER_VER_CHROMA_AVX2_8xN ps, 32 +FILTER_VER_CHROMA_AVX2_8xN sp, 32 +FILTER_VER_CHROMA_AVX2_8xN ss, 32 +FILTER_VER_CHROMA_AVX2_8xN pp, 64 +FILTER_VER_CHROMA_AVX2_8xN ps, 64 +FILTER_VER_CHROMA_AVX2_8xN sp, 64 +FILTER_VER_CHROMA_AVX2_8xN ss, 64 + +%macro PROCESS_CHROMA_AVX2_8x2 3 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, m2, [r5 + 1 * 
mmsize] + paddd m0, m2 + + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m3, m3, [r5 + 1 * mmsize] + paddd m1, m3 + +%ifnidn %1,ss + paddd m0, m7 + paddd m1, m7 +%endif + psrad m0, %3 + psrad m1, %3 + + packssdw m0, m1 + vpermq m0, m0, q3120 + pxor m4, m4 + +%if %2 + CLIPW m0, m4, [pw_pixel_max] +%endif + vextracti128 xm1, m0, 1 +%endmacro + + +%macro FILTER_VER_CHROMA_AVX2_8x2 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + + PROCESS_CHROMA_AVX2_8x2 %1, %2, %3 + movu [r2], xm0 + movu [r2 + r3], xm1 + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_8x2 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x2 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x2 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x2 ss, 0, 6 + +%macro FILTER_VER_CHROMA_AVX2_4x2 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + +%ifidn %1,pp + vbroadcasti128 m6, [pd_32] +%elifidn %1, sp + vbroadcasti128 m6, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m6, [INTERP_OFFSET_PS] +%endif + + movq xm0, [r0] ; row 0 + movq xm1, [r0 + r1] ; row 1 + punpcklwd xm0, xm1 + + movq xm2, [r0 + r1 * 2] ; row 2 + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + + movq xm3, [r0 + r4] ; row 3 + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] ; row 4 + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, 
m2, [r5 + 1 * mmsize] + paddd m0, m5 + +%ifnidn %1, ss + paddd m0, m6 +%endif + psrad m0, %3 + packssdw m0, m0 + pxor m1, m1 + +%if %2 + CLIPW m0, m1, [pw_pixel_max] +%endif + + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + movq [r2], xm0 + movq [r2 + r3], xm2 + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4x2 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_4x2 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_4x2 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_4x2 ss, 0, 6 + +%macro FILTER_VER_CHROMA_AVX2_4x4 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + +%ifidn %1,pp + vbroadcasti128 m6, [pd_32] +%elifidn %1, sp + vbroadcasti128 m6, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m6, [INTERP_OFFSET_PS] +%endif + movq xm0, [r0] ; row 0 + movq xm1, [r0 + r1] ; row 1 + punpcklwd xm0, xm1 + + movq xm2, [r0 + r1 * 2] ; row 2 + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + + movq xm3, [r0 + r4] ; row 3 + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] ; row 4 + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + + movq xm3, [r0 + r1] ; row 5 + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] ; row 6 + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + +%ifnidn %1,ss + paddd m0, m6 + paddd m2, m6 +%endif + psrad m0, %3 + psrad m2, %3 + + packssdw m0, m2 + pxor m1, m1 +%if %2 + CLIPW m0, m1, [pw_pixel_max] +%endif + + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_4x4 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 
INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6 + + +%macro FILTER_VER_CHROMA_AVX2_4x8 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x8, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + lea r6, [r3 * 3] + + movq xm0, [r0] ; row 0 + movq xm1, [r0 + r1] ; row 1 + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] ; row 2 + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + + movq xm3, [r0 + r4] ; row 3 + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] ; row 4 + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + + movq xm3, [r0 + r1] ; row 5 + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] ; row 6 + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + + movq xm3, [r0 + r4] ; row 7 + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] ; row 8 + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + + movq xm3, [r0 + r1] ; row 9 + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] ; row 10 + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 +%ifnidn %1,ss + paddd m0, m7 + paddd m2, m7 +%endif + psrad m0, %3 + psrad m2, %3 + packssdw m0, m2 + pxor m6, m6 + mova m3, [pw_pixel_max] +%if %2 + CLIPW m0, m6, m3 +%endif + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 +%ifnidn %1,ss + paddd m4, m7 + paddd m1, m7 +%endif + psrad m4, %3 + psrad 
m1, %3 + packssdw m4, m1 +%if %2 + CLIPW m4, m6, m3 +%endif + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4x8 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_4x8 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_4x8 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_4x8 ss, 0 , 6 + +%macro PROCESS_LUMA_AVX2_W4_16R_4TAP 3 + movq xm0, [r0] ; row 0 + movq xm1, [r0 + r1] ; row 1 + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] ; row 2 + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] ; row 3 + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] ; row 4 + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] ; row 5 + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] ; row 6 + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] ; row 7 + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] ; row 8 + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] ; row 9 + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] ; row 10 + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] +%ifnidn %1,ss + paddd m0, m7 + paddd m2, m7 +%endif + psrad m0, %3 + psrad m2, %3 + packssdw m0, m2 + pxor m3, m3 +%if %2 + CLIPW m0, m3, [pw_pixel_max] +%endif + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + movq xm2, [r0 + r4] ;row 11 + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] ; row 12 + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 
11 10] + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] ; row 13 + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] ; row 14 + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] +%ifnidn %1,ss + paddd m4, m7 + paddd m1, m7 +%endif + psrad m4, %3 + psrad m1, %3 + packssdw m4, m1 + pxor m2, m2 +%if %2 + CLIPW m4, m2, [pw_pixel_max] +%endif + + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 + movq xm4, [r0 + r4] ; row 15 + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] ; row 16 + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] ; row 17 + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] ; row 18 + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + +%ifnidn %1,ss + paddd m6, m7 + paddd m5, m7 +%endif + psrad m6, %3 + psrad m5, %3 + packssdw m6, m5 + pxor m1, m1 +%if %2 + CLIPW m6, m1, [pw_pixel_max] +%endif + vextracti128 xm5, m6, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 +%ifnidn %1,ss + paddd m0, m7 + paddd m3, m7 +%endif + psrad m0, %3 + psrad m3, %3 + packssdw m0, m3 +%if %2 + CLIPW m0, m1, [pw_pixel_max] +%endif + vextracti128 xm3, m0, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 +%endmacro + +%macro FILTER_VER_CHROMA_AVX2_4xN 4 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x%2, 4, 8, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + mov r7d, %2 / 16 +%ifidn %1,pp + 
vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + lea r6, [r3 * 3] +.loopH: + PROCESS_LUMA_AVX2_W4_16R_4TAP %1, %3, %4 + lea r2, [r2 + r3 * 4] + dec r7d + jnz .loopH + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6 +FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_4xN ss, 16, 0, 6 +FILTER_VER_CHROMA_AVX2_4xN pp, 32, 1, 6 +FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6 + +%macro FILTER_VER_CHROMA_AVX2_8x8 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 12 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + +%ifidn %1,pp + vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + vbroadcasti128 m11, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m11, [INTERP_OFFSET_PS] +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 ; res row0 done(0,1,2,3) + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 ;res row1 done(1, 2, 3, 4) + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, 
[r5 + 1 * mmsize] + pmaddwd m4, [r5] + paddd m2, m6 ;res row2 done(2,3,4,5) + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 ;res row3 done(3,4,5,6) + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m8 ;res row4 done(4,5,6,7) + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 1 * mmsize] + pmaddwd m7, [r5] + paddd m5, m9 ;res row5 done(5,6,7,8) + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m8, [r5 + 1 * mmsize] + paddd m6, m8 ;res row6 done(6,7,8,9) + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m9, [r5 + 1 * mmsize] + paddd m7, m9 ;res row7 done 7,8,9,10 + lea r4, [r3 * 3] +%ifnidn %1,ss + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 +%endif + psrad m0, %3 + psrad m1, %3 + psrad m2, %3 + psrad m3, %3 + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + pxor m1, m1 + mova m3, [pw_pixel_max] +%if %2 + CLIPW m0, m1, m3 + CLIPW m2, m1, m3 +%endif + vextracti128 xm9, m0, 1 + vextracti128 xm8, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm8 +%ifnidn %1,ss + paddd m4, m11 + paddd m5, m11 + paddd m6, m11 + paddd m7, m11 +%endif + psrad m4, %3 + psrad m5, %3 + psrad m6, %3 + psrad m7, %3 + packssdw m4, m5 + packssdw m6, m7 + vpermq m4, m4, q3120 + vpermq m6, m6, q3120 +%if %2 + CLIPW m4, m1, m3 + CLIPW m6, m1, m3 +%endif + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + 
movu [r2 + r4], xm7 + RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_8x8 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x8 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x8 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x8 ss, 0, 6 + +%macro FILTER_VER_CHROMA_AVX2_8x6 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 12 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + +%ifidn %1,pp + vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + vbroadcasti128 m11, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m11, [INTERP_OFFSET_PS] +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 ; r0 done(0,1,2,3) + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 ;r1 done(1, 2, 3, 4) + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + pmaddwd m4, [r5] + paddd m2, m6 ;r2 done(2,3,4,5) + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 ;r3 done(3,4,5,6) + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 ;r4 
done(4,5,6,7) + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m7, m7, [r5 + 1 * mmsize] + paddd m5, m7 ;r5 done(5,6,7,8) + lea r4, [r3 * 3] +%ifnidn %1,ss + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 +%endif + psrad m0, %3 + psrad m1, %3 + psrad m2, %3 + psrad m3, %3 + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + pxor m10, m10 + mova m9, [pw_pixel_max] +%if %2 + CLIPW m0, m10, m9 + CLIPW m2, m10, m9 +%endif + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%ifnidn %1,ss + paddd m4, m11 + paddd m5, m11 +%endif + psrad m4, %3 + psrad m5, %3 + packssdw m4, m5 + vpermq m4, m4, 11011000b +%if %2 + CLIPW m4, m10, m9 +%endif + vextracti128 xm5, m4, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_8x6 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x6 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x6 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x6 ss, 0, 6 + +%macro PROCESS_CHROMA_AVX2 3 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * 
mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 +%ifnidn %1,ss + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 +%endif + psrad m0, %3 + psrad m1, %3 + psrad m2, %3 + psrad m3, %3 + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + pxor m4, m4 +%if %2 + CLIPW m0, m4, [pw_pixel_max] + CLIPW m2, m4, [pw_pixel_max] +%endif + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endmacro + + +%macro FILTER_VER_CHROMA_AVX2_8x4 3 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + add r3d, r3d +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + vbroadcasti128 m7, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m7, [INTERP_OFFSET_PS] +%endif + PROCESS_CHROMA_AVX2 %1, %2, %3 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_8x4 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x4 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x4 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x4 ss, 0, 6 + +%macro FILTER_VER_CHROMA_AVX2_8x12 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_8x12, 4, 7, 15 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + vbroadcasti128 m14, [INTERP_OFFSET_SP] +%else + vbroadcasti128 m14, [INTERP_OFFSET_PS] +%endif + lea r6, [r3 * 3] + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd 
m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd 
m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] +%ifnidn %1,ss + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 +%endif + psrad m0, %3 + psrad m1, %3 + psrad m2, %3 + psrad m3, %3 + psrad m4, %3 + psrad m5, %3 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, q3120 + vpermq m2, m2, q3120 + vpermq m4, m4, q3120 + pxor m5, m5 + mova m3, [pw_pixel_max] +%if %2 + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%endif + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm1 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m12, m12, [r5 + 1 * mmsize] + paddd m10, m12 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m13, m13, [r5 + 1 * mmsize] + paddd m11, m13 +%ifnidn %1,ss + paddd m6, m14 + paddd m7, m14 + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 +%endif + psrad m6, %3 + psrad m7, %3 + psrad m8, %3 + psrad m9, %3 + psrad m10, %3 + psrad m11, %3 + packssdw m6, m7 + packssdw m8, m9 + packssdw m10, m11 + vpermq m6, m6, q3120 + vpermq m8, m8, q3120 + vpermq m10, m10, q3120 +%if %2 + CLIPW m6, m5, m3 + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 +%endif + vextracti128 xm7, m6, 1 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + lea r2, [r2 + r3 * 4] + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6 +FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS +FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP +FILTER_VER_CHROMA_AVX2_8x12 ss, 0, 6 \ No newline at end of file From 
09265145041bee53ade04a47f23b1a7c232007fb Mon Sep 17 00:00:00 2001 From: Ashok Kumar Mishra Date: Wed, 21 Feb 2018 14:25:56 +0530 Subject: [PATCH 51/51] release: Release notes for v2.7 --- doc/reST/releasenotes.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/reST/releasenotes.rst b/doc/reST/releasenotes.rst index 89eca798ef..19de52c38b 100644 --- a/doc/reST/releasenotes.rst +++ b/doc/reST/releasenotes.rst @@ -2,6 +2,32 @@ Release Notes ************* +Version 2.7 +=========== + +Release date - 21st Feb, 2018. + +New features +------------ +1. :option:`--gop-lookahead` can be used to extend the GOP boundary (set by `--keyint`). The GOP will be extended if a scene-cut frame is found within this many frames. +2. Support for RADL pictures added in x265. + :option:`--radl` can be used to decide the number of RADL pictures preceding the IDR picture. + +Encoder enhancements +-------------------- +1. Moved from YASM to NASM assembler. Supports NASM assembler version 2.13 and greater. +2. Enable analysis save and load in a single run. Introduces two new cli options `--analysis-save <filename>` and `--analysis-load <filename>`. +3. Comply to HDR10+ LLC specification. +4. Reduced x265 build time by more than 50% by re-factoring ipfilter.asm. + +Bug fixes +--------- +1. Fixed inconsistent output issue in deblock filter and --const-vbv. +2. Fixed Mac OS build warnings. +3. Fixed inconsistency in pass-2 when weightp and cutree are enabled. +4. Fixed deadlock issue due to dropping of BREF frames while forcing slice types through qp file. + + Version 2.6 ===========