diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 47dc1f9ddfc..0e0ec50e7e5 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,2153 @@ +@@ -0,0 +1,2157 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -2051,7 +2051,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); ++ int32x4_t out_sh2x4 = vdupq_n_s32(out_sh2); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, @@ -2213,24 +2213,28 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + } + + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0oax4 = vshlq_s32(y0oax4, out_sh2x4); + y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y0obx4 = vshlq_s32(y0obx4, out_sh2x4); + y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1oax4 = vshlq_s32(y1oax4, out_sh2x4); + y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ y1obx4 = vshlq_s32(y1obx4, out_sh2x4); + uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ uox4 = vshlq_s32(uox4, out_sh2x4); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vox4 = vshlq_s32(vox4, out_sh2x4); + + y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); + vst1q_u16(&dsty[x], y0ox8); + + y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); + vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + + uvoax4 = vzip1q_s32(uox4, vox4); + uvobx4 = vzip2q_s32(uox4, vox4); + -+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); ++ vst1q_u16(&dstuv[x], vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4))); + } + } + @@ -2417,7 +2421,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1791 @@ +@@ -0,0 +1,1790 @@ +/* + * This file is part of FFmpeg. + * @@ -2476,6 +2480,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#include "internal.h" +#include "video.h" + ++#define MIX(x, y, a) ((x) + ((y) - (x)) * (a)) ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++ +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, @@ -2588,11 +2595,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +}; + +const double dovi_lms2rgb_matrix[3][3] = -+ { -+ { 3.06441879, -2.16597676, 0.10155818}, -+ {-0.65612108, 1.78554118, -0.12943749}, -+ { 0.01736321, -0.04725154, 1.03004253}, -+ }; ++{ ++ { 3.06441879, -2.16597676, 0.10155818}, ++ {-0.65612108, 1.78554118, -0.12943749}, ++ { 0.01736321, -0.04725154, 1.03004253}, ++}; + +static void update_dovi_buf(AVFilterContext *ctx) +{ @@ -2736,7 +2743,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2]; +} + -+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) +inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx) +{ + int i; @@ -2777,41 +2783,40 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + coeffs[2] = dovi_coeffs[0*4+2]; + coeffs[3] = dovi_coeffs[0*4+3]; + -+#define mix(x, y, a) ((x) + ((y) - (x)) * (a)) + if (i == 0 && dovi_num_pivots > 2) { + int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1]; + int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3]; + int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6]; + -+ float m01[4] = { mix(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), -+ mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), -+ mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), -+ mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; -+ float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), -+ mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), -+ mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), -+ mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; -+ float m0123[4] = { mix(m01[0], m23[0], t1), -+ mix(m01[1], m23[1], t1), -+ mix(m01[2], m23[2], t1), -+ mix(m01[3], m23[3], t1) }; -+ float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), -+ mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), -+ mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), -+ mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; -+ float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), -+ mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), -+ mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), -+ mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; -+ float m4567[4] = { mix(m45[0], m67[0], t5), -+ mix(m45[1], m67[1], t5), -+ mix(m45[2], m67[2], t5), -+ mix(m45[3], m67[3], t5) }; -+ -+ coeffs[0] = mix(m0123[0], m4567[0], t3); -+ coeffs[1] = mix(m0123[1], m4567[1], t3); -+ coeffs[2] = mix(m0123[2], m4567[2], t3); -+ coeffs[3] = mix(m0123[3], m4567[3], t3); ++ float m01[4] = { MIX(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), ++ MIX(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), ++ MIX(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), ++ MIX(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; ++ float m23[4] = { MIX(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), ++ MIX(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), ++ MIX(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), ++ MIX(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; ++ float m0123[4] = { MIX(m01[0], m23[0], t1), ++ MIX(m01[1], m23[1], t1), ++ MIX(m01[2], m23[2], t1), ++ MIX(m01[3], m23[3], t1) }; ++ float m45[4] = { MIX(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), ++ MIX(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), ++ MIX(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), ++ MIX(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; ++ float m67[4] = { MIX(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), ++ MIX(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), ++ MIX(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), ++ MIX(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; ++ float m4567[4] = { MIX(m45[0], m67[0], t5), ++ MIX(m45[1], m67[1], t5), ++ MIX(m45[2], m67[2], t5), ++ MIX(m45[3], m67[3], t5) }; ++ ++ coeffs[0] = MIX(m0123[0], m4567[0], t3); ++ coeffs[1] = MIX(m0123[1], m4567[1], t3); ++ coeffs[2] = MIX(m0123[2], m4567[2], t3); ++ coeffs[3] = MIX(m0123[3], m4567[3], t3); + } + + has_mmr_poly = dovi_has_mmr && dovi_has_poly; @@ -2867,16 +2872,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + // SDR peak + float dst_peak = 1.0f; + float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; -+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; ++ float max_lum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; + -+ float ks = 1.5f * maxLum - 0.5f; ++ float ks = 1.5f * max_lum - 0.5f; + float tb = (s_pq - ks) / (1.0f - ks); + float tb2 = tb * tb; + float tb3 = tb2 * tb; + float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + + (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + -+ (-2.0f * tb3 + 3.0f * tb2) * maxLum; -+ float sig = (s_pq < ks) ? s_pq : pb; ++ (-2.0f * tb3 + 3.0f * tb2) * max_lum; ++ float sig = MIX(pb, s_pq, s_pq < ks); + + return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); +} @@ -3092,12 +3097,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) +{ + int16_t sig; + float mapval, r_lin, g_lin, b_lin; @@ -3128,7 +3133,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; + } + -+#define MIX(x,y,a) ((x) * (1 - (a)) + (y) * (a)) + /* desaturate to prevent unnatural colors */ + if (desat > 0) { + float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; @@ -3141,7 +3145,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + r_lin *= mapval; + g_lin *= mapval; + b_lin *= mapval; -+#undef MIX + + *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; + *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; @@ -4358,7 +4361,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,2290 @@ +@@ -0,0 +1,2293 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -6552,6 +6555,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); + yoax8 = _mm256_srai_epi32(yoax8, out_sh); + yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); + + yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); + yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); @@ -6559,10 +6563,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); + yobx8 = _mm256_srai_epi32(yobx8, out_sh); + yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); + + y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); @@ -6583,6 +6587,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); + y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); + y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); + + y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); @@ -6590,10 +6595,10 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); + y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); + y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); + + y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 = _mm256_hadd_epi32(roax8, robx8); @@ -6628,8 +6633,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); + uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); ++ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); + uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } @@ -6726,7 +6732,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,2371 @@ +@@ -0,0 +1,2374 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -9006,6 +9012,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); + yoax4 = _mm_srai_epi32(yoax4, out_sh); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_slli_epi32(yoax4, out_sh2); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); @@ -9013,9 +9020,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); + yobx4 = _mm_srai_epi32(yobx4, out_sh); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_slli_epi32(yobx4, out_sh2); + + y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); @@ -9036,6 +9043,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); + y1oax4 = _mm_srai_epi32(y1oax4, out_sh); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_slli_epi32(y1oax4, out_sh2); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); @@ -9043,9 +9051,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); + y1obx4 = _mm_srai_epi32(y1obx4, out_sh); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_slli_epi32(y1obx4, out_sh2); + + y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); @@ -9077,8 +9085,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + + uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); + uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ uvoax4 = _mm_slli_epi32(uvoax4, out_sh2); ++ uvobx4 = _mm_slli_epi32(uvobx4, out_sh2); + uvox8 = _mm_packus_epi32(uvoax4, uvobx4); -+ uvox8 = _mm_slli_epi16(uvox8, out_sh2); + _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); + } + }