Skip to content

Commit

Permalink
avfilter/tonemapx: fix p010 overflow
Browse files Browse the repository at this point in the history
The bit shift must be performed on the 32-bit values before the saturating narrowing move/pack, to prevent overflow.
  • Loading branch information
gnattu committed Oct 5, 2024
1 parent 040ec3d commit 84aa53a
Showing 1 changed file with 70 additions and 61 deletions.
131 changes: 70 additions & 61 deletions debian/patches/0060-add-simd-optimized-tonemapx-filter.patch
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
===================================================================
--- /dev/null
+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
@@ -0,0 +1,2153 @@
@@ -0,0 +1,2157 @@
+/*
+ * Copyright (c) 2024 Gnattu OC <[email protected]>
+ *
Expand Down Expand Up @@ -2213,24 +2213,28 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
+ }
+
+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
+ y0oax4 = vshlq_s32(y0oax4, out_sh2x8);
+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
+ y0obx4 = vshlq_s32(y0obx4, out_sh2x8);
+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
+ y1oax4 = vshlq_s32(y1oax4, out_sh2x8);
+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
+ y1obx4 = vshlq_s32(y1obx4, out_sh2x8);
+ uox4 = vaddq_s32(uox4, out_uv_offsetx4);
+ uox4 = vshlq_s32(uox4, out_sh2x8);
+ vox4 = vaddq_s32(vox4, out_uv_offsetx4);
+ vox4 = vshlq_s32(vox4, out_sh2x8);
+
+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8);
+ vst1q_u16(&dsty[x], y0ox8);
+
+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8);
+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
+
+ uvoax4 = vzip1q_s32(uox4, vox4);
+ uvobx4 = vzip2q_s32(uox4, vox4);
+
+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8));
+ vst1q_u16(&dstuv[x], vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)));
+ }
+ }
+
Expand Down Expand Up @@ -2417,7 +2421,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
===================================================================
--- /dev/null
+++ FFmpeg/libavfilter/vf_tonemapx.c
@@ -0,0 +1,1791 @@
@@ -0,0 +1,1790 @@
+/*
+ * This file is part of FFmpeg.
+ *
Expand Down Expand Up @@ -2476,6 +2480,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+#include "internal.h"
+#include "video.h"
+
+#define MIX(x, y, a) ((x) + ((y) - (x)) * (a))
+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c)))
+
+enum TonemapAlgorithm {
+ TONEMAP_NONE,
+ TONEMAP_LINEAR,
Expand Down Expand Up @@ -2588,11 +2595,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+};
+
+const double dovi_lms2rgb_matrix[3][3] =
+ {
+ { 3.06441879, -2.16597676, 0.10155818},
+ {-0.65612108, 1.78554118, -0.12943749},
+ { 0.01736321, -0.04725154, 1.03004253},
+ };
+{
+ { 3.06441879, -2.16597676, 0.10155818},
+ {-0.65612108, 1.78554118, -0.12943749},
+ { 0.01736321, -0.04725154, 1.03004253},
+};
+
+static void update_dovi_buf(AVFilterContext *ctx)
+{
Expand Down Expand Up @@ -2736,7 +2743,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+ dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2];
+}
+
+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c)))
+inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx)
+{
+ int i;
Expand Down Expand Up @@ -2777,41 +2783,40 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+ coeffs[2] = dovi_coeffs[0*4+2];
+ coeffs[3] = dovi_coeffs[0*4+3];
+
+#define mix(x, y, a) ((x) + ((y) - (x)) * (a))
+ if (i == 0 && dovi_num_pivots > 2) {
+ int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1];
+ int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3];
+ int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6];
+
+ float m01[4] = { mix(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0),
+ mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0),
+ mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0),
+ mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) };
+ float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2),
+ mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2),
+ mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2),
+ mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) };
+ float m0123[4] = { mix(m01[0], m23[0], t1),
+ mix(m01[1], m23[1], t1),
+ mix(m01[2], m23[2], t1),
+ mix(m01[3], m23[3], t1) };
+ float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4),
+ mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4),
+ mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4),
+ mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) };
+ float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6),
+ mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6),
+ mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6),
+ mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) };
+ float m4567[4] = { mix(m45[0], m67[0], t5),
+ mix(m45[1], m67[1], t5),
+ mix(m45[2], m67[2], t5),
+ mix(m45[3], m67[3], t5) };
+
+ coeffs[0] = mix(m0123[0], m4567[0], t3);
+ coeffs[1] = mix(m0123[1], m4567[1], t3);
+ coeffs[2] = mix(m0123[2], m4567[2], t3);
+ coeffs[3] = mix(m0123[3], m4567[3], t3);
+ float m01[4] = { MIX(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0),
+ MIX(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0),
+ MIX(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0),
+ MIX(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) };
+ float m23[4] = { MIX(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2),
+ MIX(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2),
+ MIX(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2),
+ MIX(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) };
+ float m0123[4] = { MIX(m01[0], m23[0], t1),
+ MIX(m01[1], m23[1], t1),
+ MIX(m01[2], m23[2], t1),
+ MIX(m01[3], m23[3], t1) };
+ float m45[4] = { MIX(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4),
+ MIX(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4),
+ MIX(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4),
+ MIX(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) };
+ float m67[4] = { MIX(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6),
+ MIX(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6),
+ MIX(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6),
+ MIX(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) };
+ float m4567[4] = { MIX(m45[0], m67[0], t5),
+ MIX(m45[1], m67[1], t5),
+ MIX(m45[2], m67[2], t5),
+ MIX(m45[3], m67[3], t5) };
+
+ coeffs[0] = MIX(m0123[0], m4567[0], t3);
+ coeffs[1] = MIX(m0123[1], m4567[1], t3);
+ coeffs[2] = MIX(m0123[2], m4567[2], t3);
+ coeffs[3] = MIX(m0123[3], m4567[3], t3);
+ }
+
+ has_mmr_poly = dovi_has_mmr && dovi_has_poly;
Expand Down Expand Up @@ -2867,16 +2872,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+ // SDR peak
+ float dst_peak = 1.0f;
+ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale;
+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale;
+ float max_lum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale;
+
+ float ks = 1.5f * maxLum - 0.5f;
+ float ks = 1.5f * max_lum - 0.5f;
+ float tb = (s_pq - ks) / (1.0f - ks);
+ float tb2 = tb * tb;
+ float tb3 = tb2 * tb;
+ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks +
+ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) +
+ (-2.0f * tb3 + 3.0f * tb2) * maxLum;
+ float sig = (s_pq < ks) ? s_pq : pb;
+ (-2.0f * tb3 + 3.0f * tb2) * max_lum;
+ float sig = MIX(pb, s_pq, s_pq < ks);
+
+ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT);
+}
Expand Down Expand Up @@ -3092,12 +3097,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+}
+
+inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in,
+ int16_t *r_out, int16_t *g_out, int16_t *b_out,
+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
+ const AVLumaCoefficients *coeffs,
+ const AVLumaCoefficients *ocoeffs, double desat,
+ double (*rgb2rgb)[3][3],
+ int rgb2rgb_passthrough)
+ int16_t *r_out, int16_t *g_out, int16_t *b_out,
+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
+ const AVLumaCoefficients *coeffs,
+ const AVLumaCoefficients *ocoeffs, double desat,
+ double (*rgb2rgb)[3][3],
+ int rgb2rgb_passthrough)
+{
+ int16_t sig;
+ float mapval, r_lin, g_lin, b_lin;
Expand Down Expand Up @@ -3128,7 +3133,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin;
+ }
+
+#define MIX(x,y,a) ((x) * (1 - (a)) + (y) * (a))
+ /* desaturate to prevent unnatural colors */
+ if (desat > 0) {
+ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin;
Expand All @@ -3141,7 +3145,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
+ r_lin *= mapval;
+ g_lin *= mapval;
+ b_lin *= mapval;
+#undef MIX
+
+ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)];
+ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)];
Expand Down Expand Up @@ -4358,7 +4361,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
===================================================================
--- /dev/null
+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
@@ -0,0 +1,2290 @@
@@ -0,0 +1,2293 @@
+/*
+ * Copyright (c) 2024 Gnattu OC <[email protected]>
+ *
Expand Down Expand Up @@ -6552,17 +6555,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
+ yoax8 = _mm256_srai_epi32(yoax8, out_sh);
+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
+ yoax8 = _mm256_slli_epi32(yoax8, out_sh2);
+
+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
+ yobx8 = _mm256_srai_epi32(yobx8, out_sh);
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
+ yobx8 = _mm256_slli_epi32(yobx8, out_sh2);
+
+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8);
+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2);
+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16);
+
+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
Expand All @@ -6583,17 +6587,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
+ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2);
+
+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
+ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2);
+
+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8);
+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2);
+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16);
+
+ ravgx8 = _mm256_hadd_epi32(roax8, robx8);
Expand Down Expand Up @@ -6628,8 +6633,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
+
+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
+ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2);
+ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2);
+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8);
+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2);
+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16);
+ }
+ }
Expand Down Expand Up @@ -6726,7 +6732,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
===================================================================
--- /dev/null
+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
@@ -0,0 +1,2371 @@
@@ -0,0 +1,2374 @@
+/*
+ * Copyright (c) 2024 Gnattu OC <[email protected]>
+ *
Expand Down Expand Up @@ -9006,16 +9012,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd));
+ yoax4 = _mm_srai_epi32(yoax4, out_sh);
+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off));
+ yoax4 = _mm_slli_epi32(yoax4, out_sh2);
+
+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry));
+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy)));
+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby)));
+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd));
+ yobx4 = _mm_srai_epi32(yobx4, out_sh);
+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off));
+ yobx4 = _mm_slli_epi32(yobx4, out_sh2);
+
+ y0ox8 = _mm_packus_epi32(yoax4, yobx4);
+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2);
+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8);
+
+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1);
Expand All @@ -9036,16 +9043,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd));
+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh);
+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off));
+ y1oax4 = _mm_slli_epi32(y1oax4, out_sh2);
+
+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry));
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy)));
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby)));
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd));
+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh);
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off));
+ y1obx4 = _mm_slli_epi32(y1obx4, out_sh2);
+
+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4);
+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2);
+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8);
+
+ ravgx4 = _mm_hadd_epi32(roax4, robx4);
Expand Down Expand Up @@ -9077,8 +9085,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
+
+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4);
+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4);
+ uvoax4 = _mm_slli_epi32(uvoax4, out_sh2);
+ uvobx4 = _mm_slli_epi32(uvobx4, out_sh2);
+ uvox8 = _mm_packus_epi32(uvoax4, uvobx4);
+ uvox8 = _mm_slli_epi16(uvox8, out_sh2);
+ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8);
+ }
+ }
Expand Down

0 comments on commit 84aa53a

Please sign in to comment.