-
-
Notifications
You must be signed in to change notification settings - Fork 135
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
avfilter/tonemapx: fix p010 overflow
The left shift must be applied to the 32-bit values before the saturating narrow to 16 bits; shifting after the narrow can overflow the 16-bit lanes.
- Loading branch information
Showing
1 changed file
with
70 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c | |
=================================================================== | ||
--- /dev/null | ||
+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c | ||
@@ -0,0 +1,2153 @@ | ||
@@ -0,0 +1,2157 @@ | ||
+/* | ||
+ * Copyright (c) 2024 Gnattu OC <[email protected]> | ||
+ * | ||
|
@@ -2213,24 +2213,28 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c | |
+ } | ||
+ | ||
+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); | ||
+ y0oax4 = vshlq_s32(y0oax4, out_sh2x8); | ||
+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); | ||
+ y0obx4 = vshlq_s32(y0obx4, out_sh2x8); | ||
+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); | ||
+ y1oax4 = vshlq_s32(y1oax4, out_sh2x8); | ||
+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); | ||
+ y1obx4 = vshlq_s32(y1obx4, out_sh2x8); | ||
+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); | ||
+ uox4 = vshlq_s32(uox4, out_sh2x8); | ||
+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); | ||
+ vox4 = vshlq_s32(vox4, out_sh2x8); | ||
+ | ||
+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); | ||
+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); | ||
+ vst1q_u16(&dsty[x], y0ox8); | ||
+ | ||
+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); | ||
+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); | ||
+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); | ||
+ | ||
+ uvoax4 = vzip1q_s32(uox4, vox4); | ||
+ uvobx4 = vzip2q_s32(uox4, vox4); | ||
+ | ||
+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); | ||
+ vst1q_u16(&dstuv[x], vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4))); | ||
+ } | ||
+ } | ||
+ | ||
|
@@ -2417,7 +2421,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
=================================================================== | ||
--- /dev/null | ||
+++ FFmpeg/libavfilter/vf_tonemapx.c | ||
@@ -0,0 +1,1791 @@ | ||
@@ -0,0 +1,1790 @@ | ||
+/* | ||
+ * This file is part of FFmpeg. | ||
+ * | ||
|
@@ -2476,6 +2480,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+#include "internal.h" | ||
+#include "video.h" | ||
+ | ||
+#define MIX(x, y, a) ((x) + ((y) - (x)) * (a)) | ||
+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) | ||
+ | ||
+enum TonemapAlgorithm { | ||
+ TONEMAP_NONE, | ||
+ TONEMAP_LINEAR, | ||
|
@@ -2588,11 +2595,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+}; | ||
+ | ||
+const double dovi_lms2rgb_matrix[3][3] = | ||
+ { | ||
+ { 3.06441879, -2.16597676, 0.10155818}, | ||
+ {-0.65612108, 1.78554118, -0.12943749}, | ||
+ { 0.01736321, -0.04725154, 1.03004253}, | ||
+ }; | ||
+{ | ||
+ { 3.06441879, -2.16597676, 0.10155818}, | ||
+ {-0.65612108, 1.78554118, -0.12943749}, | ||
+ { 0.01736321, -0.04725154, 1.03004253}, | ||
+}; | ||
+ | ||
+static void update_dovi_buf(AVFilterContext *ctx) | ||
+{ | ||
|
@@ -2736,7 +2743,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+ dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2]; | ||
+} | ||
+ | ||
+#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) | ||
+inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx) | ||
+{ | ||
+ int i; | ||
|
@@ -2777,41 +2783,40 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+ coeffs[2] = dovi_coeffs[0*4+2]; | ||
+ coeffs[3] = dovi_coeffs[0*4+3]; | ||
+ | ||
+#define mix(x, y, a) ((x) + ((y) - (x)) * (a)) | ||
+ if (i == 0 && dovi_num_pivots > 2) { | ||
+ int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1]; | ||
+ int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3]; | ||
+ int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6]; | ||
+ | ||
+ float m01[4] = { mix(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), | ||
+ mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), | ||
+ mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), | ||
+ mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; | ||
+ float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), | ||
+ mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), | ||
+ mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), | ||
+ mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; | ||
+ float m0123[4] = { mix(m01[0], m23[0], t1), | ||
+ mix(m01[1], m23[1], t1), | ||
+ mix(m01[2], m23[2], t1), | ||
+ mix(m01[3], m23[3], t1) }; | ||
+ float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), | ||
+ mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), | ||
+ mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), | ||
+ mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; | ||
+ float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), | ||
+ mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), | ||
+ mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), | ||
+ mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; | ||
+ float m4567[4] = { mix(m45[0], m67[0], t5), | ||
+ mix(m45[1], m67[1], t5), | ||
+ mix(m45[2], m67[2], t5), | ||
+ mix(m45[3], m67[3], t5) }; | ||
+ | ||
+ coeffs[0] = mix(m0123[0], m4567[0], t3); | ||
+ coeffs[1] = mix(m0123[1], m4567[1], t3); | ||
+ coeffs[2] = mix(m0123[2], m4567[2], t3); | ||
+ coeffs[3] = mix(m0123[3], m4567[3], t3); | ||
+ float m01[4] = { MIX(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0), | ||
+ MIX(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), | ||
+ MIX(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), | ||
+ MIX(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; | ||
+ float m23[4] = { MIX(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), | ||
+ MIX(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), | ||
+ MIX(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), | ||
+ MIX(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; | ||
+ float m0123[4] = { MIX(m01[0], m23[0], t1), | ||
+ MIX(m01[1], m23[1], t1), | ||
+ MIX(m01[2], m23[2], t1), | ||
+ MIX(m01[3], m23[3], t1) }; | ||
+ float m45[4] = { MIX(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), | ||
+ MIX(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), | ||
+ MIX(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), | ||
+ MIX(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; | ||
+ float m67[4] = { MIX(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), | ||
+ MIX(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), | ||
+ MIX(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), | ||
+ MIX(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; | ||
+ float m4567[4] = { MIX(m45[0], m67[0], t5), | ||
+ MIX(m45[1], m67[1], t5), | ||
+ MIX(m45[2], m67[2], t5), | ||
+ MIX(m45[3], m67[3], t5) }; | ||
+ | ||
+ coeffs[0] = MIX(m0123[0], m4567[0], t3); | ||
+ coeffs[1] = MIX(m0123[1], m4567[1], t3); | ||
+ coeffs[2] = MIX(m0123[2], m4567[2], t3); | ||
+ coeffs[3] = MIX(m0123[3], m4567[3], t3); | ||
+ } | ||
+ | ||
+ has_mmr_poly = dovi_has_mmr && dovi_has_poly; | ||
|
@@ -2867,16 +2872,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+ // SDR peak | ||
+ float dst_peak = 1.0f; | ||
+ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; | ||
+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; | ||
+ float max_lum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; | ||
+ | ||
+ float ks = 1.5f * maxLum - 0.5f; | ||
+ float ks = 1.5f * max_lum - 0.5f; | ||
+ float tb = (s_pq - ks) / (1.0f - ks); | ||
+ float tb2 = tb * tb; | ||
+ float tb3 = tb2 * tb; | ||
+ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + | ||
+ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + | ||
+ (-2.0f * tb3 + 3.0f * tb2) * maxLum; | ||
+ float sig = (s_pq < ks) ? s_pq : pb; | ||
+ (-2.0f * tb3 + 3.0f * tb2) * max_lum; | ||
+ float sig = MIX(pb, s_pq, s_pq < ks); | ||
+ | ||
+ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); | ||
+} | ||
|
@@ -3092,12 +3097,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+} | ||
+ | ||
+inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, | ||
+ int16_t *r_out, int16_t *g_out, int16_t *b_out, | ||
+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, | ||
+ const AVLumaCoefficients *coeffs, | ||
+ const AVLumaCoefficients *ocoeffs, double desat, | ||
+ double (*rgb2rgb)[3][3], | ||
+ int rgb2rgb_passthrough) | ||
+ int16_t *r_out, int16_t *g_out, int16_t *b_out, | ||
+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, | ||
+ const AVLumaCoefficients *coeffs, | ||
+ const AVLumaCoefficients *ocoeffs, double desat, | ||
+ double (*rgb2rgb)[3][3], | ||
+ int rgb2rgb_passthrough) | ||
+{ | ||
+ int16_t sig; | ||
+ float mapval, r_lin, g_lin, b_lin; | ||
|
@@ -3128,7 +3133,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; | ||
+ } | ||
+ | ||
+#define MIX(x,y,a) ((x) * (1 - (a)) + (y) * (a)) | ||
+ /* desaturate to prevent unnatural colors */ | ||
+ if (desat > 0) { | ||
+ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; | ||
|
@@ -3141,7 +3145,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c | |
+ r_lin *= mapval; | ||
+ g_lin *= mapval; | ||
+ b_lin *= mapval; | ||
+#undef MIX | ||
+ | ||
+ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; | ||
+ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; | ||
|
@@ -4358,7 +4361,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c | |
=================================================================== | ||
--- /dev/null | ||
+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c | ||
@@ -0,0 +1,2290 @@ | ||
@@ -0,0 +1,2293 @@ | ||
+/* | ||
+ * Copyright (c) 2024 Gnattu OC <[email protected]> | ||
+ * | ||
|
@@ -6552,17 +6555,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c | |
+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); | ||
+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); | ||
+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); | ||
+ yoax8 = _mm256_slli_epi32(yoax8, out_sh2); | ||
+ | ||
+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); | ||
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); | ||
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); | ||
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); | ||
+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); | ||
+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); | ||
+ yobx8 = _mm256_slli_epi32(yobx8, out_sh2); | ||
+ | ||
+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); | ||
+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); | ||
+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); | ||
+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); | ||
+ | ||
+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); | ||
|
@@ -6583,17 +6587,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c | |
+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); | ||
+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); | ||
+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); | ||
+ y1oax8 = _mm256_slli_epi32(y1oax8, out_sh2); | ||
+ | ||
+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); | ||
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); | ||
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); | ||
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); | ||
+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); | ||
+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); | ||
+ y1obx8 = _mm256_slli_epi32(y1obx8, out_sh2); | ||
+ | ||
+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); | ||
+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); | ||
+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); | ||
+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); | ||
+ | ||
+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); | ||
|
@@ -6628,8 +6633,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c | |
+ | ||
+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); | ||
+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); | ||
+ uvoax8 = _mm256_slli_epi32(uvoax8, out_sh2); | ||
+ uvobx8 = _mm256_slli_epi32(uvobx8, out_sh2); | ||
+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); | ||
+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); | ||
+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); | ||
+ } | ||
+ } | ||
|
@@ -6726,7 +6732,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c | |
=================================================================== | ||
--- /dev/null | ||
+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c | ||
@@ -0,0 +1,2371 @@ | ||
@@ -0,0 +1,2374 @@ | ||
+/* | ||
+ * Copyright (c) 2024 Gnattu OC <[email protected]> | ||
+ * | ||
|
@@ -9006,16 +9012,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c | |
+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); | ||
+ yoax4 = _mm_srai_epi32(yoax4, out_sh); | ||
+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); | ||
+ yoax4 = _mm_slli_epi32(yoax4, out_sh2); | ||
+ | ||
+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); | ||
+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); | ||
+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); | ||
+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); | ||
+ yobx4 = _mm_srai_epi32(yobx4, out_sh); | ||
+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); | ||
+ yobx4 = _mm_slli_epi32(yobx4, out_sh2); | ||
+ | ||
+ y0ox8 = _mm_packus_epi32(yoax4, yobx4); | ||
+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); | ||
+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); | ||
+ | ||
+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); | ||
|
@@ -9036,16 +9043,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c | |
+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); | ||
+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); | ||
+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); | ||
+ y1oax4 = _mm_slli_epi32(y1oax4, out_sh2); | ||
+ | ||
+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); | ||
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); | ||
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); | ||
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); | ||
+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); | ||
+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); | ||
+ y1obx4 = _mm_slli_epi32(y1obx4, out_sh2); | ||
+ | ||
+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); | ||
+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); | ||
+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); | ||
+ | ||
+ ravgx4 = _mm_hadd_epi32(roax4, robx4); | ||
|
@@ -9077,8 +9085,9 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c | |
+ | ||
+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); | ||
+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); | ||
+ uvoax4 = _mm_slli_epi32(uvoax4, out_sh2); | ||
+ uvobx4 = _mm_slli_epi32(uvobx4, out_sh2); | ||
+ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); | ||
+ uvox8 = _mm_slli_epi16(uvox8, out_sh2); | ||
+ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); | ||
+ } | ||
+ } | ||
|