diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch index 5df4dfe9e09..e5ae04b5872 100644 --- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch +++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch @@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,1229 @@ +@@ -0,0 +1,2149 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -123,6 +123,212 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS + +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++inline static float32x4_t mix_float32x4(float32x4_t x, float32x4_t y, float32x4_t a) // per-lane blend: returns x moved toward y by weight a ++{ ++ float32x4_t n = vsubq_f32(y, x); // n = y minus x ++ n = vfmaq_f32(x, n, a); // n = fma(n, a, x) ++ return n; ++} ++ ++static inline float reshape_poly(float s, float32x4_t coeffs) // sums coeffs[0], coeffs[1]*s and coeffs[2]*s*s ++{ ++ float32x4_t ps = vdupq_n_f32(0.0f); // lane 3 stays 0, so coeffs lane 3 is multiplied by 0 ++ ps = vsetq_lane_f32(1.0f, ps, 0); ++ ps = vsetq_lane_f32(s, ps, 1); ++ ps = vsetq_lane_f32(s * s, ps, 2); ++ ps = vmulq_f32(ps, coeffs); ++ return vaddvq_f32(ps); // horizontal sum of the four products ++} ++ ++inline static float reshape_mmr(float32x4_t sig, float32x4_t coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) // coeffs lane 0 seeds the sum, lane 1 is the mmr offset unless mmr_single is set, lane 3 is the order ++{ ++ int mmr_idx = mmr_single ? 
0 : (int)vgetq_lane_f32(coeffs, 1); ++ int order = (int)vgetq_lane_f32(coeffs, 3); ++ float s = vgetq_lane_f32(coeffs, 0); ++ float32x4_t mmr_coeffs, ps; ++ float32x4_t sigX01 = vmulq_laneq_f32(sig, sig, 1); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} ++ float32x4_t sigX02 = vmulq_laneq_f32(sig, sig, 2); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} ++ float32x4_t sigX12 = vmulq_laneq_f32(sigX01, sig, 2); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} ++ float32x4_t sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ sigX = vsetq_lane_f32(vgetq_lane_f32(sigX02, 0), sigX, 1); // sig[0]*sig[2] at 1 ++ sigX = vsetq_lane_f32(vgetq_lane_f32(sigX02, 1), sigX, 2); // sig[1]*sig[2] at 2 ++ sigX = vsetq_lane_f32(vgetq_lane_f32(sigX12, 0), sigX, 3); // sig[0]*sig[1]*sig[2] at 3 ++ ++ // dot first order: sig against the first 4 floats at mmr_idx, sigX against the next 4 ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 0*4]); // NOTE(review): mmr_idx is used as a float offset, not scaled by 4; confirm the coefficient table stores it that way ++ ps = vmulq_f32(sig, mmr_coeffs); // all 4 lanes are multiplied, so lane 3 of sig takes part in the sum ++ s += vaddvq_f32(ps); ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 1*4]); ++ ps = vmulq_f32(sigX, mmr_coeffs); ++ s += vaddvq_f32(ps); ++ ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { // second-order terms: element-wise squares of sig and sigX ++ float32x4_t sig2 = vmulq_f32(sig, sig); ++ float32x4_t sigX2 = vmulq_f32(sigX, sigX); ++ ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 2*4]); ++ ps = vmulq_f32(sig2, mmr_coeffs); ++ s += vaddvq_f32(ps); ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 3*4]); ++ ps = vmulq_f32(sigX2, mmr_coeffs); ++ s += vaddvq_f32(ps); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { // third-order terms: element-wise cubes, only reachable inside the second-order branch ++ float32x4_t sig3 = vmulq_f32(sig2, sig); ++ float32x4_t sigX3 = vmulq_f32(sigX2, sigX); ++ ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 4*4]); ++ ps = vmulq_f32(sig3, mmr_coeffs); ++ s += vaddvq_f32(ps); ++ mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 5*4]); ++ ps = vmulq_f32(sigX3, mmr_coeffs); ++ s += vaddvq_f32(ps); ++ } ++ } ++ ++ return s; ++} ++ ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) // limits a to the range [b, c]: FFMAX against b first, then FFMIN against c ++inline static float32x4_t 
reshape_dovi_iptpqc2(float32x4_t sig, const TonemapIntParams *ctx) ++{ ++ int has_mmr_poly; ++ float s; ++ ++ float *src_dovi_params = ctx->dovi_pbuf; ++ float *src_dovi_pivots = ctx->dovi_pbuf + 24; ++ float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4*; dovi_pbuf is read as four tables at float offsets 0 (params), 24 (pivots), 48 (coeffs) and 144 (mmr) ++ float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* ++ ++ float* dovi_params_i = src_dovi_params + 0*8; // component 0 (I): strides are 8 params, 8 pivots, 8 coefficient vectors and 48 mmr vectors of 4 floats ++ float* dovi_pivots_i = src_dovi_pivots + 0*8; ++ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* ++ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* ++ int dovi_num_pivots_i = dovi_params_i[0]; // the params table holds floats; these are converted to int on assignment ++ int dovi_has_mmr_i = dovi_params_i[1]; ++ int dovi_has_poly_i = dovi_params_i[2]; ++ int dovi_mmr_single_i = dovi_params_i[3]; ++ int dovi_min_order_i = dovi_params_i[4]; ++ int dovi_max_order_i = dovi_params_i[5]; ++ float dovi_lo_i = dovi_params_i[6]; // lower clamp bound for the reshaped value ++ float dovi_hi_i = dovi_params_i[7]; // upper clamp bound for the reshaped value ++ ++ float* dovi_params_p = src_dovi_params + 1*8; // component 1 (P): its pivots and pivot count are not read ++ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* ++ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* ++ int dovi_has_mmr_p = dovi_params_p[1]; ++ int dovi_has_poly_p = dovi_params_p[2]; ++ int dovi_mmr_single_p = dovi_params_p[3]; ++ int dovi_min_order_p = dovi_params_p[4]; ++ int dovi_max_order_p = dovi_params_p[5]; ++ float dovi_lo_p = dovi_params_p[6]; ++ float dovi_hi_p = dovi_params_p[7]; ++ ++ float* dovi_params_t = src_dovi_params + 2*8; // component 2 (T): its pivots and pivot count are not read ++ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* ++ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* ++ int dovi_has_mmr_t = dovi_params_t[1]; ++ int dovi_has_poly_t = dovi_params_t[2]; ++ int dovi_mmr_single_t = dovi_params_t[3]; ++ int dovi_min_order_t = dovi_params_t[4]; ++ int dovi_max_order_t = dovi_params_t[5]; ++ float dovi_lo_t = dovi_params_t[6]; ++ float dovi_hi_t = dovi_params_t[7]; ++ ++ float32x4_t coeffs, result; ++ ++ // reshape I: lane 0; with more than 2 pivots the coefficient vector is selected by comparing s against the pivots ++ s = vgetq_lane_f32(sig, 0); ++ result = sig; // lanes 0..2 are overwritten one at a time below; lane 3 is returned as it came in ++ if (dovi_num_pivots_i > 2) { // three levels of mix_float32x4 with weights 0.0 or 1.0 (the comparison result) pick one of 8 coefficient vectors ++ float32x4_t m01 = 
mix_float32x4(vld1q_f32(dovi_coeffs_i), vld1q_f32(dovi_coeffs_i + 4), vdupq_n_f32(s >= dovi_pivots_i[0])); ++ float32x4_t m23 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 2*4), vld1q_f32(dovi_coeffs_i + 3*4), vdupq_n_f32(s >= dovi_pivots_i[2])); ++ float32x4_t m0123 = mix_float32x4(m01, m23, vdupq_n_f32(s >= dovi_pivots_i[1])); ++ float32x4_t m45 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 4*4), vld1q_f32(dovi_coeffs_i + 5*4), vdupq_n_f32(s >= dovi_pivots_i[4])); ++ float32x4_t m67 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 6*4), vld1q_f32(dovi_coeffs_i + 7*4), vdupq_n_f32(s >= dovi_pivots_i[6])); ++ float32x4_t m4567 = mix_float32x4(m45, m67, vdupq_n_f32(s >= dovi_pivots_i[5])); ++ coeffs = mix_float32x4(m0123, m4567, vdupq_n_f32(s >= dovi_pivots_i[3])); ++ } else { ++ coeffs = vld1q_f32(dovi_coeffs_i); ++ } ++ ++ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ++ if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_i, ++ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ ++ result = vsetq_lane_f32(CLAMP(s, dovi_lo_i, dovi_hi_i), result, 0); ++ ++ // reshape P: lane 1, always with the first coefficient vector of component 1 (no pivot selection) ++ s = vgetq_lane_f32(sig, 1); // s is read from the untouched input sig, not from result ++ coeffs = vld1q_f32(dovi_coeffs_p); ++ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ ++ if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) // polynomial path when both flags are set and coeffs lane 3 equals 0, or when only the poly flag is set; otherwise MMR ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_p, ++ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); // NOTE(review): result lane 0 already holds the reshaped and clamped I value here; confirm MMR for P is meant to read it rather than the original sig ++ ++ result = vsetq_lane_f32(CLAMP(s, dovi_lo_p, dovi_hi_p), result, 1); ++ ++ // reshape T: lane 2, same scheme as P using the tables of component 2 ++ s = vgetq_lane_f32(sig, 2); ++ coeffs = vld1q_f32(dovi_coeffs_t); ++ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ++ if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_t, // NOTE(review): result lanes 0 and 1 are already reshaped at this point; confirm as for P ++ 
dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ++ result = vsetq_lane_f32(CLAMP(s, dovi_lo_t, dovi_hi_t), result, 2); ++ ++ return result; ++} ++ ++inline static void ycc2rgbx4(float32x4_t* dy, float32x4_t* dcb, float32x4_t* dcr, ++ float32x4_t y, float32x4_t cb, float32x4_t cr, ++ const double nonlinear[3][3], const float ycc_offset[3]) ++{ ++ *dy = vmulq_n_f32(y, (float)nonlinear[0][0]); ++ *dy = vfmaq_n_f32(*dy, cb, (float)nonlinear[0][1]); ++ *dy = vfmaq_n_f32(*dy, cr, (float)nonlinear[0][2]); ++ *dy = vsubq_f32(*dy, vdupq_n_f32(ycc_offset[0])); ++ ++ *dcb = vmulq_n_f32(y, (float)nonlinear[1][0]); ++ *dcb = vfmaq_n_f32(*dcb, cb, (float)nonlinear[1][1]); ++ *dcb = vfmaq_n_f32(*dcb, cr, (float)nonlinear[1][2]); ++ *dcb = vsubq_f32(*dcb, vdupq_n_f32(ycc_offset[1])); ++ ++ *dcr = vmulq_n_f32(y, (float)nonlinear[2][0]); ++ *dcr = vfmaq_n_f32(*dcr, cb, (float)nonlinear[2][1]); ++ *dcr = vfmaq_n_f32(*dcr, cr, (float)nonlinear[2][2]); ++ *dcr = vsubq_f32(*dcr, vdupq_n_f32(ycc_offset[2])); ++} ++ ++inline static void lms2rgbx4(float32x4_t* dl, float32x4_t* dm, float32x4_t* ds, ++ float32x4_t l, float32x4_t m, float32x4_t s, ++ const double lms2rgb_matrix[3][3]) ++{ ++ *dl = vmulq_n_f32(l, (float)lms2rgb_matrix[0][0]); ++ *dl = vfmaq_n_f32(*dl, m, (float)lms2rgb_matrix[0][1]); ++ *dl = vfmaq_n_f32(*dl, s, (float)lms2rgb_matrix[0][2]); ++ ++ *dm = vmulq_n_f32(l, (float)lms2rgb_matrix[1][0]); ++ *dm = vfmaq_n_f32(*dm, m, (float)lms2rgb_matrix[1][1]); ++ *dm = vfmaq_n_f32(*dm, s, (float)lms2rgb_matrix[1][2]); ++ ++ *ds = vmulq_n_f32(l, (float)lms2rgb_matrix[2][0]); ++ *ds = vfmaq_n_f32(*ds, m, (float)lms2rgb_matrix[2][1]); ++ *ds = vfmaq_n_f32(*ds, s, (float)lms2rgb_matrix[2][2]); ++} ++ +static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, + int16_t *r_out, int16_t *g_out, int16_t *b_out, + float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, @@ -311,12 +517,12 @@ Index: 
FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +} +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS + -+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_dovi_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint8_t *rdsty = dsty; @@ -331,20 +537,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int remainw = width & 6; + + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ const float in_rng = (float)((1 << in_depth) - 1); + + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; @@ -357,15 +554,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = 
av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; @@ -386,6 +574,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ float32x4_t ipt0, ipt1, ipt2, ipt3; ++ float32x4_t ia1, ib1, ia2, ib2; ++ float32x4_t ix4, px4, tx4; ++ float32x4_t lx4, mx4, sx4; ++ float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { @@ -397,34 +591,159 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + ux4 = vld1_u16(srcu + (x >> 1)); + vx4 = vld1_u16(srcv + (x >> 1)); + -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); + vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); -+ -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); -+ -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); -+ -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); -+ -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); -+ -+ g1x8 = 
vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++ y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8))); ++ y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8))); ++ y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8))); ++ y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8))); ++ ++ ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8))); ++ ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8))); ++ vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8))); ++ vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8))); ++ ++ y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(in_rng)); ++ y0x4b = vdivq_f32(y0x4b, vdupq_n_f32(in_rng)); ++ y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(in_rng)); ++ y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(in_rng)); ++ ux4a = vdivq_f32(ux4a, vdupq_n_f32(in_rng)); ++ ux4b = vdivq_f32(ux4b, vdupq_n_f32(in_rng)); ++ vx4a = vdivq_f32(vx4a, vdupq_n_f32(in_rng)); ++ vx4b = vdivq_f32(vx4b, vdupq_n_f32(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = vzip1q_f32(y0x4a, ux4a); ++ ia2 = vzip2q_f32(y0x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ 
++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, 28672.0f); ++ gx4a = vmulq_n_f32(gx4a, 28672.0f); ++ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ ++ // Reshape y0x4b ++ ia1 = vzip1q_f32(y0x4b, ux4b); ++ ia2 = vzip2q_f32(y0x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, 28672.0f); ++ gx4b = vmulq_n_f32(gx4b, 28672.0f); ++ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ ++ r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ ++ // Reshape y1x4a ++ ia1 = vzip1q_f32(y1x4a, ux4a); ++ ia2 = vzip2q_f32(y1x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = 
vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, 28672.0f); ++ gx4a = vmulq_n_f32(gx4a, 28672.0f); ++ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ ++ // Reshape y1x4b ++ ia1 = vzip1q_f32(y1x4b, ux4b); ++ ia2 = vzip2q_f32(y1x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ 
ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, 28672.0f); ++ gx4b = vmulq_n_f32(gx4b, 28672.0f); ++ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ ++ r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); + + tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, + params->lin_lut, params->tonemap_lut, params->delin_lut, @@ -546,7 +865,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, + rsrcy, rsrcu, rsrcv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -555,18 +874,20 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const 
uint16_t *rsrcv = srcv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -608,11 +929,10 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); + uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); + uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; ++ uint16x4_t ux4, vx4; + + int16x8_t r0ox8, g0ox8, b0ox8; + int16x8_t y0ox8; @@ -623,7 +943,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); @@ -631,32 +950,22 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x 
>> 1)); ++ + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); -+ -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); -+ -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + + r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); + r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); @@ -780,17 +1089,14 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); + uox4 = vshrq_n_s32(uox4, 21); + uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_lane_u32((uint32_t *) &dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); + + vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); + vox4 = vshrq_n_s32(vox4, 21); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); -+ -+ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); ++ vst1_lane_u32((uint32_t *) &dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); + } + } + @@ -798,32 +1104,32 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + if (remainw) { + int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += 
offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ const uint16_t *rsrcuv = srcuv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -865,20 +1171,22 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); + uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); + uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x4_t ux4, vx4; ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; + + int16x8_t r0ox8, g0ox8, b0ox8; -+ uint16x8_t y0ox8; ++ int16x8_t y0ox8; + int32x4_t 
r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; + int32x4_t y0oax4, y0obx4; + + int16x8_t r1ox8, g1ox8, b1ox8; -+ uint16x8_t y1ox8; ++ int16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); @@ -886,22 +1194,32 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ ux4 = vld1_u16(srcu + (x >> 1)); -+ vx4 = vld1_u16(srcv + (x >> 1)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); + -+ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); -+ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), 
vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + + r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); + r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); @@ -950,18 +1268,19 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ y0oax4 = vshrq_n_s32(y0oax4, 21); + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + + y0obx4 = vmulq_n_s32(r0obx4, cry); + y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); + y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); + y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); + y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ vst1q_u16(&dsty[x], y0ox8); ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + + r1ox8 = vld1q_s16(r1); + g1ox8 = vld1q_s16(g1); @@ -979,18 +1298,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); + y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); + y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); + y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + + y1obx4 = vmulq_n_s32(r1obx4, cry); + y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); + y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); + y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); + y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); 
++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); + + ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); + ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); @@ -1022,16 +1341,19 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); + uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); + uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vshrq_n_s32(uox4, 21); + uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); + + vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 19); ++ vox4 = vshrq_n_s32(vox4, 21); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); + } + } + @@ -1039,53 +1361,44 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + if (remainw) { + int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; ++ rdstuv += offset; + rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int 
width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_dovi_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS + uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd + int remainw = width & 6; + + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ const float in_rng = (float)((1 << in_depth) - 1); + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; @@ -1098,17 +1411,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t 
cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x4_t ux4, vx4; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; @@ -1122,63 +1425,182 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uint16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ float32x4_t ipt0, ipt1, ipt2, ipt3; ++ float32x4_t ia1, ib1, ia2, ib2; ++ float32x4_t ix4, px4, tx4; ++ float32x4_t lx4, mx4, sx4; ++ float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } -+ y0x8 = vsubq_u16(y0x8, 
in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); -+ -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); -+ -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); -+ -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); -+ -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); -+ -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); -+ -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++ y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8))); ++ y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8))); ++ y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8))); ++ y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8))); ++ ++ ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8))); ++ ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8))); ++ vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8))); ++ vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8))); ++ ++ y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(in_rng)); ++ y0x4b = 
vdivq_f32(y0x4b, vdupq_n_f32(in_rng)); ++ y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(in_rng)); ++ y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(in_rng)); ++ ux4a = vdivq_f32(ux4a, vdupq_n_f32(in_rng)); ++ ux4b = vdivq_f32(ux4b, vdupq_n_f32(in_rng)); ++ vx4a = vdivq_f32(vx4a, vdupq_n_f32(in_rng)); ++ vx4b = vdivq_f32(vx4b, vdupq_n_f32(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = vzip1q_f32(y0x4a, ux4a); ++ ia2 = vzip2q_f32(y0x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, 28672.0f); ++ gx4a = vmulq_n_f32(gx4a, 28672.0f); ++ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ ++ // Reshape y0x4b ++ ia1 = vzip1q_f32(y0x4b, ux4b); ++ ia2 = vzip2q_f32(y0x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), 
vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, 28672.0f); ++ gx4b = vmulq_n_f32(gx4b, 28672.0f); ++ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ ++ r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); ++ ++ // Reshape y1x4a ++ ia1 = vzip1q_f32(y1x4a, ux4a); ++ ia2 = vzip2q_f32(y1x4a, ux4a); ++ ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = 
vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = vmulq_n_f32(rx4a, 28672.0f); ++ gx4a = vmulq_n_f32(gx4a, 28672.0f); ++ bx4a = vmulq_n_f32(bx4a, 28672.0f); ++ ++ // Reshape y1x4b ++ ia1 = vzip1q_f32(y1x4b, ux4b); ++ ia2 = vzip2q_f32(y1x4b, ux4b); ++ ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f)); ++ ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1)); ++ ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1)); ++ ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2)); ++ ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ia1 = vtrn1q_f32(ipt0, ipt1); ++ ia2 = vtrn1q_f32(ipt2, ipt3); ++ ib1 = vtrn2q_f32(ipt0, ipt1); ++ ib2 = vtrn2q_f32(ipt2, ipt3); ++ ++ ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2)); ++ px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2)); ++ tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = vmulq_n_f32(rx4b, 28672.0f); ++ gx4b = vmulq_n_f32(gx4b, 28672.0f); ++ bx4b = vmulq_n_f32(bx4b, 28672.0f); ++ ++ r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b))); ++ g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b))); ++ b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b))); + + tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, + params->lin_lut, params->tonemap_lut, 
params->delin_lut, @@ -1205,11 +1627,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + + y0obx4 = vmulq_n_s32(r0obx4, cry); + y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); + y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); + y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); + + r1ox8 = vld1q_s16(r1); + g1ox8 = vld1q_s16(g1); @@ -1227,11 +1656,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); + y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); + y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + + y1obx4 = vmulq_n_s32(r1obx4, cry); + y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); + y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); + y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + + ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); + ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); @@ -1263,50 +1699,16 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); + uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); + uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); + + vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, 
cbv); -+ -+ switch(out_depth) { -+ default: -+ case 10: -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ uox4 = vshrq_n_s32(uox4, 19); -+ vox4 = vshrq_n_s32(vox4, 19); -+ break; -+ case 16: -+ y0oax4 = vshrq_n_s32(y0oax4, 13); -+ y0obx4 = vshrq_n_s32(y0obx4, 13); -+ y1oax4 = vshrq_n_s32(y1oax4, 13); -+ y1obx4 = vshrq_n_s32(y1obx4, 13); -+ uox4 = vshrq_n_s32(uox4, 13); -+ vox4 = vshrq_n_s32(vox4, 13); -+ break; -+ } -+ -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vox4 = vshrq_n_s32(vox4, 19); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); -+ vst1q_u16(&dsty[x], y0ox8); -+ -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); -+ -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); -+ -+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); + } + } + @@ -1314,1398 +1716,2446 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + if (remainw) { + int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, 
params); + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} -Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h -@@ -0,0 +1,54 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H -+#define AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H -+ -+#include "libavfilter/vf_tonemapx.h" -+ -+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); + +void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t 
*srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params); ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; + -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+#endif // AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H -Index: FFmpeg/libavfilter/allfilters.c -=================================================================== ---- FFmpeg.orig/libavfilter/allfilters.c -+++ FFmpeg/libavfilter/allfilters.c -@@ -498,6 +498,7 @@ extern const AVFilter ff_vf_tmedian; - extern const AVFilter ff_vf_tmidequalizer; - extern const AVFilter ff_vf_tmix; - extern const AVFilter ff_vf_tonemap; -+extern const AVFilter ff_vf_tonemapx; - extern const AVFilter ff_vf_tonemap_cuda; - extern const AVFilter ff_vf_tonemap_opencl; - extern const AVFilter ff_vf_tonemap_vaapi; -Index: FFmpeg/libavfilter/colorspace.c -=================================================================== ---- FFmpeg.orig/libavfilter/colorspace.c -+++ FFmpeg/libavfilter/colorspace.c -@@ -17,6 +17,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "libavutil/avassert.h" - #include "libavutil/frame.h" - #include 
"libavutil/mastering_display_metadata.h" - #include "libavutil/pixdesc.h" -@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { - float inverse_eotf_bt1886(float x) { - return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; - } -+ -+int ff_get_range_off(int *off, int *y_rng, int *uv_rng, -+ enum AVColorRange rng, int depth) -+{ -+ switch (rng) { -+ case AVCOL_RANGE_UNSPECIFIED: -+ case AVCOL_RANGE_MPEG: -+ *off = 16 << (depth - 8); -+ *y_rng = 219 << (depth - 8); -+ *uv_rng = 224 << (depth - 8); -+ break; -+ case AVCOL_RANGE_JPEG: -+ *off = 0; -+ *y_rng = *uv_rng = (256 << (depth - 8)) - 1; -+ break; -+ default: -+ return AVERROR(EINVAL); -+ } ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ return 0; -+} ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb) -+{ -+#define N (yuv2rgb ? m : n) -+#define M (yuv2rgb ? n : m) -+ int rng, n, m, o; -+ int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); -+ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { -+ for (m = 0; m < 3; m++) { -+ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? 
rng : 28672)); -+ for (o = 1; o < 8; o++) -+ out[N][M][o] = out[N][M][0]; -+ } -+ } -+#undef N -+#undef M ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ if (yuv2rgb) { -+ av_assert2(out[0][1][0] == 0); -+ av_assert2(out[2][2][0] == 0); -+ av_assert2(out[0][0][0] == out[1][0][0]); -+ av_assert2(out[0][0][0] == out[2][0][0]); -+ } else { -+ av_assert2(out[1][2][0] == out[2][0][0]); -+ } -+} -Index: FFmpeg/libavfilter/colorspace.h -=================================================================== ---- FFmpeg.orig/libavfilter/colorspace.h -+++ FFmpeg/libavfilter/colorspace.h -@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); - float inverse_eotf_arib_b67(float x); - float inverse_eotf_bt1886(float x); - -+int ff_get_range_off(int *off, int *y_rng, int *uv_rng, -+ enum AVColorRange rng, int depth); -+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb); - #endif -Index: FFmpeg/libavfilter/vf_tonemapx.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1261 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x4_t ux4, vx4; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+/** -+ * @file -+ * tonemap algorithms -+ */ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+#include -+#include ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+#include 
"libavutil/avassert.h" -+#include "libavutil/imgutils.h" -+#include "libavutil/internal.h" -+#include "libavutil/mem_internal.h" -+#include "libavutil/opt.h" -+#include "libavutil/cpu.h" ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + -+#include "vf_tonemapx.h" ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS -+# if ARCH_AARCH64 -+# if HAVE_INTRINSICS_NEON -+# include "libavutil/aarch64/cpu.h" -+# include "aarch64/vf_tonemapx_intrin_neon.h" -+# endif -+# endif // ARCH_AARCH64 -+# if ARCH_X86 -+# include "libavutil/x86/cpu.h" -+# if HAVE_INTRINSICS_SSE42 -+# include "x86/vf_tonemapx_intrin_sse.h" -+# endif -+# if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3 -+# include "x86/vf_tonemapx_intrin_avx.h" -+# endif -+# endif // ARCH_X86 -+#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+enum TonemapAlgorithm { -+ TONEMAP_NONE, -+ TONEMAP_LINEAR, -+ TONEMAP_GAMMA, -+ TONEMAP_CLIP, -+ TONEMAP_REINHARD, -+ TONEMAP_HABLE, -+ TONEMAP_MOBIUS, -+ TONEMAP_BT2390, -+ TONEMAP_MAX, -+}; ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+typedef struct TonemapxContext { -+ const AVClass *class; ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = 
vaddq_u16(r1x8, rndx8); + -+ enum TonemapAlgorithm tonemap; -+ enum AVColorTransferCharacteristic trc; -+ enum AVColorSpace spc; -+ enum AVColorPrimaries pri; -+ enum AVColorRange range; -+ enum AVPixelFormat format; -+ char *format_str; -+ double param; -+ double desat; -+ double peak; ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ const AVLumaCoefficients *coeffs, *ocoeffs; ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ double lut_peak; -+ float *lin_lut; -+ float *tonemap_lut; -+ uint16_t *delin_lut; -+ int in_yuv_off, out_yuv_off; ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; -+ DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; -+ DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = 
vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+ void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+ void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+} TonemapxContext; -+ -+typedef struct ThreadData { -+ AVFrame *in, *out; -+ const AVPixFmtDescriptor *desc, *odesc; -+ double peak; -+} ThreadData; -+ -+static const enum AVPixelFormat in_pix_fmts[] = { -+ AV_PIX_FMT_YUV420P10, -+ AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, -+ AV_PIX_FMT_NONE, -+}; ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+static const enum AVPixelFormat out_pix_fmts[] = { -+ AV_PIX_FMT_YUV420P, -+ AV_PIX_FMT_YUV420P10, -+ AV_PIX_FMT_NV12, -+ AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, -+}; ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+static int out_format_is_supported(enum AVPixelFormat fmt) -+{ -+ 
int i; ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); + -+ for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++) -+ if (out_pix_fmts[i] == fmt) -+ return 1; -+ return 0; -+} ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+static float hable(float in) -+{ -+ float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; -+ return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; -+} ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+static float mobius(float in, float j, double peak) -+{ -+ float a, b; ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ if (in <= j) -+ return in; ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); -+ b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); -+} ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + -+static float bt2390(float s, float peak) -+{ -+ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); -+ float scale = 1.0f / peak_pq; ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), 
vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ // SDR peak -+ float dst_peak = 1.0f; -+ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; -+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ float ks = 1.5f * maxLum - 0.5f; -+ float tb = (s_pq - ks) / (1.0f - ks); -+ float tb2 = tb * tb; -+ float tb3 = tb2 * tb; -+ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + -+ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + -+ (-2.0f * tb3 + 3.0f * tb2) * maxLum; -+ float sig = (s_pq < ks) ? 
s_pq : pb; ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); -+} ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); + -+static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) -+{ -+ switch(alg) { -+ default: -+ case TONEMAP_NONE: -+ // do nothing -+ break; -+ case TONEMAP_LINEAR: -+ sig = sig * param / peak; -+ break; -+ case TONEMAP_GAMMA: -+ sig = sig > 0.05f -+ ? 
pow(sig / peak, 1.0f / param) -+ : sig * pow(0.05f / peak, 1.0f / param) / 0.05f; -+ break; -+ case TONEMAP_CLIP: -+ sig = av_clipf(sig * param, 0, 1.0f); -+ break; -+ case TONEMAP_HABLE: -+ sig = hable(sig) / hable(peak); -+ break; -+ case TONEMAP_REINHARD: -+ sig = sig / (sig + param) * (peak + param) / peak; -+ break; -+ case TONEMAP_MOBIUS: -+ sig = mobius(sig, param, peak); -+ break; -+ case TONEMAP_BT2390: -+ sig = bt2390(sig, peak); -+ break; ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 19); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); ++ } + } + -+ return sig; ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+static float linearize(float x, enum AVColorTransferCharacteristic trc_src) ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ if (trc_src == AVCOL_TRC_SMPTE2084) -+ return eotf_st2084(x, REFERENCE_WHITE_ALT); -+ else if (trc_src == AVCOL_TRC_ARIB_STD_B67) -+ return eotf_arib_b67(x); -+ else -+ return x; -+} ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not 
divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; + -+static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst) -+{ -+ if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10) -+ return inverse_eotf_bt1886(x); -+ else -+ return x; -+} ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, -+ enum AVColorTransferCharacteristic trc_dst) -+{ -+ int i; ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + -+ if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) -+ return AVERROR(ENOMEM); -+ if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t)))) -+ return AVERROR(ENOMEM); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ for (i = 0; i < 32768; i++) { -+ double v1 = (i - 2048.0f) / 28672.0f; -+ double v2 = i / 32767.0f; -+ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); -+ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); -+ } -+ -+ return 0; -+} ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+static int 
compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) -+{ -+ int i; -+ double peak = s->lut_peak; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) -+ return AVERROR(ENOMEM); ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+ for (i = 0; i < 32768; i++) { -+ double v = (i - 2048.0f) / 28672.0f; -+ double sig = linearize(v, trc_src); -+ float mapped = mapsig(s->tonemap, sig, peak, s->param); -+ s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? 
mapped / sig : 0.0f; -+ } ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ return 0; -+} ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); + -+static int compute_yuv_coeffs(TonemapxContext *s, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, -+ const AVPixFmtDescriptor *idesc, -+ const AVPixFmtDescriptor *odesc, -+ enum AVColorRange irng, -+ enum AVColorRange orng) -+{ -+ double rgb2yuv[3][3], yuv2rgb[3][3]; -+ int res; -+ int y_rng, uv_rng; ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + 
-+ res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng, -+ irng, idesc->comp[0].depth); -+ if (res < 0) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported input color range %d (%s)\n", -+ irng, av_color_range_name(irng)); -+ return res; -+ } ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + -+ ff_fill_rgb2yuv_table(coeffs, rgb2yuv); -+ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); -+ ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, -+ y_rng, uv_rng, 1); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, -+ orng, odesc->comp[0].depth); -+ if (res < 0) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported output color range %d (%s)\n", -+ orng, av_color_range_name(orng)); -+ return res; -+ } ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+ ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, -+ y_rng, uv_rng, 0); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ return 0; -+} ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+static int compute_rgb_coeffs(TonemapxContext *s, -+ enum AVColorPrimaries iprm, -+ enum AVColorPrimaries oprm) -+{ -+ double rgb2xyz[3][3], xyz2rgb[3][3]; -+ const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm); -+ const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm); ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ if (!iprm_desc) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported input color primaries %d 
(%s)\n", -+ iprm, av_color_primaries_name(iprm)); -+ return AVERROR(EINVAL); -+ } -+ if (!oprm_desc) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported output color primaries %d (%s)\n", -+ oprm, av_color_primaries_name(oprm)); -+ return AVERROR(EINVAL); -+ } ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); -+ ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); -+ ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); -+ ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ return 0; -+} ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ int16_t sig; -+ float mapval, r_lin, g_lin, b_lin; ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ /* load values */ -+ *r_out = r_in; -+ *g_out = g_in; -+ *b_out = b_in; ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, 
out_rndx4); + -+ /* pick the brightest component, reducing the value range as necessary -+ * to keep the entire signal in range and preventing discoloration due to -+ * out-of-bounds clipping */ -+ sig = FFMAX3(r_in, g_in, b_in); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); + -+ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; -+ g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)]; -+ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ if (!rgb2rgb_passthrough) { -+ r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; -+ g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin; -+ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; -+ } ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) -+ /* desaturate to prevent unnatural colors */ -+ if (desat > 0) { -+ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; -+ float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); -+ r_lin = MIX(r_lin, luma, overbright); -+ g_lin = MIX(g_lin, luma, overbright); -+ b_lin = MIX(b_lin, luma, overbright); -+ } ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); + -+ r_lin *= mapval; -+ g_lin *= mapval; -+ b_lin *= mapval; -+#undef MIX ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ 
y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); + -+ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; -+ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; -+ *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; -+} ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+// See also libavfilter/colorspacedsp_template.c -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = 
(*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) 
>> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ switch(out_depth) { ++ default: ++ case 10: ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ uox4 = vshrq_n_s32(uox4, 19); ++ vox4 = vshrq_n_s32(vox4, 19); ++ break; ++ case 16: ++ y0oax4 = vshrq_n_s32(y0oax4, 13); ++ y0obx4 = vshrq_n_s32(y0obx4, 13); ++ y1oax4 = vshrq_n_s32(y1oax4, 13); ++ y1obx4 = vshrq_n_s32(y1obx4, 13); ++ uox4 = vshrq_n_s32(uox4, 13); ++ vox4 = vshrq_n_s32(vox4, 13); ++ break; ++ } + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, 
params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); ++ vst1q_u16(&dsty[x], y0ox8); + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); -+#undef AVG ++ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), 
vqmovun_s32(uvobx4)), out_sh2x8)); + } + } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} +Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +@@ -0,0 +1,68 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++#ifndef AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H ++#define AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++#include "libavfilter/vf_tonemapx.h" + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++void tonemap_frame_dovi_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++void tonemap_frame_420p10_2_420p_neon(uint8_t 
*dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++void tonemap_frame_dovi_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int 
dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++#endif // AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H +Index: FFmpeg/libavfilter/allfilters.c +=================================================================== +--- 
FFmpeg.orig/libavfilter/allfilters.c ++++ FFmpeg/libavfilter/allfilters.c +@@ -498,6 +498,7 @@ extern const AVFilter ff_vf_tmedian; + extern const AVFilter ff_vf_tmidequalizer; + extern const AVFilter ff_vf_tmix; + extern const AVFilter ff_vf_tonemap; ++extern const AVFilter ff_vf_tonemapx; + extern const AVFilter ff_vf_tonemap_cuda; + extern const AVFilter ff_vf_tonemap_opencl; + extern const AVFilter ff_vf_tonemap_vaapi; +Index: FFmpeg/libavfilter/colorspace.c +=================================================================== +--- FFmpeg.orig/libavfilter/colorspace.c ++++ FFmpeg/libavfilter/colorspace.c +@@ -17,6 +17,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "libavutil/avassert.h" + #include "libavutil/frame.h" + #include "libavutil/mastering_display_metadata.h" + #include "libavutil/pixdesc.h" +@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { + float inverse_eotf_bt1886(float x) { + return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; + } + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth) ++{ ++ switch (rng) { ++ case AVCOL_RANGE_UNSPECIFIED: ++ case AVCOL_RANGE_MPEG: ++ *off = 16 << (depth - 8); ++ *y_rng = 219 << (depth - 8); ++ *uv_rng = 224 << (depth - 8); ++ break; ++ case AVCOL_RANGE_JPEG: ++ *off = 0; ++ *y_rng = *uv_rng = (256 << (depth - 8)) - 1; ++ break; ++ default: ++ return AVERROR(EINVAL); ++ } + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = 
av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ return 0; ++} + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); -+#undef AVG ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb) ++{ ++#define N (yuv2rgb ? m : n) ++#define M (yuv2rgb ? n : m) ++ int rng, n, m, o; ++ int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); ++ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { ++ for (m = 0; m < 3; m++) { ++ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672)); ++ for (o = 1; o < 8; o++) ++ out[N][M][o] = out[N][M][0]; + } + } ++#undef N ++#undef M ++ ++ if (yuv2rgb) { ++ av_assert2(out[0][1][0] == 0); ++ av_assert2(out[2][2][0] == 0); ++ av_assert2(out[0][0][0] == out[1][0][0]); ++ av_assert2(out[0][0][0] == out[2][0][0]); ++ } else { ++ av_assert2(out[1][2][0] == out[2][0][0]); ++ } +} +Index: FFmpeg/libavfilter/colorspace.h +=================================================================== +--- FFmpeg.orig/libavfilter/colorspace.h ++++ FFmpeg/libavfilter/colorspace.h +@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); + float inverse_eotf_arib_b67(float x); + float inverse_eotf_bt1886(float x); + ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth); ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb); + #endif +Index: FFmpeg/libavfilter/vf_tonemapx.c +=================================================================== +--- /dev/null ++++ 
FFmpeg/libavfilter/vf_tonemapx.c +@@ -0,0 +1,1778 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++/** ++ * @file ++ * tonemap algorithms ++ */ + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++#include ++#include + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++#include "libavutil/avassert.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/internal.h" ++#include "libavutil/mem_internal.h" ++#include "libavutil/opt.h" ++#include 
"libavutil/cpu.h" + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++#include "vf_tonemapx.h" + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS ++# if ARCH_AARCH64 ++# if HAVE_INTRINSICS_NEON ++# include "libavutil/aarch64/cpu.h" ++# include "aarch64/vf_tonemapx_intrin_neon.h" ++# endif ++# endif // ARCH_AARCH64 ++# if ARCH_X86 ++# include "libavutil/x86/cpu.h" ++# if HAVE_INTRINSICS_SSE42 ++# include "x86/vf_tonemapx_intrin_sse.h" ++# endif ++# if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3 ++# include "x86/vf_tonemapx_intrin_avx.h" ++# endif ++# endif // ARCH_X86 ++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++enum 
TonemapAlgorithm { ++ TONEMAP_NONE, ++ TONEMAP_LINEAR, ++ TONEMAP_GAMMA, ++ TONEMAP_CLIP, ++ TONEMAP_REINHARD, ++ TONEMAP_HABLE, ++ TONEMAP_MOBIUS, ++ TONEMAP_BT2390, ++ TONEMAP_MAX, ++}; + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++typedef struct TonemapxContext { ++ const AVClass *class; + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ enum TonemapAlgorithm tonemap; ++ enum AVColorTransferCharacteristic trc; ++ enum AVColorSpace spc; ++ enum AVColorPrimaries pri; ++ enum AVColorRange range; ++ enum AVPixelFormat format; ++ char *format_str; ++ double param; ++ double desat; ++ double peak; ++ int apply_dovi; + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ const 
AVLumaCoefficients *coeffs, *ocoeffs; + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; + -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ struct DoviMetadata *dovi; + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); -+ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); -+#undef AVG -+ } -+ } -+} ++ DECLARE_ALIGNED(16, float, dovi_pbuf)[3*(params_sz+pivots_sz+coeffs_sz+mmr_sz)]; ++ DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; ++ DECLARE_ALIGNED(16, double, lms2rgb_matrix)[3][3]; ++ DECLARE_ALIGNED(16, float, ycc_offset)[3]; + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 
128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; ++ int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; ++ void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ void 
(*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ void (*tonemap_func_dovi8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ void (*tonemap_func_dovi10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * 
cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++} TonemapxContext; + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++typedef struct ThreadData { ++ AVFrame *in, *out; ++ const AVPixFmtDescriptor *desc, *odesc; ++ double peak; ++} ThreadData; + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++static const enum AVPixelFormat in_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P10, ++ AV_PIX_FMT_P010, ++ AV_PIX_FMT_P016, ++ AV_PIX_FMT_NONE, ++}; + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++static const enum AVPixelFormat out_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_YUV420P10, ++ AV_PIX_FMT_NV12, ++ AV_PIX_FMT_P010, ++ AV_PIX_FMT_P016, ++}; + -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * 
cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++const double dovi_lms2rgb_matrix[3][3] = ++ { ++ { 3.06441879, -2.16597676, 0.10155818}, ++ {-0.65612108, 1.78554118, -0.12943749}, ++ { 0.01736321, -0.04725154, 1.03004253}, ++ }; + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG ++static void update_dovi_buf(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ float coeffs_data[8][4] = {0}; ++ float mmr_packed_data[8*6][4] = {0}; ++ int c, i, j, k; ++ ++ for (c = 0; c < 3; c++) { ++ int has_poly = 0, has_mmr = 0, mmr_single = 1; ++ int mmr_idx = 0, min_order = 3, max_order = 1; ++ const struct ReshapeData *comp = &s->dovi->comp[c]; ++ if (!comp->num_pivots) ++ continue; ++ av_assert0(comp->num_pivots >= 2 && comp->num_pivots <= 9); ++ ++ memset(coeffs_data, 0, sizeof(coeffs_data)); ++ for (i = 0; i < comp->num_pivots - 1; i++) { ++ switch (comp->method[i]) { ++ case 0: // polynomial ++ has_poly = 1; ++ coeffs_data[i][3] = 0.0f; // order=0 signals polynomial ++ for (k = 0; k < 3; k++) ++ coeffs_data[i][k] = comp->poly_coeffs[i][k]; ++ break; ++ case 1: ++ min_order = FFMIN(min_order, comp->mmr_order[i]); ++ max_order = FFMAX(max_order, comp->mmr_order[i]); ++ mmr_single = !has_mmr; ++ has_mmr = 1; ++ coeffs_data[i][3] = (float)comp->mmr_order[i]; ++ 
coeffs_data[i][0] = comp->mmr_constant[i]; ++ coeffs_data[i][1] = (float)mmr_idx; ++ for (j = 0; j < comp->mmr_order[i]; j++) { ++ // store weights per order as two packed vec4s ++ float *mmr = &mmr_packed_data[mmr_idx][0]; ++ mmr[0] = comp->mmr_coeffs[i][j][0]; ++ mmr[1] = comp->mmr_coeffs[i][j][1]; ++ mmr[2] = comp->mmr_coeffs[i][j][2]; ++ mmr[3] = 0.0f; // unused ++ mmr[4] = comp->mmr_coeffs[i][j][3]; ++ mmr[5] = comp->mmr_coeffs[i][j][4]; ++ mmr[6] = comp->mmr_coeffs[i][j][5]; ++ mmr[7] = comp->mmr_coeffs[i][j][6]; ++ mmr_idx += 2; ++ } ++ break; ++ default: ++ av_assert0(0); ++ } + } -+ } -+} + -+#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ -+ThreadData *td = arg; \ -+AVFrame *in = td->in; \ -+AVFrame *out = td->out; \ -+const AVPixFmtDescriptor *desc = td->desc; \ -+const AVPixFmtDescriptor *odesc = td->odesc; \ -+const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ -+const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ -+const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ -+TonemapIntParams params = { \ -+.lut_peak = s->lut_peak, \ -+.lin_lut = s->lin_lut, \ -+.tonemap_lut = s->tonemap_lut, \ -+.delin_lut = s->delin_lut, \ -+.in_yuv_off = s->in_yuv_off, \ -+.out_yuv_off = s->out_yuv_off, \ -+.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ -+.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ -+.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ -+.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ -+.coeffs = s->coeffs, \ -+.ocoeffs = s->ocoeffs, \ -+.desat = s->desat, \ -+}; ++ av_assert0(has_poly || has_mmr); + -+static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ if (has_mmr) ++ av_assert0(min_order <= max_order); + -+ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] 
* AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ // dovi_params ++ { ++ float params[8] = { ++ comp->num_pivots, !!has_mmr, !!has_poly, ++ mmr_single, min_order, max_order, ++ comp->pivots[0], comp->pivots[comp->num_pivots - 1] ++ }; ++ memcpy(s->dovi_pbuf + c*params_cnt, params, params_sz); ++ } + -+ return 0; ++ // dovi_pivots ++ if (c == 0 && comp->num_pivots > 2) { ++ // Skip the (irrelevant) lower and upper bounds ++ float pivots_data[7+1] = {0}; ++ memcpy(pivots_data, comp->pivots + 1, ++ (comp->num_pivots - 2) * sizeof(pivots_data[0])); ++ // Fill the remainder with a quasi-infinite sentinel pivot ++ for (i = comp->num_pivots - 2; i < FF_ARRAY_ELEMS(pivots_data); i++) ++ pivots_data[i] = 1e9f; ++ memcpy(s->dovi_pbuf + 3*params_cnt + c*pivots_cnt, pivots_data, pivots_sz); ++ } ++ ++ // dovi_coeffs ++ memcpy(s->dovi_pbuf + 3*(params_cnt+pivots_cnt) + c*coeffs_cnt, &coeffs_data[0], coeffs_sz); ++ ++ // dovi_mmr ++ if (has_mmr) ++ memcpy(s->dovi_pbuf + 3*(params_cnt+pivots_cnt+coeffs_cnt) + c*mmr_cnt, &mmr_packed_data[0], mmr_sz); ++ } +} + -+static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++inline static float dot(const float* x, const float* y, int len) +{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); -+ -+ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ 
(void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ int i; ++ float result = 0; ++ for (i = 0; i < len; i++) { ++ result += x[i] * y[i]; ++ } ++ return result; ++} + -+ return 0; ++inline static float reshape_poly(float s, float* coeffs) { ++ return (coeffs[2] * s + coeffs[1]) * s + coeffs[0]; +} + -+static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++inline static float reshape_mmr(const float* sig, const float* coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) +{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ int mmr_idx = mmr_single ? 0 : (int)coeffs[1]; ++ int order = (int)coeffs[3]; ++ float s = coeffs[0]; ++ float sigX[7+1] = {sig[0], sig[1], sig[2], 0, ++ sig[0]*sig[1], sig[0]*sig[2], sig[1]*sig[2], sig[0]*sig[1]*sig[2]}; ++ ++ s += dot(&mmr[mmr_idx + 0*4], sigX, 7+1); ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { ++ float sigX2[7+1] = {sig[0]*sig[0], sig[1]*sig[1], sig[2]*sig[2], 0, ++ sigX[4]*sigX[4], sigX[5]*sigX[5], sigX[6]*sigX[6], sigX[7]*sigX[7]}; ++ s += dot(&mmr[mmr_idx + 2*4], sigX2, 7+1); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { ++ float sigX3[7+1] = {sig[0]*sig[0]*sig[0], sig[1]*sig[1]*sig[1], sig[2]*sig[2]*sig[2], 0, ++ sigX2[4]*sigX[4], sigX2[5]*sigX[5], sigX2[6]*sigX[6], sigX2[7]*sigX[7]}; ++ s += dot(&mmr[mmr_idx + 4*4], sigX3, 7+1); ++ } ++ } + -+ s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), -+ (uint16_t *) (out->data[1] + -+ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), -+ (uint16_t *) (out->data[2] + -+ out->linesize[2] * AV_CEIL_RSHIFT(slice_start, 
desc->log2_chroma_h)), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ return s; ++} + -+ return 0; ++inline static void ycc2rgb(float* dest, float y, float cb, float cr, const double nonlinear[3][3], const float ycc_offset[3]) ++{ ++ dest[0] = (y * (float)nonlinear[0][0] + cb * (float)nonlinear[0][1] + cr * (float)nonlinear[0][2]) - ycc_offset[0]; ++ dest[1] = (y * (float)nonlinear[1][0] + cb * (float)nonlinear[1][1] + cr * (float)nonlinear[1][2]) - ycc_offset[1]; ++ dest[2] = (y * (float)nonlinear[2][0] + cb * (float)nonlinear[2][1] + cr * (float)nonlinear[2][2]) - ycc_offset[2]; +} + -+static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++// This implementation does not do the costly linearization and de-linearization for performance reasons ++// The output color accuracy will be affected due to this ++inline static void lms2rgb(float* dest, float l, float m, float s, const double linear[3][3], const double lms2rgb_matrix[3][3]) +{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ dest[0] = l * (float)lms2rgb_matrix[0][0] + m * (float)lms2rgb_matrix[0][1] + s * (float)lms2rgb_matrix[0][2]; ++ dest[1] = l * (float)lms2rgb_matrix[1][0] + m * (float)lms2rgb_matrix[1][1] + s * (float)lms2rgb_matrix[1][2]; ++ dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2]; ++} + -+ s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), -+ (uint16_t *) (out->data[1] + -+ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), 
-+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx) ++{ ++ int i; ++ float s; ++ float coeffs[4] = {0, 0, 0, 0}; ++ float sig_arr[3] = {src[0],src[1],src[2]}; ++ ++ int dovi_num_pivots, dovi_has_mmr, dovi_has_poly; ++ int dovi_mmr_single, dovi_min_order, dovi_max_order; ++ int has_mmr_poly; ++ float dovi_lo, dovi_hi; ++ float *dovi_params; ++ float *dovi_pivots; ++ float *dovi_coeffs, *dovi_mmr; //float4* ++ ++ float *src_dovi_params = ctx->dovi_pbuf; ++ float *src_dovi_pivots = ctx->dovi_pbuf + 24; ++ float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4* ++ float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* ++ ++ for (i = 0; i < 3; i++) { ++ dovi_params = src_dovi_params + i*8; ++ dovi_pivots = src_dovi_pivots + i*8; ++ dovi_coeffs = src_dovi_coeffs + i*8*4; //float4* ++ dovi_mmr = src_dovi_mmr + i*48*4; //float4* ++ dovi_num_pivots = dovi_params[0]; ++ dovi_has_mmr = dovi_params[1]; ++ dovi_has_poly = dovi_params[2]; ++ dovi_mmr_single = dovi_params[3]; ++ dovi_min_order = dovi_params[4]; ++ dovi_max_order = dovi_params[5]; ++ dovi_lo = dovi_params[6]; ++ dovi_hi = dovi_params[7]; ++ ++ s = sig_arr[i]; ++ coeffs[0] = dovi_coeffs[0*4+0]; ++ coeffs[1] = dovi_coeffs[0*4+1]; ++ coeffs[2] = dovi_coeffs[0*4+2]; ++ coeffs[3] = dovi_coeffs[0*4+3]; ++ ++#define mix(x, y, a) ((x) + ((y) - (x)) * (a)) ++ if (i == 0 && dovi_num_pivots > 2) { ++ int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1]; ++ int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3]; ++ int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6]; ++ ++ float m01[4] = { mix(dovi_coeffs[0*4+0], 
dovi_coeffs[1*4+0], t0), ++ mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0), ++ mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0), ++ mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) }; ++ float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2), ++ mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2), ++ mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2), ++ mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) }; ++ float m0123[4] = { mix(m01[0], m23[0], t1), ++ mix(m01[1], m23[1], t1), ++ mix(m01[2], m23[2], t1), ++ mix(m01[3], m23[3], t1) }; ++ float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4), ++ mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4), ++ mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4), ++ mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) }; ++ float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6), ++ mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6), ++ mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6), ++ mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) }; ++ float m4567[4] = { mix(m45[0], m67[0], t5), ++ mix(m45[1], m67[1], t5), ++ mix(m45[2], m67[2], t5), ++ mix(m45[3], m67[3], t5) }; ++ ++ coeffs[0] = mix(m0123[0], m4567[0], t3); ++ coeffs[1] = mix(m0123[1], m4567[1], t3); ++ coeffs[2] = mix(m0123[2], m4567[2], t3); ++ coeffs[3] = mix(m0123[3], m4567[3], t3); ++ } + -+ return 0; ++ has_mmr_poly = dovi_has_mmr && dovi_has_poly; ++ ++ if ((has_mmr_poly && coeffs[3] == 0.0f) || (!has_mmr_poly && dovi_has_poly)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(sig_arr, coeffs, dovi_mmr, ++ dovi_mmr_single, dovi_min_order, dovi_max_order); ++ ++ sig_arr[i] = CLAMP(s, dovi_lo, dovi_hi); ++ } ++ ++ dest[0] = sig_arr[0]; ++ dest[1] = sig_arr[1]; ++ dest[2] = sig_arr[2]; +} + -+static int filter_frame(AVFilterLink *link, AVFrame *in) ++static int out_format_is_supported(enum AVPixelFormat fmt) +{ -+ AVFilterContext *ctx = link->dst; -+ TonemapxContext *s = ctx->priv; -+ AVFilterLink *outlink = ctx->outputs[0]; -+ AVFrame *out; -+ const 
AVPixFmtDescriptor *desc; -+ const AVPixFmtDescriptor *odesc; -+ int ret; -+ double peak = s->peak; -+ const AVLumaCoefficients *coeffs; -+ ThreadData td; ++ int i; + -+ desc = av_pix_fmt_desc_get(link->format); -+ odesc = av_pix_fmt_desc_get(outlink->format); -+ if (!desc || !odesc) { -+ av_frame_free(&in); -+ return AVERROR_BUG; -+ } ++ for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++) ++ if (out_pix_fmts[i] == fmt) ++ return 1; ++ return 0; ++} + -+ switch (odesc->comp[2].plane) { -+ case 1: // biplanar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_biplanar8; -+ } else { -+ s->filter_slice = filter_slice_biplanar10; -+ } -+ break; -+ default: -+ case 2: // planar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_planar8; -+ } else { -+ s->filter_slice = filter_slice_planar10; -+ } -+ break; -+ } ++static float hable(float in) ++{ ++ float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; ++ return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; ++} + -+ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); -+ if (!out) { -+ av_frame_free(&in); -+ return AVERROR(ENOMEM); -+ } ++static float mobius(float in, float j, double peak) ++{ ++ float a, b; + -+ if ((ret = av_frame_copy_props(out, in)) < 0) -+ goto fail; ++ if (in <= j) ++ return in; + -+ /* read peak from side data if not passed in */ -+ if (!peak) { -+ peak = ff_determine_signal_peak(in); -+ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); -+ } ++ a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); ++ b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); + -+ out->color_trc = s->trc; -+ out->colorspace = s->spc; -+ out->color_primaries = s->pri; -+ out->color_range = s->range; ++ return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); ++} + -+ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) -+ in->color_trc = AVCOL_TRC_SMPTE2084; -+ if (out->color_trc == 
AVCOL_TRC_UNSPECIFIED) -+ out->color_trc = AVCOL_TRC_BT709; ++static float bt2390(float s, float peak) ++{ ++ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); ++ float scale = 1.0f / peak_pq; + -+ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) -+ in->colorspace = AVCOL_SPC_BT2020_NCL; -+ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) -+ out->colorspace = AVCOL_SPC_BT709; ++ // SDR peak ++ float dst_peak = 1.0f; ++ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; ++ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; + -+ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ in->color_primaries = AVCOL_PRI_BT2020; -+ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ out->color_primaries = AVCOL_PRI_BT709; ++ float ks = 1.5f * maxLum - 0.5f; ++ float tb = (s_pq - ks) / (1.0f - ks); ++ float tb2 = tb * tb; ++ float tb3 = tb2 * tb; ++ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + ++ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + ++ (-2.0f * tb3 + 3.0f * tb2) * maxLum; ++ float sig = (s_pq < ks) ? s_pq : pb; + -+ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) -+ in->color_range = AVCOL_RANGE_MPEG; -+ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) -+ out->color_range = AVCOL_RANGE_MPEG; ++ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); ++} + -+ if (!s->lin_lut || !s->delin_lut) { -+ if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) -+ goto fail; ++static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) ++{ ++ switch(alg) { ++ default: ++ case TONEMAP_NONE: ++ // do nothing ++ break; ++ case TONEMAP_LINEAR: ++ sig = sig * param / peak; ++ break; ++ case TONEMAP_GAMMA: ++ sig = sig > 0.05f ++ ? 
pow(sig / peak, 1.0f / param) ++ : sig * pow(0.05f / peak, 1.0f / param) / 0.05f; ++ break; ++ case TONEMAP_CLIP: ++ sig = av_clipf(sig * param, 0, 1.0f); ++ break; ++ case TONEMAP_HABLE: ++ sig = hable(sig) / hable(peak); ++ break; ++ case TONEMAP_REINHARD: ++ sig = sig / (sig + param) * (peak + param) / peak; ++ break; ++ case TONEMAP_MOBIUS: ++ sig = mobius(sig, param, peak); ++ break; ++ case TONEMAP_BT2390: ++ sig = bt2390(sig, peak); ++ break; + } + -+ if (!s->tonemap_lut || s->lut_peak != peak) { -+ s->lut_peak = peak; -+ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) -+ goto fail; -+ } ++ return sig; ++} + -+ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); -+ if (s->coeffs != coeffs) { -+ s->coeffs = coeffs; -+ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); -+ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, -+ in->color_range, out->color_range)) < 0) -+ goto fail; -+ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) -+ goto fail; -+ } ++static float linearize(float x, enum AVColorTransferCharacteristic trc_src) ++{ ++ if (trc_src == AVCOL_TRC_SMPTE2084) ++ return eotf_st2084(x, REFERENCE_WHITE_ALT); ++ else if (trc_src == AVCOL_TRC_ARIB_STD_B67) ++ return eotf_arib_b67(x); ++ else ++ return x; ++} + -+ /* do the tonemap */ -+ td.in = in; -+ td.out = out; -+ td.desc = desc; -+ td.odesc = odesc; -+ td.peak = peak; -+ ff_filter_execute(ctx, s->filter_slice, &td, NULL, -+ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); ++static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst) ++{ ++ if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10) ++ return inverse_eotf_bt1886(x); ++ else ++ return x; ++} + -+ av_frame_free(&in); ++static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, ++ enum AVColorTransferCharacteristic trc_dst) ++{ ++ int i; + -+ 
av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); -+ av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); ++ if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t)))) ++ return AVERROR(ENOMEM); + -+ return ff_filter_frame(outlink, out); -+fail: -+ av_frame_free(&in); -+ av_frame_free(&out); -+ return ret; ++ for (i = 0; i < 32768; i++) { ++ double v1 = (i - 2048.0f) / 28672.0f; ++ double v2 = i / 32767.0f; ++ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); ++ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); ++ } ++ ++ return 0; +} + -+static void uninit(AVFilterContext *ctx) ++static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) +{ -+ TonemapxContext *s = ctx->priv; ++ int i; ++ double peak = s->lut_peak; + -+ av_freep(&s->lin_lut); -+ av_freep(&s->delin_lut); -+ av_freep(&s->tonemap_lut); ++ if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 32768; i++) { ++ double v = (i - 2048.0f) / 28672.0f; ++ double sig = linearize(v, trc_src); ++ float mapped = mapsig(s->tonemap, sig, peak, s->param); ++ s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? 
mapped / sig : 0.0f; ++ } ++ ++ return 0; +} + -+static int query_formats(AVFilterContext *ctx) ++static int compute_yuv_coeffs(TonemapxContext *s, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, ++ const AVPixFmtDescriptor *idesc, ++ const AVPixFmtDescriptor *odesc, ++ enum AVColorRange irng, ++ enum AVColorRange orng) +{ -+ enum AVPixelFormat valid_in_pix_fmts[4]; -+ AVFilterFormats *formats; -+ const AVPixFmtDescriptor *desc; -+ TonemapxContext *s = ctx->priv; ++ double rgb2yuv[3][3], yuv2rgb[3][3]; ++ int res; ++ int y_rng, uv_rng; + -+ if (!strcmp(s->format_str, "same")) { -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ s->format = AV_PIX_FMT_NONE; -+ } else { -+ int i, j = 0; -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ s->format = av_get_pix_fmt(s->format_str); -+ // Check again in case of the string is invalid -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ desc = av_pix_fmt_desc_get(s->format); -+ // Filter out the input formats for requested output formats -+ // The input and output must have the same planar format, either planar or bi-planar packed -+ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { -+ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); -+ if (tdesc->comp[2].plane == desc->comp[2].plane) { -+ valid_in_pix_fmts[j] = in_pix_fmts[i]; -+ j++; -+ } -+ } -+ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; -+ formats = ff_make_format_list(valid_in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ 
if (res < 0) -+ return res; -+ if (out_format_is_supported(s->format)) { -+ formats = NULL; -+ res = ff_add_format(&formats, s->format); -+ if (res < 0) -+ return res; -+ } else { -+ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", -+ av_get_pix_fmt_name(s->format)); -+ return AVERROR(ENOSYS); -+ } ++ res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng, ++ irng, idesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color range %d (%s)\n", ++ irng, av_color_range_name(irng)); ++ return res; + } + -+ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++ ff_fill_rgb2yuv_table(coeffs, rgb2yuv); ++ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); ++ ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); ++ ++ ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, ++ y_rng, uv_rng, 1); ++ ++ res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, ++ orng, odesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color range %d (%s)\n", ++ orng, av_color_range_name(orng)); ++ return res; ++ } ++ ++ ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, ++ y_rng, uv_rng, 0); ++ ++ return 0; +} + -+static av_cold int init(AVFilterContext *ctx) ++static int compute_rgb_coeffs(TonemapxContext *s, ++ enum AVColorPrimaries iprm, ++ enum AVColorPrimaries oprm) +{ -+ TonemapxContext *s = ctx->priv; -+ enum SIMDVariant active_simd = SIMD_NONE; -+ av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", -+ s->format_str); ++ double rgb2xyz[3][3], xyz2rgb[3][3]; ++ const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm); ++ const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm); + -+#if ARCH_AARCH64 -+#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS -+ { -+ int cpu_flags = av_get_cpu_flags(); -+ if (have_neon(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; -+ s->tonemap_func_biplanar10 = 
tonemap_frame_p016_p010_2_p016_p010_neon; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; -+ active_simd = SIMD_NEON; -+ } -+ } -+#else -+ av_log(s, AV_LOG_WARNING, "NEON optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_NEON_INTRINSICS -+#elif ARCH_X86 -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+ { -+ int cpu_flags = av_get_cpu_flags(); -+ if (X86_SSE42(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; -+ active_simd = SIMD_SSE; -+ } ++ if (!iprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color primaries %d (%s)\n", ++ iprm, av_color_primaries_name(iprm)); ++ return AVERROR(EINVAL); + } -+#else -+ av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ { -+ int cpu_flags = av_get_cpu_flags(); -+ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; -+ active_simd = SIMD_AVX; -+ } ++ if (!oprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color primaries %d (%s)\n", ++ oprm, av_color_primaries_name(oprm)); ++ return AVERROR(EINVAL); + } -+#else -+ av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS -+#endif // ARCH_X86/ARCH_AARCH64 + -+#if !defined(ENABLE_TONEMAPX_NEON_INTRINSICS) && \ -+ !defined(ENABLE_TONEMAPX_SSE_INTRINSICS) && \ -+ !defined(ENABLE_TONEMAPX_AVX_INTRINSICS) -+ av_log(s, 
AV_LOG_WARNING, "SIMD optimization disabled at compile time\n"); -+#endif ++ ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); ++ ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); ++ ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); ++ ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); + -+ if (!s->tonemap_func_biplanar8) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; -+ } ++ return 0; ++} + -+ if (!s->tonemap_func_biplanar10) { -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; -+ } ++__attribute__((always_inline)) ++static inline void dovi2rgb(int y00, int y01, int y10, int y11, int u, int v, ++ const struct TonemapIntParams *params, ++ const float in_rng, ++ int16_t r[4], int16_t g[4], int16_t b[4]) ++{ ++ float yuv1[3], yuv2[3], yuv3[3], yuv4[3]; ++ float c1[3], c2[3], c3[3], c4[3]; ++ ++ yuv1[0] = CLAMP(y00 / in_rng, 0.0f, 1.0f); ++ yuv2[0] = CLAMP(y01 / in_rng, 0.0f, 1.0f); ++ yuv3[0] = CLAMP(y10 / in_rng, 0.0f, 1.0f); ++ yuv4[0] = CLAMP(y11 / in_rng, 0.0f, 1.0f); ++ yuv1[1] = yuv2[1] = yuv3[1] = yuv4[1] = CLAMP(u / in_rng, 0.0f, 1.0f); ++ yuv1[2] = yuv2[2] = yuv3[2] = yuv4[2] = CLAMP(v / in_rng, 0.0f, 1.0f); ++ ++ reshape_dovi_yuv(yuv1, yuv1, params); ++ reshape_dovi_yuv(yuv2, yuv2, params); ++ reshape_dovi_yuv(yuv3, yuv3, params); ++ reshape_dovi_yuv(yuv4, yuv4, params); ++ ++ ycc2rgb(c1, yuv1[0], yuv1[1], yuv1[2], params->dovi->nonlinear, *params->ycc_offset); ++ ycc2rgb(c2, yuv2[0], yuv2[1], yuv2[2], params->dovi->nonlinear, *params->ycc_offset); ++ ycc2rgb(c3, yuv3[0], yuv3[1], yuv3[2], params->dovi->nonlinear, *params->ycc_offset); ++ ycc2rgb(c4, yuv4[0], yuv4[1], yuv4[2], params->dovi->nonlinear, *params->ycc_offset); ++ ++ lms2rgb(c1, c1[0], c1[1], c1[2], params->dovi->linear, *params->lms2rgb_matrix); ++ lms2rgb(c2, c2[0], c2[1], c2[2], params->dovi->linear, *params->lms2rgb_matrix); ++ lms2rgb(c3, c3[0], c3[1], c3[2], params->dovi->linear, *params->lms2rgb_matrix); ++ lms2rgb(c4, c4[0], 
c4[1], c4[2], params->dovi->linear, *params->lms2rgb_matrix); ++ ++ r[0] = av_clip_int16(c1[0] * 28672); ++ r[1] = av_clip_int16(c2[0] * 28672); ++ r[2] = av_clip_int16(c3[0] * 28672); ++ r[3] = av_clip_int16(c4[0] * 28672); ++ ++ g[0] = av_clip_int16(c1[1] * 28672); ++ g[1] = av_clip_int16(c2[1] * 28672); ++ g[2] = av_clip_int16(c3[1] * 28672); ++ g[3] = av_clip_int16(c4[1] * 28672); ++ ++ b[0] = av_clip_int16(c1[2] * 28672); ++ b[1] = av_clip_int16(c2[2] * 28672); ++ b[2] = av_clip_int16(c3[2] * 28672); ++ b[3] = av_clip_int16(c4[2] * 28672); ++} + -+ if (!s->tonemap_func_planar8) { -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; -+ } ++inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ int16_t sig; ++ float mapval, r_lin, g_lin, b_lin; + -+ if (!s->tonemap_func_planar10) { -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; -+ } ++ /* load values */ ++ *r_out = r_in; ++ *g_out = g_in; ++ *b_out = b_in; + -+ switch (active_simd) { -+ case SIMD_NEON: -+ av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n"); -+ break; -+ case SIMD_SSE: -+ av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n"); -+ break; -+ case SIMD_AVX: -+ av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n"); -+ break; -+ default: -+ case SIMD_NONE: -+ av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n"); -+ break; ++ /* pick the brightest component, reducing the value range as necessary ++ * to keep the entire signal in range and preventing discoloration due to ++ * out-of-bounds clipping */ ++ sig = FFMAX3(r_in, g_in, b_in); ++ ++ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ ++ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; ++ g_lin = lin_lut[av_clip_uintp2(g_in 
+ 2048, 15)]; ++ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ ++ if (!rgb2rgb_passthrough) { ++ r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; ++ g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin; ++ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; + } + -+ switch (s->tonemap) { -+ case TONEMAP_GAMMA: -+ if (isnan(s->param)) -+ s->param = 1.8f; -+ break; -+ case TONEMAP_REINHARD: -+ if (!isnan(s->param)) -+ s->param = (1.0f - s->param) / s->param; -+ break; -+ case TONEMAP_MOBIUS: -+ if (isnan(s->param)) -+ s->param = 0.3f; -+ break; ++#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) ++ /* desaturate to prevent unnatural colors */ ++ if (desat > 0) { ++ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; ++ float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); ++ r_lin = MIX(r_lin, luma, overbright); ++ g_lin = MIX(g_lin, luma, overbright); ++ b_lin = MIX(b_lin, luma, overbright); + } + -+ if (isnan(s->param)) -+ s->param = 1.0f; ++ r_lin *= mapval; ++ g_lin *= mapval; ++ b_lin *= mapval; ++#undef MIX + -+ return 0; ++ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; ++ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; ++ *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; +} + -+#define OFFSET(x) offsetof(TonemapxContext, x) -+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM -+static const AVOption tonemapx_options[] = { -+ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, .unit = "tonemap" }, -+ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = 
TONEMAP_GAMMA}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, .unit = "tonemap" }, -+ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, -+ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, .unit = "transfer" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, .unit = "transfer" }, -+ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, -+ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, .unit = "matrix" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, .unit = "matrix" }, ++// See also libavfilter/colorspacedsp_template.c ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int 
in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + 
in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, 
r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_dovi_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ); ++ int y01 = (srcy[x + 1] ); ++ int y10 = (srcy[srclinesize[0] / 2 + x] ); ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]); ++ int u = (srcu[x >> 1]); ++ int v = (srcv[x >> 1]); ++ ++ dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, 
params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_dovi_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const 
uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ); ++ int y01 = (srcy[x + 1] ); ++ int y10 = (srcy[srclinesize[0] / 2 + x] ); ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]); ++ int u = (srcu[x >> 1]); ++ int v = (srcv[x >> 1]); ++ ++ dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ 
params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = 
dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu 
* u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu 
+ AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - 
params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; 
++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); ++#undef AVG ++ } ++ } ++} ++ ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = 
(*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], 
&b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef 
AVG ++ } ++ } ++} ++ ++#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ ++ThreadData *td = arg; \ ++AVFrame *in = td->in; \ ++AVFrame *out = td->out; \ ++const AVPixFmtDescriptor *desc = td->desc; \ ++const AVPixFmtDescriptor *odesc = td->odesc; \ ++const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ ++const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ ++const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ ++TonemapIntParams params = { \ ++.lut_peak = s->lut_peak, \ ++.lin_lut = s->lin_lut, \ ++.tonemap_lut = s->tonemap_lut, \ ++.delin_lut = s->delin_lut, \ ++.in_yuv_off = s->in_yuv_off, \ ++.out_yuv_off = s->out_yuv_off, \ ++.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ ++.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ ++.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ ++.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ ++.coeffs = s->coeffs, \ ++.ocoeffs = s->ocoeffs, \ ++.desat = s->desat, \ ++.dovi = s->dovi, \ ++.dovi_pbuf = s->dovi_pbuf, \ ++.lms2rgb_matrix = &s->lms2rgb_matrix, \ ++.ycc_offset = &s->ycc_offset \ ++}; ++ ++static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static 
int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (uint16_t *) (out->data[2] + ++ out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ 
out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *ctx = link->dst; ++ TonemapxContext *s = ctx->priv; ++ AVFilterLink *outlink = ctx->outputs[0]; ++ AVFrame *out; ++ const AVPixFmtDescriptor *desc; ++ const AVPixFmtDescriptor *odesc; ++ int ret; ++ double peak = s->peak; ++ const AVLumaCoefficients *coeffs; ++ ThreadData td; ++ ++ desc = av_pix_fmt_desc_get(link->format); ++ odesc = av_pix_fmt_desc_get(outlink->format); ++ if (!desc || !odesc) { ++ av_frame_free(&in); ++ return AVERROR_BUG; ++ } ++ ++ switch (odesc->comp[2].plane) { ++ case 1: // biplanar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_biplanar8; ++ } else { ++ s->filter_slice = filter_slice_biplanar10; ++ } ++ break; ++ default: ++ case 2: // planar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_planar8; ++ } else { ++ s->filter_slice = filter_slice_planar10; ++ } ++ break; ++ } ++ ++ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); ++ if (!out) { ++ av_frame_free(&in); ++ return AVERROR(ENOMEM); ++ } ++ ++ if ((ret = av_frame_copy_props(out, in)) < 0) ++ goto fail; ++ ++ /* read peak from side data if not passed in */ ++ if (!peak) { ++ peak = ff_determine_signal_peak(in); ++ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); ++ } ++ ++ out->color_trc = s->trc; ++ out->colorspace = s->spc; ++ out->color_primaries = s->pri; ++ if (s->range != -1) out->color_range = s->range; ++ ++ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) ++ in->color_trc = AVCOL_TRC_SMPTE2084; ++ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) ++ out->color_trc = 
AVCOL_TRC_BT709; ++ ++ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) ++ in->colorspace = AVCOL_SPC_BT2020_NCL; ++ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) ++ out->colorspace = AVCOL_SPC_BT709; ++ ++ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ in->color_primaries = AVCOL_PRI_BT2020; ++ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ out->color_primaries = AVCOL_PRI_BT709; ++ ++ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) ++ in->color_range = AVCOL_RANGE_MPEG; ++ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) ++ out->color_range = AVCOL_RANGE_MPEG; ++ ++ if (!s->lin_lut || !s->delin_lut) { ++ if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) ++ goto fail; ++ } ++ ++ if (!s->tonemap_lut || s->lut_peak != peak) { ++ s->lut_peak = peak; ++ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) ++ goto fail; ++ } ++ ++ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); ++ if (s->coeffs != coeffs) { ++ s->coeffs = coeffs; ++ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); ++ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, ++ in->color_range, out->color_range)) < 0) ++ goto fail; ++ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) ++ goto fail; ++ } ++ ++ if (s->apply_dovi) { ++ AVFrameSideData *dovi_sd = av_frame_get_side_data(in, AV_FRAME_DATA_DOVI_METADATA); ++ if (dovi_sd) { ++ const AVDOVIMetadata *metadata = (AVDOVIMetadata *) dovi_sd->data; ++ const AVDOVIRpuDataHeader *rpu = av_dovi_get_header(metadata); ++ // only map dovi rpus that don't require an EL and has rpu profile == 0 ++ // for performance reason we only want to do reshaping when absolutely needed ++ // such videos usually have vdr_rpu_profile == 0, for example profile 5 videos ++ // this could be wrong as there is no public documentation on this field ++ if (rpu->disable_residual_flag && rpu->vdr_rpu_profile == 0) { ++ struct DoviMetadata *dovi = av_malloc(sizeof(*dovi)); ++ s->dovi = dovi; ++ 
if (!s->dovi) ++ goto fail; ++ ++ ff_map_dovi_metadata(s->dovi, metadata); ++ } ++ } ++ ++ if (s->dovi) { ++ if (desc->comp[2].plane == 1) { ++ av_log(s, AV_LOG_ERROR, "Input pixel format has to be yuv420p10 for Dolby Vision reshaping\n"); ++ av_assert0(0); ++ } ++ update_dovi_buf(ctx); ++ ff_matrix_mul_3x3(s->lms2rgb_matrix, dovi_lms2rgb_matrix, s->dovi->linear); ++ s->ycc_offset[0] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[0][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[0][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[0][2]; ++ s->ycc_offset[1] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[1][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[1][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[1][2]; ++ s->ycc_offset[2] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[2][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[2][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[2][2]; ++ s->tonemap_func_planar8 = s->tonemap_func_dovi8; ++ s->tonemap_func_planar10 = s->tonemap_func_dovi10; ++ } ++ } ++ ++ /* do the tonemap */ ++ td.in = in; ++ td.out = out; ++ td.desc = desc; ++ td.odesc = odesc; ++ td.peak = peak; ++ ff_filter_execute(ctx, s->filter_slice, &td, NULL, ++ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); ++ ++ av_frame_free(&in); ++ ++ av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); ++ av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); ++ av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_RPU_BUFFER); ++ av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_METADATA); ++ ++ return ff_filter_frame(outlink, out); ++fail: ++ av_frame_free(&in); ++ av_frame_free(&out); ++ return ret; ++} ++ ++static void uninit(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ ++ av_freep(&s->lin_lut); ++ av_freep(&s->delin_lut); ++ 
av_freep(&s->tonemap_lut); ++ ++ if (s->dovi) ++ av_freep(&s->dovi); ++} ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++ enum AVPixelFormat valid_in_pix_fmts[4]; ++ AVFilterFormats *formats; ++ const AVPixFmtDescriptor *desc; ++ TonemapxContext *s = ctx->priv; ++ ++ if (!strcmp(s->format_str, "same")) { ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ s->format = AV_PIX_FMT_NONE; ++ } else { ++ int i, j = 0; ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ s->format = av_get_pix_fmt(s->format_str); ++ // Check again in case of the string is invalid ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ desc = av_pix_fmt_desc_get(s->format); ++ // Filter out the input formats for requested output formats ++ // The input and output must have the same planar format, either planar or bi-planar packed ++ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { ++ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); ++ if (tdesc->comp[2].plane == desc->comp[2].plane) { ++ valid_in_pix_fmts[j] = in_pix_fmts[i]; ++ j++; ++ } ++ } ++ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; ++ formats = ff_make_format_list(valid_in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (out_format_is_supported(s->format)) { ++ formats = NULL; ++ res = ff_add_format(&formats, s->format); ++ if (res < 0) ++ return res; ++ } else { ++ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", ++ av_get_pix_fmt_name(s->format)); ++ return 
AVERROR(ENOSYS); ++ } ++ } ++ ++ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ enum SIMDVariant active_simd = SIMD_NONE; ++ av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", ++ s->format_str); ++ ++#if ARCH_AARCH64 ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (have_neon(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; ++ s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_neon; ++ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_neon; ++ active_simd = SIMD_NEON; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "NEON optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++#elif ARCH_X86 ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (X86_SSE42(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; ++ s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_sse; ++ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_sse; ++ active_simd = SIMD_SSE; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; ++ 
s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; ++ s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_avx; ++ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_avx; ++ active_simd = SIMD_AVX; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++#endif // ARCH_X86/ARCH_AARCH64 ++ ++#if !defined(ENABLE_TONEMAPX_NEON_INTRINSICS) && \ ++ !defined(ENABLE_TONEMAPX_SSE_INTRINSICS) && \ ++ !defined(ENABLE_TONEMAPX_AVX_INTRINSICS) ++ av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n"); ++#endif ++ ++ if (!s->tonemap_func_biplanar8) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++ } ++ ++ if (!s->tonemap_func_biplanar10) { ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; ++ } ++ ++ if (!s->tonemap_func_planar8) { ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; ++ } ++ ++ if (!s->tonemap_func_planar10) { ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; ++ } ++ ++ if (!s->tonemap_func_dovi8) { ++ s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p; ++ } ++ ++ if (!s->tonemap_func_dovi10) { ++ s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10; ++ } ++ ++ switch (active_simd) { ++ case SIMD_NEON: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n"); ++ break; ++ case SIMD_SSE: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n"); ++ break; ++ case SIMD_AVX: ++ av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n"); ++ break; ++ default: ++ case SIMD_NONE: ++ av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n"); ++ break; ++ } ++ ++ switch (s->tonemap) { ++ case TONEMAP_GAMMA: ++ if (isnan(s->param)) ++ s->param = 1.8f; ++ break; ++ case TONEMAP_REINHARD: ++ if (!isnan(s->param)) ++ s->param = (1.0f - s->param) / s->param; ++ break; ++ case TONEMAP_MOBIUS: ++ if (isnan(s->param)) ++ s->param = 0.3f; ++ break; 
++ } ++ ++ if (isnan(s->param)) ++ s->param = 1.0f; ++ ++ return 0; ++} ++ ++#define OFFSET(x) offsetof(TonemapxContext, x) ++#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM ++static const AVOption tonemapx_options[] = { ++ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, .unit = "tonemap" }, ++ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, .unit = "tonemap" }, ++ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, ++ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, .unit = "transfer" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, .unit = "transfer" }, ++ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, ++ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = 
AVCOL_SPC_BT709}, 0, 0, FLAGS, .unit = "matrix" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, .unit = "matrix" }, + { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, .unit = "primaries" }, + { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, .unit = "primaries" }, + { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, .unit = "primaries" }, + { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, .unit = "primaries" }, -+ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, .unit = "range" }, -+ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, .unit = "range" }, ++ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, .unit = "range" }, ++ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, .unit = "range" }, + { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, .unit = "range" }, + { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, .unit = "range" }, + { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, .unit = "range" }, @@ -2714,331 +4164,1865 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, + { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, + { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { "apply_dovi", "Apply Dolby Vision metadata if possible", OFFSET(apply_dovi), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + { NULL } +}; + -+AVFILTER_DEFINE_CLASS(tonemapx); 
++AVFILTER_DEFINE_CLASS(tonemapx); ++ ++static const AVFilterPad tonemapx_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++}; ++ ++AVFilter ff_vf_tonemapx = { ++ .name = "tonemapx", ++ .description = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"), ++ .init = init, ++ .uninit = uninit, ++ .priv_size = sizeof(TonemapxContext), ++ .priv_class = &tonemapx_class, ++ FILTER_INPUTS(tonemapx_inputs), ++ FILTER_OUTPUTS(ff_video_default_filterpad), ++ FILTER_QUERY_FUNC(query_formats), ++ .flags = AVFILTER_FLAG_SLICE_THREADS, ++}; +Index: FFmpeg/libavfilter/vf_tonemapx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/vf_tonemapx.h +@@ -0,0 +1,126 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVFILTER_TONEMAPX_H
++#define AVFILTER_TONEMAPX_H
++
++#include "config.h"
++#include "colorspace.h"
++
++#define X86_64_V2 __attribute__((target("sse4.2")))
++#define X86_64_V3 __attribute__((target("avx2,fma")))
++
++#if defined(__GNUC__) || defined(__clang__)
++#  if (__GNUC__ >= 9) || (__clang_major__ >= 11)
++#    define CC_SUPPORTS_TONEMAPX_INTRINSICS
++#  endif // (__GNUC__ >= 9) || (__clang_major__ >= 11)
++#endif // defined(__GNUC__) || defined(__clang__)
++
++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS
++#  if ARCH_AARCH64
++#    if HAVE_INTRINSICS_NEON
++#      define ENABLE_TONEMAPX_NEON_INTRINSICS
++#    endif
++#  endif // ARCH_AARCH64
++#  if ARCH_X86
++#    if HAVE_INTRINSICS_SSE42
++#      define ENABLE_TONEMAPX_SSE_INTRINSICS
++#    endif
++#    if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3
++#      define ENABLE_TONEMAPX_AVX_INTRINSICS
++#    endif
++#  endif // ARCH_X86
++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS
++// Float counts (_cnt) and byte sizes (_sz) of one component's block in dovi_pbuf; parenthesized so they expand safely inside larger expressions
++#define params_cnt 8
++#define pivots_cnt (7+1)
++#define coeffs_cnt (8*4)
++#define mmr_cnt (8*6*4)
++#define params_sz (params_cnt*sizeof(float))
++#define pivots_sz (pivots_cnt*sizeof(float))
++#define coeffs_sz (coeffs_cnt*sizeof(float))
++#define mmr_sz (mmr_cnt*sizeof(float))
++
++typedef struct TonemapIntParams {
++    double lut_peak;
++    float *lin_lut;      // linearization LUT; the AVX kernel indexes it with value + 2048 clamped to [0, 32767]
++    float *tonemap_lut;  // per-pixel gain LUT, indexed the same way by the brightest channel
++    uint16_t *delin_lut; // delinearization LUT, indexed with the tonemapped value scaled to [0, 32767]
++    int in_yuv_off, out_yuv_off; // out_yuv_off is added to output luma after the fixed-point shift
++    int16_t (*yuv2rgb_coeffs)[3][3][8];
++    int16_t (*rgb2yuv_coeffs)[3][3][8]; // the AVX frame code reads element [row][col][0]
++    double (*rgb2rgb_coeffs)[3][3];
++    int rgb2rgb_passthrough; // nonzero: skip the rgb2rgb_coeffs matrix
++    const AVLumaCoefficients *coeffs, *ocoeffs;
++    double desat; // desaturation is applied only when > 0
++    struct DoviMetadata *dovi;
++    float *dovi_pbuf; // packed floats; AVX reshaping reads params at +0, pivots at +24, coeffs at +48, mmr at +144
++    double (*lms2rgb_matrix)[3][3];
++    float (*ycc_offset)[3];
++} TonemapIntParams;
++
++enum SIMDVariant {
++    SIMD_NONE = -1,
++    SIMD_NEON,
++    SIMD_SSE,
++
SIMD_AVX ++}; ++ ++void tonemap_frame_dovi_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_dovi_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif // AVFILTER_TONEMAPX_H +Index: FFmpeg/libavfilter/x86/Makefile +=================================================================== +--- FFmpeg.orig/libavfilter/x86/Makefile ++++ 
FFmpeg/libavfilter/x86/Makefile +@@ -34,6 +34,8 @@ OBJS-$(CONFIG_STEREO3D_FILTER) + OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o + OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o + OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o \ ++ x86/vf_tonemapx_intrin_avx.o + OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o + OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o + OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +@@ -0,0 +1,2276 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "vf_tonemapx_intrin_avx.h" ++ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++# include ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) ++{ ++ __m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); ++ __m256i mask = _mm256_set1_epi32(~0xFFFF); ++ __m256i condition = _mm256_and_si256(add_result, mask); ++ __m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); ++ ++ __m256i shifted = _mm256_srai_epi32(a, 31); ++ __m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); ++ ++ return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); ++} ++ ++X86_64_V3 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a) ++{ ++ __m128 n = _mm_sub_ps(y, x); ++ n = _mm_fmadd_ps(a, n, x); ++ return n; ++} ++ ++X86_64_V3 inline static float reduce_floatx4(__m128 x) { ++ x = _mm_hadd_ps(x, x); ++ x = _mm_hadd_ps(x, x); ++ return _mm_cvtss_f32(x); ++} ++ ++X86_64_V3 inline static float reduce_floatx8(__m256 x) { ++ __m256 x2 = _mm256_permute2f128_ps(x , x , 1); ++ x = _mm256_add_ps(x, x2); ++ x = _mm256_hadd_ps(x, x); ++ x = _mm256_hadd_ps(x, x); ++ return _mm256_cvtss_f32(x); ++} ++ ++X86_64_V3 static inline float reshape_poly(float s, __m128 coeffs) ++{ ++ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); ++ ps = _mm_mul_ps(ps, coeffs); ++ return reduce_floatx4(ps); ++} ++ ++X86_64_V3 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) ++{ ++ float s = _mm_cvtss_f32(coeffs); ++ int mmr_idx = 0; ++ int order = 0; ++ ++ __m256 sigX, mmr_coeffs, ps; ++ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, 
sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} ++ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} ++ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} ++ __m128 sigX0 = sigX01; // sig[0]*sig[1] now positioned at 0 ++ ++ sigX0 = _mm_insert_ps(sigX0, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 ++ sigX0 = _mm_insert_ps(sigX0, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 ++ sigX0 = _mm_insert_ps(sigX0, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3 ++ ++ sigX = _mm256_set_m128(sigX0, sig); ++ ++ mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); ++ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ ++ // dot first order ++ mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 0*4]); ++ ps = _mm256_mul_ps(sigX, mmr_coeffs); ++ s += reduce_floatx8(ps); ++ ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { ++ __m256 sigX2 = _mm256_mul_ps(sigX, sigX); ++ mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 2*4]); ++ ps = _mm256_mul_ps(sigX2, mmr_coeffs); ++ s += reduce_floatx8(ps); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { ++ __m256 sigX3 = _mm256_mul_ps(sigX2, sigX); ++ mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 4*4]); ++ ps = _mm256_mul_ps(sigX3, mmr_coeffs); ++ s += reduce_floatx8(ps); ++ } ++ } ++ ++ return s; ++} ++ ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++X86_64_V3 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) ++{ ++ int has_mmr_poly; ++ float s; ++ ++ float *src_dovi_params = ctx->dovi_pbuf; ++ float *src_dovi_pivots = ctx->dovi_pbuf + 24; ++ float *src_dovi_coeffs = ctx->dovi_pbuf + 48; 
//float4* ++ float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* ++ ++ float* dovi_params_i = src_dovi_params + 0*8; ++ float* dovi_pivots_i = src_dovi_pivots + 0*8; ++ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* ++ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* ++ int dovi_num_pivots_i = dovi_params_i[0]; ++ int dovi_has_mmr_i = dovi_params_i[1]; ++ int dovi_has_poly_i = dovi_params_i[2]; ++ int dovi_mmr_single_i = dovi_params_i[3]; ++ int dovi_min_order_i = dovi_params_i[4]; ++ int dovi_max_order_i = dovi_params_i[5]; ++ float dovi_lo_i = dovi_params_i[6]; ++ float dovi_hi_i = dovi_params_i[7]; ++ ++ float* dovi_params_p = src_dovi_params + 1*8; ++ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* ++ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* ++ int dovi_has_mmr_p = dovi_params_p[1]; ++ int dovi_has_poly_p = dovi_params_p[2]; ++ int dovi_mmr_single_p = dovi_params_p[3]; ++ int dovi_min_order_p = dovi_params_p[4]; ++ int dovi_max_order_p = dovi_params_p[5]; ++ float dovi_lo_p = dovi_params_p[6]; ++ float dovi_hi_p = dovi_params_p[7]; ++ ++ float* dovi_params_t = src_dovi_params + 2*8; ++ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* ++ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* ++ int dovi_has_mmr_t = dovi_params_t[1]; ++ int dovi_has_poly_t = dovi_params_t[2]; ++ int dovi_mmr_single_t = dovi_params_t[3]; ++ int dovi_min_order_t = dovi_params_t[4]; ++ int dovi_max_order_t = dovi_params_t[5]; ++ float dovi_lo_t = dovi_params_t[6]; ++ float dovi_hi_t = dovi_params_t[7]; ++ ++ __m128 coeffs, result; ++ ++ // reshape I ++ s = _mm_cvtss_f32(sig); ++ result = sig; ++ if (dovi_num_pivots_i > 2) { ++ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); ++ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); ++ __m128 m0123 = mix_float32x4(m01, 
m23, _mm_set1_ps(s >= dovi_pivots_i[1])); ++ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); ++ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); ++ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); ++ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); ++ } else { ++ coeffs = _mm_loadu_ps(dovi_coeffs_i); ++ } ++ ++ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_i, ++ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); ++ ++ // reshape P ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_p); ++ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_p, ++ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); ++ ++ // reshape T ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_t); ++ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; ++ ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, 
dovi_mmr_t, ++ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); ++ ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); ++ ++ return result; ++} ++ ++X86_64_V3 inline static void ycc2rgbx8(__m256* dy, __m256* dcb, __m256* dcr, ++ __m256 y, __m256 cb, __m256 cr, ++ const double nonlinear[3][3], const float ycc_offset[3]) ++{ ++ *dy = _mm256_mul_ps(y, _mm256_set1_ps((float)nonlinear[0][0])); ++ *dy = _mm256_fmadd_ps(cb, _mm256_set1_ps((float)nonlinear[0][1]), *dy); ++ *dy = _mm256_fmadd_ps(cr, _mm256_set1_ps((float)nonlinear[0][2]), *dy); ++ *dy = _mm256_sub_ps(*dy, _mm256_set1_ps(ycc_offset[0])); ++ ++ *dcb = _mm256_mul_ps(y, _mm256_set1_ps((float)nonlinear[1][0])); ++ *dcb = _mm256_fmadd_ps(cb, _mm256_set1_ps((float)nonlinear[1][1]), *dcb); ++ *dcb = _mm256_fmadd_ps(cr, _mm256_set1_ps((float)nonlinear[1][2]), *dcb); ++ *dcb = _mm256_sub_ps(*dcb, _mm256_set1_ps(ycc_offset[1])); ++ ++ *dcr = _mm256_mul_ps(y, _mm256_set1_ps((float)nonlinear[2][0])); ++ *dcr = _mm256_fmadd_ps(cb, _mm256_set1_ps((float)nonlinear[2][1]), *dcr); ++ *dcr = _mm256_fmadd_ps(cr, _mm256_set1_ps((float)nonlinear[2][2]), *dcr); ++ *dcr = _mm256_sub_ps(*dcr, _mm256_set1_ps(ycc_offset[2])); ++} ++ ++X86_64_V3 inline static void lms2rgbx8(__m256* dl, __m256* dm, __m256* ds, ++ __m256 l, __m256 m, __m256 s, ++ const double lms2rgb_matrix[3][3]) ++{ ++ *dl = _mm256_mul_ps(l, _mm256_set1_ps((float)lms2rgb_matrix[0][0])); ++ *dl = _mm256_fmadd_ps(m, _mm256_set1_ps((float)lms2rgb_matrix[0][1]), *dl); ++ *dl = _mm256_fmadd_ps(s, _mm256_set1_ps((float)lms2rgb_matrix[0][2]), *dl); ++ ++ *dm = _mm256_mul_ps(l, _mm256_set1_ps((float)lms2rgb_matrix[1][0])); ++ *dm = _mm256_fmadd_ps(m, _mm256_set1_ps((float)lms2rgb_matrix[1][1]), *dm); ++ *dm = _mm256_fmadd_ps(s, _mm256_set1_ps((float)lms2rgb_matrix[1][2]), *dm); ++ ++ *ds = _mm256_mul_ps(l, _mm256_set1_ps((float)lms2rgb_matrix[2][0])); ++ *ds = _mm256_fmadd_ps(m, 
_mm256_set1_ps((float)lms2rgb_matrix[2][1]), *ds); ++ *ds = _mm256_fmadd_ps(s, _mm256_set1_ps((float)lms2rgb_matrix[2][2]), *ds); ++} ++ ++X86_64_V3 inline static void reshapeiptx8(__m128* ipt0, __m128* ipt1, __m128* ipt2, __m128* ipt3, ++ __m128* ipt4, __m128* ipt5, __m128* ipt6, __m128* ipt7, ++ __m256 yx8, __m256 ux8, __m256 vx8, ++ const struct TonemapIntParams *params) ++{ ++ __m128 yx4a = _mm256_extractf128_ps(yx8, 0); ++ __m128 yx4b = _mm256_extractf128_ps(yx8, 1); ++ __m128 ux4a = _mm256_extractf128_ps(ux8, 0); ++ __m128 ux4b = _mm256_extractf128_ps(ux8, 1); ++ __m128 vx4a = _mm256_extractf128_ps(vx8, 0); ++ __m128 vx4b = _mm256_extractf128_ps(vx8, 1); ++ ++ __m128 ia1 = _mm_unpacklo_ps(yx4a, ux4a); ++ __m128 ia2 = _mm_unpackhi_ps(yx4a, ux4a); ++ __m128 ib1 = _mm_unpacklo_ps(vx4a, _mm_setzero_ps()); ++ __m128 ib2 = _mm_unpackhi_ps(vx4a, _mm_setzero_ps()); ++ ++ *ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ *ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ *ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ *ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ *ipt0 = reshape_dovi_iptpqc2(*ipt0, params); ++ *ipt1 = reshape_dovi_iptpqc2(*ipt1, params); ++ *ipt2 = reshape_dovi_iptpqc2(*ipt2, params); ++ *ipt3 = reshape_dovi_iptpqc2(*ipt3, params); ++ ++ ia1 = _mm_unpacklo_ps(yx4b, ux4b); ++ ia2 = _mm_unpackhi_ps(yx4b, ux4b); ++ ib1 = _mm_unpacklo_ps(vx4b, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4b, _mm_setzero_ps()); ++ ++ *ipt4 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ *ipt5 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ *ipt6 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ *ipt7 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ *ipt4 = reshape_dovi_iptpqc2(*ipt4, params); ++ *ipt5 = reshape_dovi_iptpqc2(*ipt5, params); ++ *ipt6 = reshape_dovi_iptpqc2(*ipt6, params); ++ *ipt7 = reshape_dovi_iptpqc2(*ipt7, params); ++} ++ ++X86_64_V3 inline static void 
transpose_ipt8x4(__m128 ipt0, __m128 ipt1, __m128 ipt2, __m128 ipt3,
++                                              __m128 ipt4, __m128 ipt5, __m128 ipt6, __m128 ipt7,
++                                              __m256* ix8, __m256* px8, __m256* tx8)
++{
++    __m256 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
++    tmp0 = _mm256_castps128_ps256(ipt0);
++    tmp0 = _mm256_insertf128_ps(tmp0, ipt4, 1);
++
++    tmp1 = _mm256_castps128_ps256(ipt1);
++    tmp1 = _mm256_insertf128_ps(tmp1, ipt5, 1);
++
++    tmp2 = _mm256_castps128_ps256(ipt2);
++    tmp2 = _mm256_insertf128_ps(tmp2, ipt6, 1);
++
++    tmp3 = _mm256_castps128_ps256(ipt3);
++    tmp3 = _mm256_insertf128_ps(tmp3, ipt7, 1);
++
++    tmp4 = _mm256_unpacklo_ps(tmp0, tmp1);
++    tmp5 = _mm256_unpackhi_ps(tmp0, tmp1);
++    tmp6 = _mm256_unpacklo_ps(tmp2, tmp3);
++    tmp7 = _mm256_unpackhi_ps(tmp2, tmp3);
++
++    *ix8 = _mm256_shuffle_ps(tmp4, tmp6, _MM_SHUFFLE(1, 0, 1, 0));
++    *px8 = _mm256_shuffle_ps(tmp4, tmp6, _MM_SHUFFLE(3, 2, 3, 2));
++    *tx8 = _mm256_shuffle_ps(tmp5, tmp7, _MM_SHUFFLE(1, 0, 1, 0));
++}
++// Tonemap 8 RGB pixels (one int32 lane each): LUT-linearize, optional rgb2rgb matrix and desaturation, apply the tonemap gain, delinearize to int16.
++X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in,
++                                                 int16_t *r_out, int16_t *g_out, int16_t *b_out, // 8 values are stored to each
++                                                 float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, // all indexed within [0, 32767]
++                                                 const AVLumaCoefficients *coeffs, // luma weights used for desaturation
++                                                 const AVLumaCoefficients *ocoeffs, double desat, // ocoeffs is not read here; desat > 0 enables desaturation
++                                                 double (*rgb2rgb)[3][3], // row i produces output channel i
++                                                 int rgb2rgb_passthrough) // nonzero skips the rgb2rgb matrix
++{
++    __m256i sig8;
++    __m256 mapvalx8, r_linx8, g_linx8, b_linx8;
++    __m256 offset = _mm256_set1_ps(0.5f);
++    __m256i zerox8 = _mm256_setzero_si256();
++    __m256i input_lut_offset = _mm256_set1_epi32(2048);
++    __m256i upper_bound = _mm256_set1_epi32(32767);
++    __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f);
++    __m256i r, g, b, rx8, gx8, bx8;
++
++    float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8];
++    // Gain LUT index: brightest channel, biased by 2048 and clamped to [0, 32767]
++    sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in));
++    sig8 = _mm256_add_epi32(sig8, input_lut_offset);
++    sig8 = _mm256_min_epi32(sig8, upper_bound);
++    sig8 = _mm256_max_epi32(sig8, zerox8);
++    // Per-channel linearization indices use the same bias and clamp
++    r = _mm256_add_epi32(r_in, input_lut_offset);
++    r = _mm256_min_epi32(r, upper_bound);
++    r = _mm256_max_epi32(r, zerox8);
++    g = _mm256_add_epi32(g_in, input_lut_offset);
++    g = _mm256_min_epi32(g, upper_bound);
++    g = _mm256_max_epi32(g, zerox8);
++    b = _mm256_add_epi32(b_in, input_lut_offset);
++    b = _mm256_min_epi32(b, upper_bound);
++    b = _mm256_max_epi32(b, zerox8);
++
++#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \
++r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \
++g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \
++b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)];
++
++    LOAD_LUT(0)
++    LOAD_LUT(1)
++    LOAD_LUT(2)
++    LOAD_LUT(3)
++    LOAD_LUT(4)
++    LOAD_LUT(5)
++    LOAD_LUT(6)
++    LOAD_LUT(7)
++
++#undef LOAD_LUT
++
++    mapvalx8 = _mm256_loadu_ps(mapval8);
++    r_linx8 = _mm256_loadu_ps(r_lin8);
++    g_linx8 = _mm256_loadu_ps(g_lin8);
++    b_linx8 = _mm256_loadu_ps(b_lin8);
++
++    if (!rgb2rgb_passthrough) {
++        // Snapshot r/g so every row reads the untransformed inputs (NOTE(review): keep the scalar/SSE/NEON kernels in sync)
++        const __m256 r_src = r_linx8, g_src = g_linx8;
++        r_linx8 = _mm256_mul_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[0][0]));
++        r_linx8 = _mm256_fmadd_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8);
++        r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8);
++        g_linx8 = _mm256_mul_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[1][1]));
++        g_linx8 = _mm256_fmadd_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8);
++        g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8);
++        b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2]));
++        b_linx8 = _mm256_fmadd_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8);
++        b_linx8 = _mm256_fmadd_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8);
++    }
++
++    if (desat > 0) {
++        __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS);
++        __m256 desat8 = _mm256_set1_ps((float)desat);
++        __m256 luma8 = _mm256_set1_ps(0);
++        __m256 overbright8;
++        // overbright8 = max(luma - desat, eps) / max(luma, eps); each channel is then blended toward luma by it
++        luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8);
++        luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8);
++        luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8);
++        overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8));
++        r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8);
++        r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8);
++        g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8);
++        g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8);
++        b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8);
++        b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8);
++    }
++
++    r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8);
++    g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8);
++    b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8);
++    // Scale to the delin LUT range; the +0.5 makes the truncating conversion below round to nearest
++    r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset);
++    g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset);
++    b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset);
++
++    rx8 = _mm256_cvttps_epi32(r_linx8);
++    rx8 = _mm256_min_epi32(rx8, upper_bound);
++    rx8 = _mm256_max_epi32(rx8, zerox8);
++
++    gx8 = _mm256_cvttps_epi32(g_linx8);
++    gx8 = _mm256_min_epi32(gx8, upper_bound);
++    gx8 = _mm256_max_epi32(gx8, zerox8);
++
++    bx8 = _mm256_cvttps_epi32(b_linx8);
++    bx8 = _mm256_min_epi32(bx8, upper_bound);
++    bx8 = _mm256_max_epi32(bx8, zerox8);
++
++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \
++g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \
++b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)];
++
++    SAVE_COLOR(0)
++    SAVE_COLOR(1)
++    SAVE_COLOR(2)
++    SAVE_COLOR(3)
++    SAVE_COLOR(4)
++    SAVE_COLOR(5)
++    SAVE_COLOR(6)
++    SAVE_COLOR(7)
++
++#undef SAVE_COLOR
++}
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++
++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                             const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                             const int *dstlinesize, const int *srclinesize,
++                                             int dstdepth,
int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // columns left over after the 16-wide SIMD loop, rounded down to an even count ++ // intentionally leave the last pixel empty when the input width is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7; ++ __m256 ix8, px8, tx8; ++ __m256 lx8, mx8, sx8; ++ __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b; ++ __m256 y0x8af, y0x8bf, y1x8af, y1x8bf, ux8af, ux8bf, vx8af, vx8bf; ++ for (; height > 1; height -= 2, ++
dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ++ y0x8af = _mm256_cvtepi32_ps(y0x8a); ++ y0x8bf = _mm256_cvtepi32_ps(y0x8b); ++ y1x8af = _mm256_cvtepi32_ps(y1x8a); ++ y1x8bf = _mm256_cvtepi32_ps(y1x8b); ++ ux8af = _mm256_cvtepi32_ps(ux8a); ++ ux8bf = _mm256_cvtepi32_ps(ux8b); ++ vx8af = _mm256_cvtepi32_ps(vx8a); ++ vx8bf = _mm256_cvtepi32_ps(vx8b); ++ ++ y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng)); ++ y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng)); ++ y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng)); ++ y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng)); ++ ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng)); ++ ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng)); ++ vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng)); ++ vx8bf = _mm256_div_ps(vx8bf, _mm256_set1_ps(in_rng)); ++ ++ // Reshape y0x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ 
&ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8af, ux8af, vx8af, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ ++ r0x8a = _mm256_cvtps_epi32(rx8a); ++ g0x8a = _mm256_cvtps_epi32(gx8a); ++ b0x8a = _mm256_cvtps_epi32(bx8a); ++ ++ // Reshape y1x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8af, ux8af, vx8af, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ ++ r1x8a = _mm256_cvtps_epi32(rx8a); ++ g1x8a = _mm256_cvtps_epi32(gx8a); ++ b1x8a = _mm256_cvtps_epi32(bx8a); ++ ++ // Reshape y0x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ ++ r0x8b = _mm256_cvtps_epi32(rx8b); ++ g0x8b = _mm256_cvtps_epi32(gx8b); ++ b0x8b = _mm256_cvtps_epi32(bx8b); ++ ++ // 
Reshape y1x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ ++ r1x8b = _mm256_cvtps_epi32(rx8b); ++ g1x8b = _mm256_cvtps_epi32(gx8b); ++ b1x8b = _mm256_cvtps_epi32(bx8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, 
_mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = 
_mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++} ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t 
*srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const float in_rng = (float)((1 << in_depth) - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ ++ __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7; ++ __m256 ix8, px8, tx8; ++ __m256 lx8, mx8, sx8; ++ __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b; ++ __m256 y0x8af, y0x8bf, y1x8af, 
y1x8bf, ux8af, ux8bf, vx8af, vx8bf; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ++ y0x8af = _mm256_cvtepi32_ps(y0x8a); ++ y0x8bf = _mm256_cvtepi32_ps(y0x8b); ++ y1x8af = _mm256_cvtepi32_ps(y1x8a); ++ y1x8bf = _mm256_cvtepi32_ps(y1x8b); ++ ux8af = _mm256_cvtepi32_ps(ux8a); ++ ux8bf = _mm256_cvtepi32_ps(ux8b); ++ vx8af = _mm256_cvtepi32_ps(vx8a); ++ vx8bf = _mm256_cvtepi32_ps(vx8b); ++ ++ y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng)); ++ y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng)); ++ y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng)); ++ y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng)); ++ ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng)); ++ ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng)); ++ vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng)); ++ vx8bf = _mm256_div_ps(vx8bf, 
_mm256_set1_ps(in_rng)); ++ ++ // Reshape y0x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8af, ux8af, vx8af, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ ++ r0x8a = _mm256_cvtps_epi32(rx8a); ++ g0x8a = _mm256_cvtps_epi32(gx8a); ++ b0x8a = _mm256_cvtps_epi32(bx8a); ++ ++ // Reshape y1x8a ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8af, ux8af, vx8af, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); ++ gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f)); ++ bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f)); ++ ++ r1x8a = _mm256_cvtps_epi32(rx8a); ++ g1x8a = _mm256_cvtps_epi32(gx8a); ++ b1x8a = _mm256_cvtps_epi32(bx8a); ++ ++ // Reshape y0x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y0x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ ++ r0x8b = 
_mm256_cvtps_epi32(rx8b); ++ g0x8b = _mm256_cvtps_epi32(gx8b); ++ b0x8b = _mm256_cvtps_epi32(bx8b); ++ ++ // Reshape y1x8b ++ reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3, ++ &ipt4, &ipt5, &ipt6, &ipt7, ++ y1x8bf, ux8bf, vx8bf, params); ++ ++ transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3, ++ ipt4, ipt5, ipt6, ipt7, ++ &ix8, &px8, &tx8); ++ ++ ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix); ++ ++ rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f)); ++ gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f)); ++ bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f)); ++ ++ r1x8b = _mm256_cvtps_epi32(rx8b); ++ g1x8b = _mm256_cvtps_epi32(gx8b); ++ b1x8b = _mm256_cvtps_epi32(bx8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = 
_mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = 
_mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++} ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct 
TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 16 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ 
__m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ 
r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+static const AVFilterPad tonemapx_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+}; ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+AVFilter ff_vf_tonemapx = { -+ .name = "tonemapx", -+ .description = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"), -+ .init = init, -+ .uninit = uninit, -+ .priv_size = sizeof(TonemapxContext), -+ .priv_class = &tonemapx_class, -+ FILTER_INPUTS(tonemapx_inputs), -+ FILTER_OUTPUTS(ff_video_default_filterpad), -+ FILTER_QUERY_FUNC(query_formats), -+ .flags = 
AVFILTER_FLAG_SLICE_THREADS, -+}; -Index: FFmpeg/libavfilter/vf_tonemapx.h -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,99 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, 
_mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, 
_mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = 
_mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++} ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const 
int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+#ifndef AVFILTER_TONEMAPX_H -+#define AVFILTER_TONEMAPX_H ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+#include "config.h" -+#include "colorspace.h" ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ 
__m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+#define X86_64_V2 __attribute__((target("sse4.2"))) -+#define X86_64_V3 __attribute__((target("avx2,fma"))) ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 9) || (__clang_major__ >= 11) -+# define CC_SUPPORTS_TONEMAPX_INTRINSICS -+# endif // (__GNUC__ >= 10) || (__clang_major__ >= 11) -+#endif // defined(__GNUC__) || defined(__clang__) ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS -+# if ARCH_AARCH64 -+# if HAVE_INTRINSICS_NEON -+# define ENABLE_TONEMAPX_NEON_INTRINSICS -+# endif -+# endif // ARCH_AARCH64 -+# if ARCH_X86 -+# if HAVE_INTRINSICS_SSE42 -+# define ENABLE_TONEMAPX_SSE_INTRINSICS -+# endif -+# if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3 -+# define ENABLE_TONEMAPX_AVX_INTRINSICS -+# endif -+# endif // ARCH_X86 -+#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+typedef struct TonemapIntParams { -+ double lut_peak; -+ float *lin_lut; -+ float *tonemap_lut; -+ uint16_t *delin_lut; -+ int in_yuv_off, out_yuv_off; -+ int16_t (*yuv2rgb_coeffs)[3][3][8]; -+ int16_t (*rgb2yuv_coeffs)[3][3][8]; -+ double (*rgb2rgb_coeffs)[3][3]; -+ int rgb2rgb_passthrough; -+ const AVLumaCoefficients *coeffs, *ocoeffs; -+ double desat; -+} TonemapIntParams; ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+enum SIMDVariant { -+ SIMD_NONE = -1, -+ SIMD_NEON, -+ SIMD_SSE, -+ SIMD_AVX -+}; ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, 
-+ int width, int height, -+ const struct TonemapIntParams *params); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + -+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + -+#endif // AVFILTER_TONEMAPX_H -Index: FFmpeg/libavfilter/x86/Makefile -=================================================================== ---- FFmpeg.orig/libavfilter/x86/Makefile -+++ FFmpeg/libavfilter/x86/Makefile -@@ -34,6 +34,8 @@ OBJS-$(CONFIG_STEREO3D_FILTER) - OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o - OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o - 
OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o -+OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o \ -+ x86/vf_tonemapx_intrin_avx.o - OBJS-$(CONFIG_TRANSPOSE_FILTER) += x86/vf_transpose_init.o - OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o - OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,1367 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+#include "vf_tonemapx_intrin_avx.h" ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+# include -+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) -+{ -+__m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); -+__m256i mask = _mm256_set1_epi32(~0xFFFF); -+__m256i condition = _mm256_and_si256(add_result, mask); -+__m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+__m256i shifted = _mm256_srai_epi32(a, 31); -+__m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, 
_mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); -+} ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ __m256i sig8; -+ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; -+ __m256 offset = _mm256_set1_ps(0.5f); -+ __m256i zerox8 = _mm256_setzero_si256(); -+ __m256i input_lut_offset = _mm256_set1_epi32(2048); -+ __m256i upper_bound = _mm256_set1_epi32(32767); -+ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); -+ __m256i r, g, b, rx8, gx8, bx8; ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); -+ sig8 = _mm256_add_epi32(sig8, input_lut_offset); -+ sig8 = _mm256_min_epi32(sig8, upper_bound); -+ sig8 = _mm256_max_epi32(sig8, zerox8); ++ tonemap_int32x8_avx(r0x8a, g0x8a, 
b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ r = _mm256_add_epi32(r_in, input_lut_offset); -+ r = _mm256_min_epi32(r, upper_bound); -+ r = _mm256_max_epi32(r, zerox8); -+ g = _mm256_add_epi32(g_in, input_lut_offset); -+ g = _mm256_min_epi32(g, upper_bound); -+ g = _mm256_max_epi32(g, zerox8); -+ b = _mm256_add_epi32(b_in, input_lut_offset); -+ b = _mm256_min_epi32(b, upper_bound); -+ b = _mm256_max_epi32(b, zerox8); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ -+r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ -+g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ -+b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) -+ LOAD_LUT(4) -+ LOAD_LUT(5) -+ LOAD_LUT(6) -+ LOAD_LUT(7) ++ 
robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+#undef LOAD_LUT ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ mapvalx8 = _mm256_loadu_ps(mapval8); -+ r_linx8 = _mm256_loadu_ps(r_lin8); -+ g_linx8 = _mm256_loadu_ps(g_lin8); -+ b_linx8 = _mm256_loadu_ps(b_lin8); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ if (!rgb2rgb_passthrough) { -+ r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); -+ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); -+ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u 
*)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); -+ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); -+ } ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ if (desat > 0) { -+ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); -+ __m256 desat8 = _mm256_set1_ps((float)desat); -+ __m256 luma8 = _mm256_set1_ps(0); -+ __m256 overbright8; ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); -+ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8); -+ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); -+ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); -+ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); -+ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); -+ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); -+ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); -+ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); -+ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); -+ } ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = 
_mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); -+ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); -+ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); -+ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); -+ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + -+ rx8 = _mm256_cvttps_epi32(r_linx8); -+ rx8 = _mm256_min_epi32(rx8, upper_bound); -+ rx8 = _mm256_max_epi32(rx8, zerox8); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ gx8 = _mm256_cvttps_epi32(g_linx8); -+ gx8 = _mm256_min_epi32(gx8, upper_bound); -+ gx8 = _mm256_max_epi32(gx8, zerox8); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = 
_mm256_srai_epi32(gavgx8, 2); + -+ bx8 = _mm256_cvttps_epi32(b_linx8); -+ bx8 = _mm256_min_epi32(bx8, upper_bound); -+ bx8 = _mm256_max_epi32(bx8, zerox8); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ -+g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ -+b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) -+ SAVE_COLOR(4) -+ SAVE_COLOR(5) -+ SAVE_COLOR(6) -+ SAVE_COLOR(7) ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ } ++ } + -+#undef 
SAVE_COLOR -+} ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++} + -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; ++ uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ const uint16_t *rsrcuv = srcuv; + int rheight = height; + // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd @@ -3076,7 +6060,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i ux8, vx8; ++ __m256i uvx16, uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; @@ -3090,35 +6074,42 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + 
__m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } + + y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); + y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); + y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); + y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); + y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); + y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ ux8b = 
_mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); @@ -3291,20 +6282,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], 
_mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + } + } + @@ -3312,34 +6300,30 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + if (remainw) { + int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; -+ rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; ++ uint16_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ const uint16_t *rsrcuv = srcuv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -3354,6 +6338,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 
- out_depth; + const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -3381,7 +6366,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i y0ox16; + __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; + __m256i yoax8, yobx8; -+ __m256i ux8, vx8; ++ __m256i uvx16, uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; @@ -3390,35 +6375,42 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } + + y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); + y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b 
= _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); + y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); + y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); + y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); + y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); @@ -3536,6 +6528,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + y0ox16 = _mm256_packus_epi32(yoax8, yobx8); + y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + + r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); @@ -3566,6 +6559,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + + y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); + y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); + _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + + ravgx8 
= _mm256_hadd_epi32(roax8, robx8); @@ -3591,18 +6585,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } + @@ -3610,354 +6604,513 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + if (remainw) { + int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; ++ rdstuv += offset; + rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, 
rheight, params); + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +@@ -0,0 +1,68 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H ++ ++#include "libavfilter/vf_tonemapx.h" ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t 
*srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params) ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +@@ -0,0 +1,2353 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "vf_tonemapx_intrin_sse.h" ++ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++# include ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction ++// cast the register into float register and store with movss as a workaround ++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) ++__attribute__((always_inline)) ++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { ++ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); ++ return; ++} ++#endif ++ ++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) +{ -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; -+ int rheight = height; -+ // not zero when not divisible by 16 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ __m128i mask = _mm_set1_epi32(0x7FFF); ++ __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); + -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ __m128i zero = _mm_setzero_si128(); ++ __m128i cmp = _mm_cmpeq_epi32(condition, zero); + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++ __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); ++ __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); + 
-+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ return result; ++} + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) ++{ ++ __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); ++ __m128i mask = _mm_set1_epi32(~0xFFFF); ++ __m128i condition = _mm_and_si128(add_result, mask); ++ __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ __m128i shifted = _mm_srai_epi32(a, 31); ++ __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); + -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); ++} + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; ++X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a) ++{ ++ __m128 n = _mm_sub_ps(y, x); ++ n = _mm_mul_ps(n, a); ++ n = _mm_add_ps(n, x); ++ return n; ++} + -+ __m256i 
r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++X86_64_V2 inline static float reduce_floatx4(__m128 x) { ++ x = _mm_hadd_ps(x, x); ++ x = _mm_hadd_ps(x, x); ++ return _mm_cvtss_f32(x); ++} ++ ++X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) ++{ ++ __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); ++ ps = _mm_mul_ps(ps, coeffs); ++ return reduce_floatx4(ps); ++} + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr, ++ int mmr_single, int min_order, int max_order) ++{ ++ float s = _mm_cvtss_f32(coeffs); ++ int mmr_idx = 0; ++ int order = 0; ++ ++ __m128 mmr_coeffs, ps; ++ __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]} ++ __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]} ++ __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]} ++ __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0 ++ ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1 ++ sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2 ++ sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // 
sig[0]*sig[1]*sig[2] at 3 ++ ++ mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); ++ order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); ++ ++ // dot first order ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); ++ ps = _mm_mul_ps(sig, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]); ++ ps = _mm_mul_ps(sigX, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order >= 2 && (min_order >= 2 || order >= 2)) { ++ __m128 sig2 = _mm_mul_ps(sig, sig); ++ __m128 sigX2 = _mm_mul_ps(sigX, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]); ++ ps = _mm_mul_ps(sig2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]); ++ ps = _mm_mul_ps(sigX2, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ ++ if (max_order == 3 && (min_order == 3 || order >= 3)) { ++ __m128 sig3 = _mm_mul_ps(sig2, sig); ++ __m128 sigX3 = _mm_mul_ps(sigX2, sigX); ++ ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]); ++ ps = _mm_mul_ps(sig3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]); ++ ps = _mm_mul_ps(sigX3, mmr_coeffs); ++ s += reduce_floatx4(ps); ++ } ++ } + -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); -+ } ++ return s; ++} + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = 
_mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) ++X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) ++{ ++ int has_mmr_poly; ++ float s; ++ ++ float *src_dovi_params = ctx->dovi_pbuf; ++ float *src_dovi_pivots = ctx->dovi_pbuf + 24; ++ float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4* ++ float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4* ++ ++ float* dovi_params_i = src_dovi_params + 0*8; ++ float* dovi_pivots_i = src_dovi_pivots + 0*8; ++ float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4* ++ float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4* ++ int dovi_num_pivots_i = dovi_params_i[0]; ++ int dovi_has_mmr_i = dovi_params_i[1]; ++ int dovi_has_poly_i = dovi_params_i[2]; ++ int dovi_mmr_single_i = dovi_params_i[3]; ++ int dovi_min_order_i = dovi_params_i[4]; ++ int dovi_max_order_i = dovi_params_i[5]; ++ float dovi_lo_i = dovi_params_i[6]; ++ float dovi_hi_i = dovi_params_i[7]; ++ ++ float* dovi_params_p = src_dovi_params + 1*8; ++ float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4* ++ float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4* ++ int dovi_has_mmr_p = dovi_params_p[1]; ++ int dovi_has_poly_p = dovi_params_p[2]; ++ int dovi_mmr_single_p = dovi_params_p[3]; ++ int dovi_min_order_p = dovi_params_p[4]; ++ int dovi_max_order_p = dovi_params_p[5]; ++ float dovi_lo_p = dovi_params_p[6]; ++ float dovi_hi_p = dovi_params_p[7]; ++ ++ float* dovi_params_t = src_dovi_params + 2*8; ++ float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4* ++ float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4* ++ int dovi_has_mmr_t = dovi_params_t[1]; ++ int dovi_has_poly_t = dovi_params_t[2]; ++ int dovi_mmr_single_t = dovi_params_t[3]; ++ int dovi_min_order_t = dovi_params_t[4]; ++ int dovi_max_order_t = 
dovi_params_t[5]; ++ float dovi_lo_t = dovi_params_t[6]; ++ float dovi_hi_t = dovi_params_t[7]; ++ ++ __m128 coeffs, result; ++ ++ // reshape I ++ s = _mm_cvtss_f32(sig); ++ result = sig; ++ if (dovi_num_pivots_i > 2) { ++ __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); ++ __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2])); ++ __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1])); ++ __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4])); ++ __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6])); ++ __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5])); ++ coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); ++ } else { ++ coeffs = _mm_loadu_ps(dovi_coeffs_i); ++ } + -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i; + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_i, ++ dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i); + -+ r1x8a 
= g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ // reshape P ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_p); ++ has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p; + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_p, ++ dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = 
_mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ // reshape T ++ s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); ++ coeffs = _mm_loadu_ps(dovi_coeffs_t); ++ has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t; + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) ++ s = reshape_poly(s, coeffs); ++ else ++ s = reshape_mmr(result, coeffs, dovi_mmr_t, ++ dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ return result; ++} + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, ++ __m128 y, __m128 cb, __m128 cr, ++ const double 
nonlinear[3][3], const float ycc_offset[3]) ++{ ++ *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1]))); ++ *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2]))); ++ *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); ++ ++ *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1]))); ++ *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2]))); ++ *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1])); ++ ++ *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1]))); ++ *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2]))); ++ *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2])); ++} + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, ++ __m128 l, __m128 m, __m128 s, ++ const double lms2rgb_matrix[3][3]) ++{ ++ *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1]))); ++ *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2]))); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1]))); ++ *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2]))); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ 
params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ *ds = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[2][0])); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1]))); ++ *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2]))); ++} + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128i input_lut_offset = _mm_set1_epi32(2048); ++ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); ++ __m128i r, g, b, rx4, gx4, bx4; + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 
0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); ++ sig4 = _mm_add_epi32(sig4, input_lut_offset); ++ sig4 = av_clip_uint16_sse(sig4); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r = _mm_add_epi32(r_in, input_lut_offset); ++ r = av_clip_uint16_sse(r); ++ g = _mm_add_epi32(g_in, input_lut_offset); ++ g = av_clip_uint16_sse(g); ++ b = _mm_add_epi32(b_in, input_lut_offset); ++ b = av_clip_uint16_sse(b); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ 
_mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++#undef LOAD_LUT + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ if (!rgb2rgb_passthrough) { ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); + -+ y1obx8 = 
_mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ } + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ if (desat > 0) { ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, 
_mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ } + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 
= _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = av_clip_uint16_sse(rx4); ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = av_clip_uint16_sse(gx4); ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = av_clip_uint16_sse(bx4); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); -+ } -+ } ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff0; -+ rdsty += offset; -+ rdstuv += offset; -+ rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); -+ } -+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ ++#undef SAVE_COLOR +} ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS + -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ uint16_t 
*rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ const float in_rng = (float)((1 << in_depth) - 1); + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + + int cry = (*params->rgb2yuv_coeffs)[0][0][0]; + int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; @@ -3968,504 +7121,746 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); -+ -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, 
g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; -+ -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; -+ -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); -+ -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); -+ } -+ -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); -+ -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); -+ -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = 
_mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); -+ -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); -+ -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); -+ -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ 
__m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ __m128 ipt0, ipt1, ipt2, ipt3; ++ __m128 ia1, ib1, ia2, ib2; ++ __m128 ix4, px4, tx4; ++ __m128 lx4, mx4, sx4; ++ __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ 
g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ y0x4af = _mm_cvtepi32_ps(y0x4a); ++ y0x4bf = _mm_cvtepi32_ps(y0x4b); ++ y1x4af = _mm_cvtepi32_ps(y1x4a); ++ y1x4bf = _mm_cvtepi32_ps(y1x4b); ++ ux4af = _mm_cvtepi32_ps(ux4a); ++ ux4bf = _mm_cvtepi32_ps(ux4b); ++ vx4af = _mm_cvtepi32_ps(vx4a); ++ vx4bf = _mm_cvtepi32_ps(vx4b); ++ ++ y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng)); ++ y0x4bf = _mm_div_ps(y0x4bf, _mm_set1_ps(in_rng)); ++ y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng)); ++ y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng)); ++ ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng)); ++ ux4bf = _mm_div_ps(ux4bf, _mm_set1_ps(in_rng)); ++ vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng)); ++ vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = _mm_unpacklo_ps(y0x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y0x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, 
ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ ++ r0x4a = _mm_cvtps_epi32(rx4a); ++ g0x4a = _mm_cvtps_epi32(gx4a); ++ b0x4a = _mm_cvtps_epi32(bx4a); ++ ++ // Reshape y1x4a ++ ia1 = _mm_unpacklo_ps(y1x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y1x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 
2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ ++ r1x4a = _mm_cvtps_epi32(rx4a); ++ g1x4a = _mm_cvtps_epi32(gx4a); ++ b1x4a = _mm_cvtps_epi32(bx4a); ++ ++ // Reshape y0x4b ++ ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = 
_mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ ++ r0x4b = _mm_cvtps_epi32(rx4b); ++ g0x4b = _mm_cvtps_epi32(gx4b); ++ b0x4b = _mm_cvtps_epi32(bx4b); ++ ++ // Reshape y1x4b ++ ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, 
&bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ ++ r1x4b = _mm_cvtps_epi32(rx4b); ++ g1x4b = _mm_cvtps_epi32(gx4b); ++ b1x4b = _mm_cvtps_epi32(bx4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); -+ -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); -+ -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ r1ox16 = 
_mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, 
_mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ y1ox8 = 
_mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); 
-+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } -+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -@@ -0,0 +1,54 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is 
part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ + -+#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -+ -+#include "libavfilter/vf_tonemapx.h" -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, 
uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,1359 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "vf_tonemapx_intrin_sse.h" -+ ++ const struct TonemapIntParams *params) ++{ +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+# include -+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel empty when input is odd ++ int remainw = width & 6; + -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction -+// cast the register into float register and store with movss as a workaround -+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) -+__attribute__((always_inline)) -+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { -+ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); -+ return; -+} -+#endif ++ const int in_depth = srcdepth; ++ const float in_rng = (float)((1 << in_depth) - 1); + -+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) -+{ -+__m128i mask = _mm_set1_epi32(0x7FFF); -+__m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+__m128i zero = _mm_setzero_si128(); -+__m128i cmp = _mm_cmpeq_epi32(condition, zero); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = 
(*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+__m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); -+__m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+return result; -+} ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) -+{ -+__m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); -+__m128i mask = _mm_set1_epi32(~0xFFFF); -+__m128i condition = _mm_and_si128(add_result, mask); -+__m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; + -+__m128i shifted = _mm_srai_epi32(a, 31); -+__m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + -+return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); -+} ++ __m128 ipt0, ipt1, ipt2, ipt3; ++ __m128 ia1, ib1, ia2, ib2; ++ __m128 ix4, px4, tx4; ++ __m128 lx4, mx4, sx4; ++ __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+X86_64_V2 static inline 
void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ __m128i sig4; -+ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; -+ __m128 offset = _mm_set1_ps(0.5f); -+ __m128i input_lut_offset = _mm_set1_epi32(2048); -+ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); -+ __m128i r, g, b, rx4, gx4, bx4; ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); + -+ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); -+ sig4 = _mm_add_epi32(sig4, input_lut_offset); -+ sig4 = av_clip_uint16_sse(sig4); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + -+ r = _mm_add_epi32(r_in, input_lut_offset); -+ r = av_clip_uint16_sse(r); -+ g = _mm_add_epi32(g_in, input_lut_offset); -+ g = av_clip_uint16_sse(g); -+ b = _mm_add_epi32(b_in, input_lut_offset); -+ b = av_clip_uint16_sse(b); ++ y0x4af = _mm_cvtepi32_ps(y0x4a); ++ y0x4bf = _mm_cvtepi32_ps(y0x4b); ++ y1x4af = _mm_cvtepi32_ps(y1x4a); ++ y1x4bf = _mm_cvtepi32_ps(y1x4b); ++ ux4af = _mm_cvtepi32_ps(ux4a); ++ ux4bf = _mm_cvtepi32_ps(ux4b); ++ vx4af = _mm_cvtepi32_ps(vx4a); ++ vx4bf = _mm_cvtepi32_ps(vx4b); ++ ++ y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng)); ++ y0x4bf = _mm_div_ps(y0x4bf, 
_mm_set1_ps(in_rng)); ++ y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng)); ++ y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng)); ++ ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng)); ++ ux4bf = _mm_div_ps(ux4bf, _mm_set1_ps(in_rng)); ++ vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng)); ++ vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng)); ++ ++ // Reshape y0x4a ++ ia1 = _mm_unpacklo_ps(y0x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y0x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ ++ r0x4a = _mm_cvtps_epi32(rx4a); ++ g0x4a = _mm_cvtps_epi32(gx4a); ++ b0x4a = _mm_cvtps_epi32(bx4a); ++ 
++ // Reshape y1x4a ++ ia1 = _mm_unpacklo_ps(y1x4af, ux4af); ++ ia2 = _mm_unpackhi_ps(y1x4af, ux4af); ++ ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f)); ++ gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f)); ++ bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f)); ++ ++ r1x4a = _mm_cvtps_epi32(rx4a); ++ g1x4a = _mm_cvtps_epi32(gx4a); ++ b1x4a = _mm_cvtps_epi32(bx4a); ++ ++ // Reshape y0x4b ++ ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = 
_mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ ++ r0x4b = _mm_cvtps_epi32(rx4b); ++ g0x4b = _mm_cvtps_epi32(gx4b); ++ b0x4b = _mm_cvtps_epi32(bx4b); ++ ++ // Reshape y1x4b ++ ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf); ++ ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf); ++ ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps()); ++ ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps()); ++ ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2)); ++ ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ipt0 = reshape_dovi_iptpqc2(ipt0, params); ++ ipt1 = reshape_dovi_iptpqc2(ipt1, params); ++ ipt2 = reshape_dovi_iptpqc2(ipt2, params); ++ ipt3 = reshape_dovi_iptpqc2(ipt3, params); ++ ++ ipt0 = _mm_shuffle_ps(ipt0, 
ipt0, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0)); ++ ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0)); ++ ++ ia1 = _mm_unpacklo_ps(ipt0, ipt1); ++ ia2 = _mm_unpacklo_ps(ipt2, ipt3); ++ ib1 = _mm_unpackhi_ps(ipt0, ipt1); ++ ib2 = _mm_unpackhi_ps(ipt2, ipt3); ++ ++ ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0)); ++ px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0)); ++ tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2)); ++ ++ ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset); ++ lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix); ++ ++ rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f)); ++ gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f)); ++ bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f)); ++ ++ r1x4b = _mm_cvtps_epi32(rx4b); ++ g1x4b = _mm_cvtps_epi32(gx4b); ++ b1x4b = _mm_cvtps_epi32(bx4b); + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ -+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ -+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ -+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, 
g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+#undef LOAD_LUT ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ mapvalx4 = _mm_loadu_ps(mapval4); -+ r_linx4 = _mm_loadu_ps(r_lin4); -+ g_linx4 = _mm_loadu_ps(g_lin4); -+ b_linx4 = _mm_loadu_ps(b_lin4); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ if (!rgb2rgb_passthrough) { -+ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = 
_mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); -+ } ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ if (desat > 0) { -+ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); -+ __m128 desat4 = _mm_set1_ps((float)desat); -+ __m128 luma4 = _mm_set1_ps(0); -+ __m128 overbright4; ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); -+ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); -+ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); -+ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); -+ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); -+ } ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); -+ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); -+ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = 
_mm_unpackhi_epi16(b1ox8, zero128); + -+ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); -+ r_linx4 = _mm_add_ps(r_linx4, offset); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); -+ g_linx4 = _mm_add_ps(g_linx4, offset); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); -+ b_linx4 = _mm_add_ps(b_linx4, offset); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = av_clip_uint16_sse(rx4); -+ gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = av_clip_uint16_sse(gx4); -+ bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = av_clip_uint16_sse(bx4); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ -+g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ -+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, 
_mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+#undef SAVE_COLOR -+} ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, out_sh); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); ++ ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, out_sh); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); ++ } ++ } ++ ++ // Process the remaining pixels that cannot fill a full SIMD register with the scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++} + +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, @@ -5655,7 +9050,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h 
=================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h -@@ -0,0 +1,54 @@ +@@ -0,0 +1,68 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -5681,6 +9076,20 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + +#include "libavfilter/vf_tonemapx.h" + ++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize,