diff --git a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch
index 5df4dfe9e09..e5ae04b5872 100644
--- a/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch
+++ b/debian/patches/0060-add-simd-optimized-tonemapx-filter.patch
@@ -95,7 +95,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 ===================================================================
 --- /dev/null
 +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
-@@ -0,0 +1,1229 @@
+@@ -0,0 +1,2149 @@
 +/*
 + * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
 + *
@@ -123,6 +123,212 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +
 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
++inline static float32x4_t mix_float32x4(float32x4_t x, float32x4_t y, float32x4_t a)
++{ // Per-lane linear blend of x and y with weight a: returns x + (y - x) * a.
++    float32x4_t n = vsubq_f32(y, x); // n = y - x
++    n = vfmaq_f32(x, n, a); // fused multiply-add: x + n * a
++    return n;
++}
++
++static inline float reshape_poly(float s, float32x4_t coeffs)
++{ // Evaluates coeffs[0] + coeffs[1]*s + coeffs[2]*s*s; lane 3 of coeffs is multiplied by 0.
++    float32x4_t ps = vdupq_n_f32(0.0f);
++    ps = vsetq_lane_f32(1.0f, ps, 0);
++    ps = vsetq_lane_f32(s, ps, 1);
++    ps = vsetq_lane_f32(s * s, ps, 2); // ps is now {1, s, s*s, 0}
++    ps = vmulq_f32(ps, coeffs);
++    return vaddvq_f32(ps); // horizontal sum of the four products
++}
++
++inline static float reshape_mmr(float32x4_t sig, float32x4_t coeffs, const float* mmr,
++                                int mmr_single, int min_order, int max_order)
++{ // Starts from coeffs lane 0 and accumulates dot products of 4-float rows of mmr with sig, its cross products and their powers.
++    int mmr_idx = mmr_single ? 0 : (int)vgetq_lane_f32(coeffs, 1); // NOTE(review): added below as a raw float offset while rows step by 4 floats; confirm the units match how mmr is packed
++    int order = (int)vgetq_lane_f32(coeffs, 3); // order taken from coeffs lane 3, truncated to int
++    float s = vgetq_lane_f32(coeffs, 0); // initial value of the accumulated sum
++    float32x4_t mmr_coeffs, ps;
++    float32x4_t sigX01 = vmulq_laneq_f32(sig, sig, 1); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]}
++    float32x4_t sigX02 = vmulq_laneq_f32(sig, sig, 2); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]}
++    float32x4_t sigX12 = vmulq_laneq_f32(sigX01, sig, 2); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]}
++    float32x4_t sigX = sigX01; // sig[0]*sig[1] now positioned at 0
++    sigX = vsetq_lane_f32(vgetq_lane_f32(sigX02, 0), sigX, 1); // sig[0]*sig[2] at 1
++    sigX = vsetq_lane_f32(vgetq_lane_f32(sigX02, 1), sigX, 2); // sig[1]*sig[2] at 2
++    sigX = vsetq_lane_f32(vgetq_lane_f32(sigX12, 0), sigX, 3); // sig[0]*sig[1]*sig[2] at 3
++
++    // dot first order
++    mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 0*4]); // row 0 pairs with sig
++    ps = vmulq_f32(sig, mmr_coeffs);
++    s += vaddvq_f32(ps);
++    mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 1*4]); // row 1 pairs with the cross products in sigX
++    ps = vmulq_f32(sigX, mmr_coeffs);
++    s += vaddvq_f32(ps);
++
++    if (max_order >= 2 && (min_order >= 2 || order >= 2)) { // second-order terms, only when the order limits allow them
++        float32x4_t sig2 = vmulq_f32(sig, sig); // per-lane squares of sig
++        float32x4_t sigX2 = vmulq_f32(sigX, sigX); // per-lane squares of sigX
++
++        mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 2*4]);
++        ps = vmulq_f32(sig2, mmr_coeffs);
++        s += vaddvq_f32(ps);
++        mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 3*4]);
++        ps = vmulq_f32(sigX2, mmr_coeffs);
++        s += vaddvq_f32(ps);
++
++        if (max_order == 3 && (min_order == 3 || order >= 3)) { // third-order terms, nested so they only run when second order ran
++            float32x4_t sig3 = vmulq_f32(sig2, sig); // per-lane cubes of sig
++            float32x4_t sigX3 = vmulq_f32(sigX2, sigX); // per-lane cubes of sigX
++
++            mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 4*4]);
++            ps = vmulq_f32(sig3, mmr_coeffs);
++            s += vaddvq_f32(ps);
++            mmr_coeffs = vld1q_f32(&mmr[mmr_idx + 5*4]);
++            ps = vmulq_f32(sigX3, mmr_coeffs);
++            s += vaddvq_f32(ps);
++        }
++    }
++
++    return s;
++}
++
++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) // FFMAX against b, then FFMIN against c: limits a to [b, c]
++inline static float32x4_t reshape_dovi_iptpqc2(float32x4_t sig, const TonemapIntParams *ctx)
++{ // Reshapes lanes 0..2 of sig (labelled I, P, T below) with parameters read from ctx->dovi_pbuf; lane 3 is passed through unchanged.
++    int has_mmr_poly;
++    float s;
++
++    float *src_dovi_params = ctx->dovi_pbuf; // parameters: 8 floats per component, starting at offset 0
++    float *src_dovi_pivots = ctx->dovi_pbuf + 24; // pivots start at float offset 24
++    float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4*
++    float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4*
++
++    float* dovi_params_i = src_dovi_params + 0*8;
++    float* dovi_pivots_i = src_dovi_pivots + 0*8;
++    float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; //float4*
++    float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; //float4*
++    int dovi_num_pivots_i = dovi_params_i[0]; // float parameters are converted to int on assignment
++    int dovi_has_mmr_i = dovi_params_i[1];
++    int dovi_has_poly_i = dovi_params_i[2];
++    int dovi_mmr_single_i = dovi_params_i[3];
++    int dovi_min_order_i = dovi_params_i[4];
++    int dovi_max_order_i = dovi_params_i[5];
++    float dovi_lo_i = dovi_params_i[6]; // lower clamp bound for the reshaped I value
++    float dovi_hi_i = dovi_params_i[7]; // upper clamp bound for the reshaped I value
++
++    float* dovi_params_p = src_dovi_params + 1*8; // second component: same layout, one stride further in each table
++    float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; //float4*
++    float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; //float4*
++    int dovi_has_mmr_p = dovi_params_p[1];
++    int dovi_has_poly_p = dovi_params_p[2];
++    int dovi_mmr_single_p = dovi_params_p[3];
++    int dovi_min_order_p = dovi_params_p[4];
++    int dovi_max_order_p = dovi_params_p[5];
++    float dovi_lo_p = dovi_params_p[6];
++    float dovi_hi_p = dovi_params_p[7];
++
++    float* dovi_params_t = src_dovi_params + 2*8; // third component
++    float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; //float4*
++    float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; //float4*
++    int dovi_has_mmr_t = dovi_params_t[1];
++    int dovi_has_poly_t = dovi_params_t[2];
++    int dovi_mmr_single_t = dovi_params_t[3];
++    int dovi_min_order_t = dovi_params_t[4];
++    int dovi_max_order_t = dovi_params_t[5];
++    float dovi_lo_t = dovi_params_t[6];
++    float dovi_hi_t = dovi_params_t[7];
++
++    float32x4_t coeffs, result;
++
++    // reshape I
++    s = vgetq_lane_f32(sig, 0);
++    result = sig; // lanes 0..2 are overwritten one by one below
++    if (dovi_num_pivots_i > 2) { // several pieces: choose among 8 coefficient rows by comparing s with pivots 0..6
++        float32x4_t m01 = mix_float32x4(vld1q_f32(dovi_coeffs_i), vld1q_f32(dovi_coeffs_i + 4), vdupq_n_f32(s >= dovi_pivots_i[0])); // each comparison gives a blend weight of 0.0f or 1.0f
++        float32x4_t m23 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 2*4), vld1q_f32(dovi_coeffs_i + 3*4), vdupq_n_f32(s >= dovi_pivots_i[2]));
++        float32x4_t m0123 = mix_float32x4(m01, m23, vdupq_n_f32(s >= dovi_pivots_i[1]));
++        float32x4_t m45 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 4*4), vld1q_f32(dovi_coeffs_i + 5*4), vdupq_n_f32(s >= dovi_pivots_i[4]));
++        float32x4_t m67 = mix_float32x4(vld1q_f32(dovi_coeffs_i + 6*4), vld1q_f32(dovi_coeffs_i + 7*4), vdupq_n_f32(s >= dovi_pivots_i[6]));
++        float32x4_t m4567 = mix_float32x4(m45, m67, vdupq_n_f32(s >= dovi_pivots_i[5]));
++        coeffs = mix_float32x4(m0123, m4567, vdupq_n_f32(s >= dovi_pivots_i[3])); // final choice between the lower and upper four rows
++    } else {
++        coeffs = vld1q_f32(dovi_coeffs_i); // otherwise only the first coefficient row is used
++    }
++
++    has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i;
++
++    if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) // polynomial when both flags are set and coeffs lane 3 is 0, or when only the poly flag is set
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_i,
++                        dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i);
++
++    result = vsetq_lane_f32(CLAMP(s, dovi_lo_i, dovi_hi_i), result, 0); // store the clamped value in lane 0
++
++    // reshape P
++    s = vgetq_lane_f32(sig, 1);
++    coeffs = vld1q_f32(dovi_coeffs_p); // only the first coefficient row is read for this component
++    has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p;
++
++    if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p))
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_p, // NOTE(review): result already holds the reshaped lane 0 here; confirm this is intended rather than the original sig
++                        dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p);
++
++    result = vsetq_lane_f32(CLAMP(s, dovi_lo_p, dovi_hi_p), result, 1); // store the clamped value in lane 1
++
++    // reshape T
++    s = vgetq_lane_f32(sig, 2);
++    coeffs = vld1q_f32(dovi_coeffs_t); // only the first coefficient row is read for this component
++    has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t;
++
++    if ((has_mmr_poly && vgetq_lane_f32(coeffs, 3) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t))
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_t, // NOTE(review): result holds reshaped lanes 0 and 1 at this point; confirm intended
++                        dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t);
++
++    result = vsetq_lane_f32(CLAMP(s, dovi_lo_t, dovi_hi_t), result, 2); // store the clamped value in lane 2
++
++    return result;
++}
++
++inline static void ycc2rgbx4(float32x4_t* dy, float32x4_t* dcb, float32x4_t* dcr,
++                             float32x4_t y, float32x4_t cb, float32x4_t cr,
++                             const double nonlinear[3][3], const float ycc_offset[3])
++{ // For four pixels at once: each output is one row of nonlinear applied to (y, cb, cr), minus the matching ycc_offset entry.
++    *dy = vmulq_n_f32(y, (float)nonlinear[0][0]); // matrix entries are narrowed from double to float before use
++    *dy = vfmaq_n_f32(*dy, cb, (float)nonlinear[0][1]);
++    *dy = vfmaq_n_f32(*dy, cr, (float)nonlinear[0][2]);
++    *dy = vsubq_f32(*dy, vdupq_n_f32(ycc_offset[0])); // the offset is subtracted after the matrix row
++
++    *dcb = vmulq_n_f32(y, (float)nonlinear[1][0]);
++    *dcb = vfmaq_n_f32(*dcb, cb, (float)nonlinear[1][1]);
++    *dcb = vfmaq_n_f32(*dcb, cr, (float)nonlinear[1][2]);
++    *dcb = vsubq_f32(*dcb, vdupq_n_f32(ycc_offset[1]));
++
++    *dcr = vmulq_n_f32(y, (float)nonlinear[2][0]);
++    *dcr = vfmaq_n_f32(*dcr, cb, (float)nonlinear[2][1]);
++    *dcr = vfmaq_n_f32(*dcr, cr, (float)nonlinear[2][2]);
++    *dcr = vsubq_f32(*dcr, vdupq_n_f32(ycc_offset[2]));
++}
++
++inline static void lms2rgbx4(float32x4_t* dl, float32x4_t* dm, float32x4_t* ds,
++                             float32x4_t l, float32x4_t m, float32x4_t s,
++                             const double lms2rgb_matrix[3][3])
++{ // For four pixels at once: multiplies (l, m, s) by the 3x3 lms2rgb_matrix; *dl, *dm and *ds receive rows 0, 1 and 2.
++    *dl = vmulq_n_f32(l, (float)lms2rgb_matrix[0][0]); // matrix entries are narrowed from double to float before use
++    *dl = vfmaq_n_f32(*dl, m, (float)lms2rgb_matrix[0][1]);
++    *dl = vfmaq_n_f32(*dl, s, (float)lms2rgb_matrix[0][2]);
++
++    *dm = vmulq_n_f32(l, (float)lms2rgb_matrix[1][0]);
++    *dm = vfmaq_n_f32(*dm, m, (float)lms2rgb_matrix[1][1]);
++    *dm = vfmaq_n_f32(*dm, s, (float)lms2rgb_matrix[1][2]);
++
++    *ds = vmulq_n_f32(l, (float)lms2rgb_matrix[2][0]);
++    *ds = vfmaq_n_f32(*ds, m, (float)lms2rgb_matrix[2][1]);
++    *ds = vfmaq_n_f32(*ds, s, (float)lms2rgb_matrix[2][2]);
++}
++
 +static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in,
 +                                        int16_t *r_out, int16_t *g_out, int16_t *b_out,
 +                                        float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
@@ -311,12 +517,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +}
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +
-+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                      const int *dstlinesize, const int *srclinesize,
-+                                      int dstdepth, int srcdepth,
-+                                      int width, int height,
-+                                      const struct TonemapIntParams *params)
++void tonemap_frame_dovi_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                    const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                    const int *dstlinesize, const int *srclinesize,
++                                    int dstdepth, int srcdepth,
++                                    int width, int height,
++                                    const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
 +    uint8_t *rdsty = dsty;
@@ -331,20 +537,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    int remainw = width & 6;
 +
 +    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
-+
 +    const int out_depth = dstdepth;
 +    const int out_uv_offset = 128 << (out_depth - 8);
 +    const int out_sh = 29 - out_depth;
 +    const int out_rnd = 1 << (out_sh - 1);
-+
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++    const float in_rng = (float)((1 << in_depth) - 1);
 +
 +    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
 +    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
@@ -357,15 +554,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +
 +    int16_t r[8], g[8], b[8];
 +    int16_t r1[8], g1[8], b1[8];
-+    uint16_t cy_shifted = av_clip_int16(cy >> in_sh);
-+    uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh);
-+    uint16_t crv_shifted = av_clip_int16(crv >> in_sh);
-+    uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh);
-+    uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh);
-+    uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh);
-+    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
-+    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
-+    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
 +    uint16x8_t y0x8, y1x8, ux8, vx8;
 +    uint16x8_t r0x8, g0x8, b0x8;
 +    uint16x8_t r1x8, g1x8, b1x8;
@@ -386,6 +574,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    int32x4_t out_rndx4 = vdupq_n_s32(out_rnd);
 +    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
 +    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
++    float32x4_t ipt0, ipt1, ipt2, ipt3;
++    float32x4_t ia1, ib1, ia2, ib2;
++    float32x4_t ix4, px4, tx4;
++    float32x4_t lx4, mx4, sx4;
++    float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b;
++    float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b;
 +    for (; height > 1; height -= 2,
 +                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
 +                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
@@ -397,34 +591,159 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            ux4 = vld1_u16(srcu + (x >> 1));
 +            vx4 = vld1_u16(srcv + (x >> 1));
 +
-+            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
-+            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
 +            ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4));
-+            ux8 = vsubq_u16(ux8, in_uv_offx8);
 +            vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4));
-+            vx8 = vsubq_u16(vx8, in_uv_offx8);
-+
-+            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
-+            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
-+            r0x8 = vaddq_u16(r0x8, rndx8);
-+
-+            g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted);
-+            g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted);
-+            g0x8 = vaddq_u16(g0x8, rndx8);
-+
-+            b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted);
-+            b0x8 = vaddq_u16(b0x8, rndx8);
-+
-+            r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted);
-+            r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted);
-+            r1x8 = vaddq_u16(r1x8, rndx8);
-+
-+            g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted);
-+            g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted);
-+            g1x8 = vaddq_u16(g1x8, rndx8);
 +
-+            b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted);
-+            b1x8 = vaddq_u16(b1x8, rndx8);
++            y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8)));
++            y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8)));
++            y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8)));
++            y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8)));
++
++            ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8)));
++            ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8)));
++            vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8)));
++            vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8)));
++
++            y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(in_rng));
++            y0x4b = vdivq_f32(y0x4b, vdupq_n_f32(in_rng));
++            y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(in_rng));
++            y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(in_rng));
++            ux4a = vdivq_f32(ux4a, vdupq_n_f32(in_rng));
++            ux4b = vdivq_f32(ux4b, vdupq_n_f32(in_rng));
++            vx4a = vdivq_f32(vx4a, vdupq_n_f32(in_rng));
++            vx4b = vdivq_f32(vx4b, vdupq_n_f32(in_rng));
++
++            // Reshape y0x4a
++            ia1 = vzip1q_f32(y0x4a, ux4a);
++            ia2 = vzip2q_f32(y0x4a, ux4a);
++            ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = vmulq_n_f32(rx4a, 28672.0f);
++            gx4a = vmulq_n_f32(gx4a, 28672.0f);
++            bx4a = vmulq_n_f32(bx4a, 28672.0f);
++
++            // Reshape y0x4b
++            ia1 = vzip1q_f32(y0x4b, ux4b);
++            ia2 = vzip2q_f32(y0x4b, ux4b);
++            ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = vmulq_n_f32(rx4b, 28672.0f);
++            gx4b = vmulq_n_f32(gx4b, 28672.0f);
++            bx4b = vmulq_n_f32(bx4b, 28672.0f);
++
++            r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b)));
++            g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b)));
++            b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b)));
++
++            // Reshape y1x4a
++            ia1 = vzip1q_f32(y1x4a, ux4a);
++            ia2 = vzip2q_f32(y1x4a, ux4a);
++            ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = vmulq_n_f32(rx4a, 28672.0f);
++            gx4a = vmulq_n_f32(gx4a, 28672.0f);
++            bx4a = vmulq_n_f32(bx4a, 28672.0f);
++
++            // Reshape y1x4b
++            ia1 = vzip1q_f32(y1x4b, ux4b);
++            ia2 = vzip2q_f32(y1x4b, ux4b);
++            ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = vmulq_n_f32(rx4b, 28672.0f);
++            gx4b = vmulq_n_f32(gx4b, 28672.0f);
++            bx4b = vmulq_n_f32(bx4b, 28672.0f);
++
++            r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b)));
++            g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b)));
++            b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b)));
 +
 +            tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b,
 +                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
@@ -546,7 +865,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +        rsrcy += offset;
 +        rsrcu += offset >> 1;
 +        rsrcv += offset >> 1;
-+        tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv,
++        tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv,
 +                                    rsrcy, rsrcu, rsrcv,
 +                                    dstlinesize, srclinesize,
 +                                    dstdepth, srcdepth,
@@ -555,18 +874,20 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
 +
-+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv,
-+                                         const uint16_t *srcy, const uint16_t *srcuv,
-+                                         const int *dstlinesize, const int *srclinesize,
-+                                         int dstdepth, int srcdepth,
-+                                         int width, int height,
-+                                         const struct TonemapIntParams *params)
++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                      const int *dstlinesize, const int *srclinesize,
++                                      int dstdepth, int srcdepth,
++                                      int width, int height,
++                                      const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
 +    uint8_t *rdsty = dsty;
-+    uint8_t *rdstuv = dstuv;
++    uint8_t *rdstu = dstu;
++    uint8_t *rdstv = dstv;
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcuv = srcuv;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
 +    int rheight = height;
 +    // not zero when not divisible by 8
 +    // intentionally leave last pixel emtpy when input is odd
@@ -608,11 +929,10 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
 +    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
 +    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
-+    uint16x8_t uvx8;
-+    uint16x4_t ux2a, vx2a, ux2b, vx2b;
 +    uint16x8_t y0x8, y1x8, ux8, vx8;
 +    uint16x8_t r0x8, g0x8, b0x8;
 +    uint16x8_t r1x8, g1x8, b1x8;
++    uint16x4_t ux4, vx4;
 +
 +    int16x8_t r0ox8, g0ox8, b0ox8;
 +    int16x8_t y0ox8;
@@ -623,7 +943,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    int16x8_t y1ox8;
 +    int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
 +    int32x4_t y1oax4, y1obx4;
-+    int32x4_t uvoax4, uvobx4;
 +    int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
 +    int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
 +    int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
@@ -631,32 +950,22 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
 +    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
 +    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
 +        for (int xx = 0; xx < width >> 3; xx++) {
 +            int x = xx << 3;
 +
 +            y0x8 = vld1q_u16(srcy + x);
 +            y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x));
-+            uvx8 = vld1q_u16(srcuv + x);
-+            if (in_depth == 10) {
-+                // shift to low10bits for 10bit input
-+                // shift bit has to be compile-time constant
-+                y0x8 = vshrq_n_u16(y0x8, 6);
-+                y1x8 = vshrq_n_u16(y1x8, 6);
-+                uvx8 = vshrq_n_u16(uvx8, 6);
-+            }
++            ux4 = vld1_u16(srcu + (x >> 1));
++            vx4 = vld1_u16(srcv + (x >> 1));
++
 +            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
 +            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
-+            uvx8 = vsubq_u16(uvx8, in_uv_offx8);
-+
-+            ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2);
-+            vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2);
-+            ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2);
-+            vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2);
-+
-+            ux8 = vcombine_u16(ux2a, ux2b);
-+            vx8 = vcombine_u16(vx2a, vx2b);
++            ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4));
++            ux8 = vsubq_u16(ux8, in_uv_offx8);
++            vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4));
++            vx8 = vsubq_u16(vx8, in_uv_offx8);
 +
 +            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
 +            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
@@ -780,17 +1089,14 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            uox4 = vmlaq_n_s32(uox4, bavgx4, cburv);
 +            uox4 = vshrq_n_s32(uox4, 21);
 +            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
++            vst1_lane_u32((uint32_t *) &dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0);
 +
 +            vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv);
 +            vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv);
 +            vox4 = vmlaq_n_s32(vox4, bavgx4, cbv);
 +            vox4 = vshrq_n_s32(vox4, 21);
 +            vox4 = vaddq_s32(vox4, out_uv_offsetx4);
-+
-+            uvoax4 = vzip1q_s32(uox4, vox4);
-+            uvobx4 = vzip2q_s32(uox4, vox4);
-+
-+            vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4))));
++            vst1_lane_u32((uint32_t *) &dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0);
 +        }
 +    }
 +
@@ -798,32 +1104,32 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    if (remainw) {
 +        int offset = width & (int)0xfffffff8;
 +        rdsty += offset;
-+        rdstuv += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
 +        rsrcy += offset;
-+        rsrcuv += offset;
-+        tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv,
-+                                       rsrcy, rsrcuv,
-+                                       dstlinesize, srclinesize,
-+                                       dstdepth, srcdepth,
-+                                       remainw, rheight, params);
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv,
++                                    rsrcy, rsrcu, rsrcv,
++                                    dstlinesize, srclinesize,
++                                    dstdepth, srcdepth,
++                                    remainw, rheight, params);
 +    }
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
 +
-+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                        const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                        const int *dstlinesize, const int *srclinesize,
-+                                        int dstdepth, int srcdepth,
-+                                        int width, int height,
-+                                        const struct TonemapIntParams *params)
++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv,
++                                         const uint16_t *srcy, const uint16_t *srcuv,
++                                         const int *dstlinesize, const int *srclinesize,
++                                         int dstdepth, int srcdepth,
++                                         int width, int height,
++                                         const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
-+    uint16_t *rdsty = dsty;
-+    uint16_t *rdstu = dstu;
-+    uint16_t *rdstv = dstv;
++    uint8_t *rdsty = dsty;
++    uint8_t *rdstuv = dstuv;
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcu = srcu;
-+    const uint16_t *rsrcv = srcv;
++    const uint16_t *rsrcuv = srcuv;
 +    int rheight = height;
 +    // not zero when not divisible by 8
 +    // intentionally leave last pixel emtpy when input is odd
@@ -865,20 +1171,22 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
 +    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
 +    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
-+    uint16x4_t ux4, vx4;
++    uint16x8_t uvx8;
++    uint16x4_t ux2a, vx2a, ux2b, vx2b;
 +    uint16x8_t y0x8, y1x8, ux8, vx8;
 +    uint16x8_t r0x8, g0x8, b0x8;
 +    uint16x8_t r1x8, g1x8, b1x8;
 +
 +    int16x8_t r0ox8, g0ox8, b0ox8;
-+    uint16x8_t y0ox8;
++    int16x8_t y0ox8;
 +    int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4;
 +    int32x4_t y0oax4, y0obx4;
 +
 +    int16x8_t r1ox8, g1ox8, b1ox8;
-+    uint16x8_t y1ox8;
++    int16x8_t y1ox8;
 +    int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
 +    int32x4_t y1oax4, y1obx4;
++    int32x4_t uvoax4, uvobx4;
 +    int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
 +    int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
 +    int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
@@ -886,22 +1194,32 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
 +    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
 +    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
++                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
 +        for (int xx = 0; xx < width >> 3; xx++) {
 +            int x = xx << 3;
 +
 +            y0x8 = vld1q_u16(srcy + x);
 +            y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x));
-+            ux4 = vld1_u16(srcu + (x >> 1));
-+            vx4 = vld1_u16(srcv + (x >> 1));
++            uvx8 = vld1q_u16(srcuv + x);
++            if (in_depth == 10) {
++                // shift down to the low 10 bits for 10-bit input
++                // the shift amount has to be a compile-time constant
++                y0x8 = vshrq_n_u16(y0x8, 6);
++                y1x8 = vshrq_n_u16(y1x8, 6);
++                uvx8 = vshrq_n_u16(uvx8, 6);
++            }
 +            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
 +            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
++            uvx8 = vsubq_u16(uvx8, in_uv_offx8);
 +
-+            ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4));
-+            ux8 = vsubq_u16(ux8, in_uv_offx8);
-+            vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4));
-+            vx8 = vsubq_u16(vx8, in_uv_offx8);
++            ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2);
++            vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2);
++            ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2);
++            vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2);
++
++            ux8 = vcombine_u16(ux2a, ux2b);
++            vx8 = vcombine_u16(vx2a, vx2b);
 +
 +            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
 +            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
@@ -950,18 +1268,19 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy);
 +            y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby);
 +            y0oax4 = vaddq_s32(y0oax4, out_rndx4);
-+            y0oax4 = vshrq_n_s32(y0oax4, 19);
++            // the output shift for 8-bit output is 29 - 8 = 21 bits
++            y0oax4 = vshrq_n_s32(y0oax4, 21);
 +            y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
 +
 +            y0obx4 = vmulq_n_s32(r0obx4, cry);
 +            y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy);
 +            y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby);
 +            y0obx4 = vaddq_s32(y0obx4, out_rndx4);
-+            y0obx4 = vshrq_n_s32(y0obx4, 19);
++            y0obx4 = vshrq_n_s32(y0obx4, 21);
 +            y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
 +
-+            y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
-+            vst1q_u16(&dsty[x], y0ox8);
++            y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4));
++            vst1_u8(&dsty[x], vqmovun_s16(y0ox8));
 +
 +            r1ox8 = vld1q_s16(r1);
 +            g1ox8 = vld1q_s16(g1);
@@ -979,18 +1298,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy);
 +            y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby);
 +            y1oax4 = vaddq_s32(y1oax4, out_rndx4);
-+            y1oax4 = vshrq_n_s32(y1oax4, 19);
++            y1oax4 = vshrq_n_s32(y1oax4, 21);
 +            y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
 +
 +            y1obx4 = vmulq_n_s32(r1obx4, cry);
 +            y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy);
 +            y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby);
 +            y1obx4 = vaddq_s32(y1obx4, out_rndx4);
-+            y1obx4 = vshrq_n_s32(y1obx4, 19);
++            y1obx4 = vshrq_n_s32(y1obx4, 21);
 +            y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
 +
-+            y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
-+            vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
++            y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4));
++            vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8));
 +
 +            ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4));
 +            ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4));
@@ -1022,16 +1341,19 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru);
 +            uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu);
 +            uox4 = vmlaq_n_s32(uox4, bavgx4, cburv);
-+            uox4 = vshrq_n_s32(uox4, 19);
++            uox4 = vshrq_n_s32(uox4, 21);
 +            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
-+            vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4));
 +
 +            vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv);
 +            vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv);
 +            vox4 = vmlaq_n_s32(vox4, bavgx4, cbv);
-+            vox4 = vshrq_n_s32(vox4, 19);
++            vox4 = vshrq_n_s32(vox4, 21);
 +            vox4 = vaddq_s32(vox4, out_uv_offsetx4);
-+            vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4));
++
++            uvoax4 = vzip1q_s32(uox4, vox4);
++            uvobx4 = vzip2q_s32(uox4, vox4);
++
++            vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4))));
 +        }
 +    }
 +
@@ -1039,53 +1361,44 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    if (remainw) {
 +        int offset = width & (int)0xfffffff8;
 +        rdsty += offset;
-+        rdstu += offset >> 1;
-+        rdstv += offset >> 1;
++        rdstuv += offset;
 +        rsrcy += offset;
-+        rsrcu += offset >> 1;
-+        rsrcv += offset >> 1;
-+        tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv,
-+                                      rsrcy, rsrcu, rsrcv,
-+                                      dstlinesize, srclinesize,
-+                                      dstdepth, srcdepth,
-+                                      remainw, rheight, params);
++        rsrcuv += offset;
++        tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv,
++                                       rsrcy, rsrcuv,
++                                       dstlinesize, srclinesize,
++                                       dstdepth, srcdepth,
++                                       remainw, rheight, params);
 +    }
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
 +
-+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv,
-+                                              const uint16_t *srcy, const uint16_t *srcuv,
-+                                              const int *dstlinesize, const int *srclinesize,
-+                                              int dstdepth, int srcdepth,
-+                                              int width, int height,
-+                                              const struct TonemapIntParams *params)
++void tonemap_frame_dovi_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                      const int *dstlinesize, const int *srclinesize,
++                                      int dstdepth, int srcdepth,
++                                      int width, int height,
++                                      const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
 +    uint16_t *rdsty = dsty;
-+    uint16_t *rdstuv = dstuv;
++    uint16_t *rdstu = dstu;
++    uint16_t *rdstv = dstv;
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcuv = srcuv;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
 +    int rheight = height;
 +    // not zero when not divisible by 8
 +    // intentionally leave last pixel emtpy when input is odd
 +    int remainw = width & 6;
 +
 +    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
++    const float in_rng = (float)((1 << in_depth) - 1);
 +
 +    const int out_depth = dstdepth;
 +    const int out_uv_offset = 128 << (out_depth - 8);
 +    const int out_sh = 29 - out_depth;
 +    const int out_rnd = 1 << (out_sh - 1);
-+    const int out_sh2 = 16 - out_depth;
-+
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
 +
 +    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
 +    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
@@ -1098,17 +1411,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +
 +    int16_t r[8], g[8], b[8];
 +    int16_t r1[8], g1[8], b1[8];
-+    uint16_t cy_shifted = av_clip_int16(cy >> in_sh);
-+    uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh);
-+    uint16_t crv_shifted = av_clip_int16(crv >> in_sh);
-+    uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh);
-+    uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh);
-+    uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh);
-+    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
-+    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
-+    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
-+    uint16x8_t uvx8;
-+    uint16x4_t ux2a, vx2a, ux2b, vx2b;
++    uint16x4_t ux4, vx4;
 +    uint16x8_t y0x8, y1x8, ux8, vx8;
 +    uint16x8_t r0x8, g0x8, b0x8;
 +    uint16x8_t r1x8, g1x8, b1x8;
@@ -1122,63 +1425,182 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    uint16x8_t y1ox8;
 +    int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
 +    int32x4_t y1oax4, y1obx4;
-+    int32x4_t uvoax4, uvobx4;
 +    int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
 +    int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
 +    int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
 +    int32x4_t out_rndx4 = vdupq_n_s32(out_rnd);
-+    int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2);
 +    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
 +    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
++    float32x4_t ipt0, ipt1, ipt2, ipt3;
++    float32x4_t ia1, ib1, ia2, ib2;
++    float32x4_t ix4, px4, tx4;
++    float32x4_t lx4, mx4, sx4;
++    float32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b;
++    float32x4_t y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b;
 +    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
 +        for (int xx = 0; xx < width >> 3; xx++) {
 +            int x = xx << 3;
 +
 +            y0x8 = vld1q_u16(srcy + x);
 +            y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x));
-+            uvx8 = vld1q_u16(srcuv + x);
-+            if (in_depth == 10) {
-+                // shift to low10bits for 10bit input
-+                // shift bit has to be compile-time constant
-+                y0x8 = vshrq_n_u16(y0x8, 6);
-+                y1x8 = vshrq_n_u16(y1x8, 6);
-+                uvx8 = vshrq_n_u16(uvx8, 6);
-+            }
-+            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
-+            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
-+            uvx8 = vsubq_u16(uvx8, in_uv_offx8);
-+
-+            ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2);
-+            vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2);
-+            ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2);
-+            vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2);
-+
-+            ux8 = vcombine_u16(ux2a, ux2b);
-+            vx8 = vcombine_u16(vx2a, vx2b);
-+
-+            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
-+            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
-+            r0x8 = vaddq_u16(r0x8, rndx8);
-+
-+            g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted);
-+            g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted);
-+            g0x8 = vaddq_u16(g0x8, rndx8);
-+
-+            b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted);
-+            b0x8 = vaddq_u16(b0x8, rndx8);
-+
-+            r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted);
-+            r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted);
-+            r1x8 = vaddq_u16(r1x8, rndx8);
++            ux4 = vld1_u16(srcu + (x >> 1));
++            vx4 = vld1_u16(srcv + (x >> 1));
 +
-+            g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted);
-+            g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted);
-+            g1x8 = vaddq_u16(g1x8, rndx8);
++            ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4));
++            vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4));
 +
-+            b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted);
-+            b1x8 = vaddq_u16(b1x8, rndx8);
++            y0x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y0x8)));
++            y0x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y0x8)));
++            y1x4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(y1x8)));
++            y1x4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(y1x8)));
++
++            ux4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(ux8)));
++            ux4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(ux8)));
++            vx4a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vx8)));
++            vx4b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vx8)));
++
++            y0x4a = vdivq_f32(y0x4a, vdupq_n_f32(in_rng));
++            y0x4b = vdivq_f32(y0x4b, vdupq_n_f32(in_rng));
++            y1x4a = vdivq_f32(y1x4a, vdupq_n_f32(in_rng));
++            y1x4b = vdivq_f32(y1x4b, vdupq_n_f32(in_rng));
++            ux4a = vdivq_f32(ux4a, vdupq_n_f32(in_rng));
++            ux4b = vdivq_f32(ux4b, vdupq_n_f32(in_rng));
++            vx4a = vdivq_f32(vx4a, vdupq_n_f32(in_rng));
++            vx4b = vdivq_f32(vx4b, vdupq_n_f32(in_rng));
++
++            // Reshape y0x4a
++            ia1 = vzip1q_f32(y0x4a, ux4a);
++            ia2 = vzip2q_f32(y0x4a, ux4a);
++            ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = vmulq_n_f32(rx4a, 28672.0f);
++            gx4a = vmulq_n_f32(gx4a, 28672.0f);
++            bx4a = vmulq_n_f32(bx4a, 28672.0f);
++
++            // Reshape y0x4b
++            ia1 = vzip1q_f32(y0x4b, ux4b);
++            ia2 = vzip2q_f32(y0x4b, ux4b);
++            ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = vmulq_n_f32(rx4b, 28672.0f);
++            gx4b = vmulq_n_f32(gx4b, 28672.0f);
++            bx4b = vmulq_n_f32(bx4b, 28672.0f);
++
++            r0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b)));
++            g0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b)));
++            b0x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b)));
++
++            // Reshape y1x4a
++            ia1 = vzip1q_f32(y1x4a, ux4a);
++            ia2 = vzip2q_f32(y1x4a, ux4a);
++            ib1 = vzip1q_f32(vx4a, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4a, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = vmulq_n_f32(rx4a, 28672.0f);
++            gx4a = vmulq_n_f32(gx4a, 28672.0f);
++            bx4a = vmulq_n_f32(bx4a, 28672.0f);
++
++            // Reshape y1x4b
++            ia1 = vzip1q_f32(y1x4b, ux4b);
++            ia2 = vzip2q_f32(y1x4b, ux4b);
++            ib1 = vzip1q_f32(vx4b, vdupq_n_f32(0.0f));
++            ib2 = vzip2q_f32(vx4b, vdupq_n_f32(0.0f));
++            ipt0 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ib1));
++            ipt1 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ib1));
++            ipt2 = vcombine_f32(vget_low_f32(ia2), vget_low_f32(ib2));
++            ipt3 = vcombine_f32(vget_high_f32(ia2), vget_high_f32(ib2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ia1 = vtrn1q_f32(ipt0, ipt1);
++            ia2 = vtrn1q_f32(ipt2, ipt3);
++            ib1 = vtrn2q_f32(ipt0, ipt1);
++            ib2 = vtrn2q_f32(ipt2, ipt3);
++
++            ix4 = vcombine_f32(vget_low_f32(ia1), vget_low_f32(ia2));
++            px4 = vcombine_f32(vget_low_f32(ib1), vget_low_f32(ib2));
++            tx4 = vcombine_f32(vget_high_f32(ia1), vget_high_f32(ia2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = vmulq_n_f32(rx4b, 28672.0f);
++            gx4b = vmulq_n_f32(gx4b, 28672.0f);
++            bx4b = vmulq_n_f32(bx4b, 28672.0f);
++
++            r1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(rx4a)), vqmovn_u32(vcvtq_u32_f32(rx4b)));
++            g1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(gx4a)), vqmovn_u32(vcvtq_u32_f32(gx4b)));
++            b1x8 = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(bx4a)), vqmovn_u32(vcvtq_u32_f32(bx4b)));
 +
 +            tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b,
 +                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
@@ -1205,11 +1627,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy);
 +            y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby);
 +            y0oax4 = vaddq_s32(y0oax4, out_rndx4);
++            y0oax4 = vshrq_n_s32(y0oax4, 19);
++            y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
 +
 +            y0obx4 = vmulq_n_s32(r0obx4, cry);
 +            y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy);
 +            y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby);
 +            y0obx4 = vaddq_s32(y0obx4, out_rndx4);
++            y0obx4 = vshrq_n_s32(y0obx4, 19);
++            y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
++
++            y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
++            vst1q_u16(&dsty[x], y0ox8);
 +
 +            r1ox8 = vld1q_s16(r1);
 +            g1ox8 = vld1q_s16(g1);
@@ -1227,11 +1656,18 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy);
 +            y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby);
 +            y1oax4 = vaddq_s32(y1oax4, out_rndx4);
++            y1oax4 = vshrq_n_s32(y1oax4, 19);
++            y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
 +
 +            y1obx4 = vmulq_n_s32(r1obx4, cry);
 +            y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy);
 +            y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby);
 +            y1obx4 = vaddq_s32(y1obx4, out_rndx4);
++            y1obx4 = vshrq_n_s32(y1obx4, 19);
++            y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
++
++            y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
++            vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
 +
 +            ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4));
 +            ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4));
@@ -1263,50 +1699,16 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +            uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru);
 +            uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu);
 +            uox4 = vmlaq_n_s32(uox4, bavgx4, cburv);
++            uox4 = vshrq_n_s32(uox4, 19);
++            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
++            vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4));
 +
 +            vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv);
 +            vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv);
 +            vox4 = vmlaq_n_s32(vox4, bavgx4, cbv);
-+
-+            switch(out_depth) {
-+                default:
-+                case 10:
-+                    y0oax4 = vshrq_n_s32(y0oax4, 19);
-+                    y0obx4 = vshrq_n_s32(y0obx4, 19);
-+                    y1oax4 = vshrq_n_s32(y1oax4, 19);
-+                    y1obx4 = vshrq_n_s32(y1obx4, 19);
-+                    uox4 = vshrq_n_s32(uox4, 19);
-+                    vox4 = vshrq_n_s32(vox4, 19);
-+                    break;
-+                case 16:
-+                    y0oax4 = vshrq_n_s32(y0oax4, 13);
-+                    y0obx4 = vshrq_n_s32(y0obx4, 13);
-+                    y1oax4 = vshrq_n_s32(y1oax4, 13);
-+                    y1obx4 = vshrq_n_s32(y1obx4, 13);
-+                    uox4 = vshrq_n_s32(uox4, 13);
-+                    vox4 = vshrq_n_s32(vox4, 13);
-+                    break;
-+            }
-+
-+            y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
-+            y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
-+            y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
-+            y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
-+            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
++            vox4 = vshrq_n_s32(vox4, 19);
 +            vox4 = vaddq_s32(vox4, out_uv_offsetx4);
-+
-+            y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
-+            y0ox8 = vshlq_u16(y0ox8, out_sh2x8);
-+            vst1q_u16(&dsty[x], y0ox8);
-+
-+            y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
-+            y1ox8 = vshlq_u16(y1ox8, out_sh2x8);
-+            vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
-+
-+            uvoax4 = vzip1q_s32(uox4, vox4);
-+            uvobx4 = vzip2q_s32(uox4, vox4);
-+
-+            vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8));
++            vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4));
 +        }
 +    }
 +
@@ -1314,1398 +1716,2446 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
 +    if (remainw) {
 +        int offset = width & (int)0xfffffff8;
 +        rdsty += offset;
-+        rdstuv += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
 +        rsrcy += offset;
-+        rsrcuv += offset;
-+        tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv,
-+                                            rsrcy, rsrcuv,
-+                                            dstlinesize, srclinesize,
-+                                            dstdepth, srcdepth,
-+                                            remainw, rheight, params);
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv,
++                                      rsrcy, rsrcu, rsrcv,
++                                      dstlinesize, srclinesize,
++                                      dstdepth, srcdepth,
++                                      remainw, rheight, params);
 +    }
 +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
-Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h
-@@ -0,0 +1,54 @@
-+/*
-+ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
-+#define AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
-+
-+#include "libavfilter/vf_tonemapx.h"
-+
-+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                      const int *dstlinesize, const int *srclinesize,
-+                                      int dstdepth, int srcdepth,
-+                                      int width, int height,
-+                                      const struct TonemapIntParams *params);
-+
-+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv,
-+                                         const uint16_t *srcy, const uint16_t *srcuv,
-+                                         const int *dstlinesize, const int *srclinesize,
-+                                         int dstdepth, int srcdepth,
-+                                         int width, int height,
-+                                         const struct TonemapIntParams *params);
 +
 +void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
 +                                        const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
 +                                        const int *dstlinesize, const int *srclinesize,
 +                                        int dstdepth, int srcdepth,
 +                                        int width, int height,
-+                                        const struct TonemapIntParams *params);
++                                        const struct TonemapIntParams *params)
++{
++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
++    uint16_t *rdsty = dsty;
++    uint16_t *rdstu = dstu;
++    uint16_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height;
++    // not zero when not divisible by 8
++    // intentionally leave last pixel empty when input is odd
++    int remainw = width & 6;
 +
-+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv,
-+                                              const uint16_t *srcy, const uint16_t *srcuv,
-+                                              const int *dstlinesize, const int *srclinesize,
-+                                              int dstdepth, int srcdepth,
-+                                              int width, int height,
-+                                              const struct TonemapIntParams *params);
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
 +
-+#endif // AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
-Index: FFmpeg/libavfilter/allfilters.c
-===================================================================
---- FFmpeg.orig/libavfilter/allfilters.c
-+++ FFmpeg/libavfilter/allfilters.c
-@@ -498,6 +498,7 @@ extern const AVFilter ff_vf_tmedian;
- extern const AVFilter ff_vf_tmidequalizer;
- extern const AVFilter ff_vf_tmix;
- extern const AVFilter ff_vf_tonemap;
-+extern const AVFilter ff_vf_tonemapx;
- extern const AVFilter ff_vf_tonemap_cuda;
- extern const AVFilter ff_vf_tonemap_opencl;
- extern const AVFilter ff_vf_tonemap_vaapi;
-Index: FFmpeg/libavfilter/colorspace.c
-===================================================================
---- FFmpeg.orig/libavfilter/colorspace.c
-+++ FFmpeg/libavfilter/colorspace.c
-@@ -17,6 +17,7 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include "libavutil/avassert.h"
- #include "libavutil/frame.h"
- #include "libavutil/mastering_display_metadata.h"
- #include "libavutil/pixdesc.h"
-@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) {
- float inverse_eotf_bt1886(float x) {
-     return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f;
- }
-+
-+int ff_get_range_off(int *off, int *y_rng, int *uv_rng,
-+                     enum AVColorRange rng, int depth)
-+{
-+    switch (rng) {
-+    case AVCOL_RANGE_UNSPECIFIED:
-+    case AVCOL_RANGE_MPEG:
-+        *off = 16 << (depth - 8);
-+        *y_rng = 219 << (depth - 8);
-+        *uv_rng = 224 << (depth - 8);
-+        break;
-+    case AVCOL_RANGE_JPEG:
-+        *off = 0;
-+        *y_rng = *uv_rng = (256 << (depth - 8)) - 1;
-+        break;
-+    default:
-+        return AVERROR(EINVAL);
-+    }
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
 +
-+    return 0;
-+}
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
 +
-+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3],
-+                       int depth, int y_rng, int uv_rng, int yuv2rgb)
-+{
-+#define N (yuv2rgb ? m : n)
-+#define M (yuv2rgb ? n : m)
-+    int rng, n, m, o;
-+    int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth));
-+    for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) {
-+        for (m = 0; m < 3; m++) {
-+            out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672));
-+            for (o = 1; o < 8; o++)
-+                out[N][M][o] = out[N][M][0];
-+        }
-+    }
-+#undef N
-+#undef M
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
 +
-+    if (yuv2rgb) {
-+        av_assert2(out[0][1][0] == 0);
-+        av_assert2(out[2][2][0] == 0);
-+        av_assert2(out[0][0][0] == out[1][0][0]);
-+        av_assert2(out[0][0][0] == out[2][0][0]);
-+    } else {
-+        av_assert2(out[1][2][0] == out[2][0][0]);
-+    }
-+}
-Index: FFmpeg/libavfilter/colorspace.h
-===================================================================
---- FFmpeg.orig/libavfilter/colorspace.h
-+++ FFmpeg/libavfilter/colorspace.h
-@@ -85,4 +85,8 @@ float eotf_arib_b67(float x);
- float inverse_eotf_arib_b67(float x);
- float inverse_eotf_bt1886(float x);
- 
-+int ff_get_range_off(int *off, int *y_rng, int *uv_rng,
-+                     enum AVColorRange rng, int depth);
-+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3],
-+                       int depth, int y_rng, int uv_rng, int yuv2rgb);
- #endif
-Index: FFmpeg/libavfilter/vf_tonemapx.c
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/vf_tonemapx.c
-@@ -0,0 +1,1261 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
++    int16_t r[8], g[8], b[8];
++    int16_t r1[8], g1[8], b1[8];
++    uint16_t cy_shifted = av_clip_int16(cy >> in_sh);
++    uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh);
++    uint16_t crv_shifted = av_clip_int16(crv >> in_sh);
++    uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh);
++    uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh);
++    uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh);
++    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
++    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
++    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
++    uint16x4_t ux4, vx4;
++    uint16x8_t y0x8, y1x8, ux8, vx8;
++    uint16x8_t r0x8, g0x8, b0x8;
++    uint16x8_t r1x8, g1x8, b1x8;
 +
-+/**
-+ * @file
-+ * tonemap algorithms
-+ */
++    int16x8_t r0ox8, g0ox8, b0ox8;
++    uint16x8_t y0ox8;
++    int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4;
++    int32x4_t y0oax4, y0obx4;
 +
-+#include <float.h>
-+#include <string.h>
++    int16x8_t r1ox8, g1ox8, b1ox8;
++    uint16x8_t y1ox8;
++    int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
++    int32x4_t y1oax4, y1obx4;
++    int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
++    int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
++    int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
++    int32x4_t out_rndx4 = vdupq_n_s32(out_rnd);
++    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
++    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
++        for (int xx = 0; xx < width >> 3; xx++) {
++            int x = xx << 3;
 +
-+#include "libavutil/avassert.h"
-+#include "libavutil/imgutils.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/mem_internal.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/cpu.h"
++            y0x8 = vld1q_u16(srcy + x);
++            y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x));
++            ux4 = vld1_u16(srcu + (x >> 1));
++            vx4 = vld1_u16(srcv + (x >> 1));
++            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
++            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
 +
-+#include "vf_tonemapx.h"
++            ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4));
++            ux8 = vsubq_u16(ux8, in_uv_offx8);
++            vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4));
++            vx8 = vsubq_u16(vx8, in_uv_offx8);
 +
-+#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS
-+#    if ARCH_AARCH64
-+#        if HAVE_INTRINSICS_NEON
-+#            include "libavutil/aarch64/cpu.h"
-+#            include "aarch64/vf_tonemapx_intrin_neon.h"
-+#        endif
-+#    endif // ARCH_AARCH64
-+#    if ARCH_X86
-+#        include "libavutil/x86/cpu.h"
-+#        if HAVE_INTRINSICS_SSE42
-+#            include "x86/vf_tonemapx_intrin_sse.h"
-+#        endif
-+#        if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3
-+#            include "x86/vf_tonemapx_intrin_avx.h"
-+#        endif
-+#    endif // ARCH_X86
-+#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS
++            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
++            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
++            r0x8 = vaddq_u16(r0x8, rndx8);
 +
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
++            g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted);
++            g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted);
++            g0x8 = vaddq_u16(g0x8, rndx8);
 +
-+enum TonemapAlgorithm {
-+    TONEMAP_NONE,
-+    TONEMAP_LINEAR,
-+    TONEMAP_GAMMA,
-+    TONEMAP_CLIP,
-+    TONEMAP_REINHARD,
-+    TONEMAP_HABLE,
-+    TONEMAP_MOBIUS,
-+    TONEMAP_BT2390,
-+    TONEMAP_MAX,
-+};
++            b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted);
++            b0x8 = vaddq_u16(b0x8, rndx8);
 +
-+typedef struct TonemapxContext {
-+    const AVClass *class;
++            r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted);
++            r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted);
++            r1x8 = vaddq_u16(r1x8, rndx8);
 +
-+    enum TonemapAlgorithm tonemap;
-+    enum AVColorTransferCharacteristic trc;
-+    enum AVColorSpace spc;
-+    enum AVColorPrimaries pri;
-+    enum AVColorRange range;
-+    enum AVPixelFormat format;
-+    char *format_str;
-+    double param;
-+    double desat;
-+    double peak;
++            g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted);
++            g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted);
++            g1x8 = vaddq_u16(g1x8, rndx8);
 +
-+    const AVLumaCoefficients *coeffs, *ocoeffs;
++            b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted);
++            b1x8 = vaddq_u16(b1x8, rndx8);
 +
-+    double lut_peak;
-+    float *lin_lut;
-+    float *tonemap_lut;
-+    uint16_t *delin_lut;
-+    int in_yuv_off, out_yuv_off;
++            tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b,
++                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                 params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                 params->rgb2rgb_passthrough);
++            tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1,
++                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                 params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                 params->rgb2rgb_passthrough);
 +
-+    DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8];
-+    DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8];
-+    DECLARE_ALIGNED(16, double,  rgb2rgb_coeffs)[3][3];
++            r0ox8 = vld1q_s16(r);
++            g0ox8 = vld1q_s16(g);
++            b0ox8 = vld1q_s16(b);
 +
-+    int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
++            r0oax4 = vmovl_s16(vget_low_s16(r0ox8));
++            g0oax4 = vmovl_s16(vget_low_s16(g0ox8));
++            b0oax4 = vmovl_s16(vget_low_s16(b0ox8));
 +
-+    void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv,
-+                                    const uint16_t *srcy, const uint16_t *srcuv,
-+                                    const int *dstlinesize, const int *srclinesize,
-+                                    int dstdepth, int srcdepth,
-+                                    int width, int height,
-+                                    const struct TonemapIntParams *params);
++            r0obx4 = vmovl_s16(vget_high_s16(r0ox8));
++            g0obx4 = vmovl_s16(vget_high_s16(g0ox8));
++            b0obx4 = vmovl_s16(vget_high_s16(b0ox8));
 +
-+    void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                  const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                  const int *dstlinesize, const int *srclinesize,
-+                                  int dstdepth, int srcdepth,
-+                                  int width, int height,
-+                                  const struct TonemapIntParams *params);
-+
-+    void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv,
-+                                     const uint16_t *srcy, const uint16_t *srcuv,
-+                                     const int *dstlinesize, const int *srclinesize,
-+                                     int dstdepth, int srcdepth,
-+                                     int width, int height,
-+                                     const struct TonemapIntParams *params);
-+
-+    void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                   const int *dstlinesize, const int *srclinesize,
-+                                   int dstdepth, int srcdepth,
-+                                   int width, int height,
-+                                   const struct TonemapIntParams *params);
-+} TonemapxContext;
-+
-+typedef struct ThreadData {
-+    AVFrame *in, *out;
-+    const AVPixFmtDescriptor *desc, *odesc;
-+    double peak;
-+} ThreadData;
-+
-+static const enum AVPixelFormat in_pix_fmts[] = {
-+    AV_PIX_FMT_YUV420P10,
-+    AV_PIX_FMT_P010,
-+    AV_PIX_FMT_P016,
-+    AV_PIX_FMT_NONE,
-+};
++            y0oax4 = vmulq_n_s32(r0oax4, cry);
++            y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy);
++            y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby);
++            y0oax4 = vaddq_s32(y0oax4, out_rndx4);
++            y0oax4 = vshrq_n_s32(y0oax4, 19);
++            y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
 +
-+static const enum AVPixelFormat out_pix_fmts[] = {
-+    AV_PIX_FMT_YUV420P,
-+    AV_PIX_FMT_YUV420P10,
-+    AV_PIX_FMT_NV12,
-+    AV_PIX_FMT_P010,
-+    AV_PIX_FMT_P016,
-+};
++            y0obx4 = vmulq_n_s32(r0obx4, cry);
++            y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy);
++            y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby);
++            y0obx4 = vaddq_s32(y0obx4, out_rndx4);
++            y0obx4 = vshrq_n_s32(y0obx4, 19);
++            y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
 +
-+static int out_format_is_supported(enum AVPixelFormat fmt)
-+{
-+    int i;
++            y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
++            vst1q_u16(&dsty[x], y0ox8);
 +
-+    for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++)
-+        if (out_pix_fmts[i] == fmt)
-+            return 1;
-+    return 0;
-+}
++            r1ox8 = vld1q_s16(r1);
++            g1ox8 = vld1q_s16(g1);
++            b1ox8 = vld1q_s16(b1);
 +
-+static float hable(float in)
-+{
-+    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
-+    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
-+}
++            r1oax4 = vmovl_s16(vget_low_s16(r1ox8));
++            g1oax4 = vmovl_s16(vget_low_s16(g1ox8));
++            b1oax4 = vmovl_s16(vget_low_s16(b1ox8));
 +
-+static float mobius(float in, float j, double peak)
-+{
-+    float a, b;
++            r1obx4 = vmovl_s16(vget_high_s16(r1ox8));
++            g1obx4 = vmovl_s16(vget_high_s16(g1ox8));
++            b1obx4 = vmovl_s16(vget_high_s16(b1ox8));
 +
-+    if (in <= j)
-+        return in;
++            y1oax4 = vmulq_n_s32(r1oax4, cry);
++            y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy);
++            y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby);
++            y1oax4 = vaddq_s32(y1oax4, out_rndx4);
++            y1oax4 = vshrq_n_s32(y1oax4, 19);
++            y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
 +
-+    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
-+    b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS);
++            y1obx4 = vmulq_n_s32(r1obx4, cry);
++            y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy);
++            y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby);
++            y1obx4 = vaddq_s32(y1obx4, out_rndx4);
++            y1obx4 = vshrq_n_s32(y1obx4, 19);
++            y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
 +
-+    return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b);
-+}
++            y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
++            vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
 +
-+static float bt2390(float s, float peak)
-+{
-+    float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT);
-+    float scale = 1.0f / peak_pq;
++            ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4));
++            ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4));
++            ravgx4 = vcombine_s32(ravgax2, ravgbx2);
++            ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4));
++            ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4));
++            ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2));
++            ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4);
++            ravgx4 = vshrq_n_s32(ravgx4, 2);
 +
-+    // SDR peak
-+    float dst_peak = 1.0f;
-+    float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale;
-+    float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale;
++            gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4));
++            gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4));
++            gavgx4 = vcombine_s32(gavgax2, gavgbx2);
++            gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4));
++            gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4));
++            gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2));
++            gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4);
++            gavgx4 = vshrq_n_s32(gavgx4, 2);
 +
-+    float ks = 1.5f * maxLum - 0.5f;
-+    float tb = (s_pq - ks) / (1.0f - ks);
-+    float tb2 = tb * tb;
-+    float tb3 = tb2 * tb;
-+    float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks +
-+               (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) +
-+               (-2.0f * tb3 + 3.0f * tb2) * maxLum;
-+    float sig = (s_pq < ks) ? s_pq : pb;
++            bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4));
++            bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4));
++            bavgx4 = vcombine_s32(bavgax2, bavgbx2);
++            bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4));
++            bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4));
++            bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2));
++            bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4);
++            bavgx4 = vshrq_n_s32(bavgx4, 2);
 +
-+    return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT);
-+}
++            uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru);
++            uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu);
++            uox4 = vmlaq_n_s32(uox4, bavgx4, cburv);
++            uox4 = vshrq_n_s32(uox4, 19);
++            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
++            vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4));
 +
-+static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param)
-+{
-+    switch(alg) {
-+    default:
-+    case TONEMAP_NONE:
-+        // do nothing
-+        break;
-+    case TONEMAP_LINEAR:
-+        sig = sig * param / peak;
-+        break;
-+    case TONEMAP_GAMMA:
-+        sig = sig > 0.05f
-+              ? pow(sig / peak, 1.0f / param)
-+              : sig * pow(0.05f / peak, 1.0f / param) / 0.05f;
-+        break;
-+    case TONEMAP_CLIP:
-+        sig = av_clipf(sig * param, 0, 1.0f);
-+        break;
-+    case TONEMAP_HABLE:
-+        sig = hable(sig) / hable(peak);
-+        break;
-+    case TONEMAP_REINHARD:
-+        sig = sig / (sig + param) * (peak + param) / peak;
-+        break;
-+    case TONEMAP_MOBIUS:
-+        sig = mobius(sig, param, peak);
-+        break;
-+    case TONEMAP_BT2390:
-+        sig = bt2390(sig, peak);
-+        break;
++            vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv);
++            vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv);
++            vox4 = vmlaq_n_s32(vox4, bavgx4, cbv);
++            vox4 = vshrq_n_s32(vox4, 19);
++            vox4 = vaddq_s32(vox4, out_uv_offsetx4);
++            vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4));
++        }
 +    }
 +
-+    return sig;
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff8;
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv,
++                                      rsrcy, rsrcu, rsrcv,
++                                      dstlinesize, srclinesize,
++                                      dstdepth, srcdepth,
++                                      remainw, rheight, params);
++    }
++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
 +
-+static float linearize(float x, enum AVColorTransferCharacteristic trc_src)
++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv,
++                                              const uint16_t *srcy, const uint16_t *srcuv,
++                                              const int *dstlinesize, const int *srclinesize,
++                                              int dstdepth, int srcdepth,
++                                              int width, int height,
++                                              const struct TonemapIntParams *params)
 +{
-+    if (trc_src == AVCOL_TRC_SMPTE2084)
-+        return eotf_st2084(x, REFERENCE_WHITE_ALT);
-+    else if (trc_src == AVCOL_TRC_ARIB_STD_B67)
-+        return eotf_arib_b67(x);
-+    else
-+        return x;
-+}
++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
++    uint16_t *rdsty = dsty;
++    uint16_t *rdstuv = dstuv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcuv = srcuv;
++    int rheight = height;
++    // not zero when not divisible by 8
++    // intentionally leave last pixel empty when input is odd
++    int remainw = width & 6;
 +
-+static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst)
-+{
-+    if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10)
-+        return inverse_eotf_bt1886(x);
-+    else
-+        return x;
-+}
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
 +
-+static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src,
-+                            enum AVColorTransferCharacteristic trc_dst)
-+{
-+    int i;
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    const int out_sh2 = 16 - out_depth;
 +
-+    if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float))))
-+        return AVERROR(ENOMEM);
-+    if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t))))
-+        return AVERROR(ENOMEM);
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
 +
-+    for (i = 0; i < 32768; i++) {
-+        double v1 = (i - 2048.0f) / 28672.0f;
-+        double v2 = i / 32767.0f;
-+        s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0);
-+        s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f));
-+    }
-+
-+    return 0;
-+}
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
 +
-+static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src)
-+{
-+    int i;
-+    double peak = s->lut_peak;
++    int16_t r[8], g[8], b[8];
++    int16_t r1[8], g1[8], b1[8];
++    uint16_t cy_shifted = av_clip_int16(cy >> in_sh);
++    uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh);
++    uint16_t crv_shifted = av_clip_int16(crv >> in_sh);
++    uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh);
++    uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh);
++    uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh);
++    uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
++    uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
++    uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
++    uint16x8_t uvx8;
++    uint16x4_t ux2a, vx2a, ux2b, vx2b;
++    uint16x8_t y0x8, y1x8, ux8, vx8;
++    uint16x8_t r0x8, g0x8, b0x8;
++    uint16x8_t r1x8, g1x8, b1x8;
 +
-+    if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float))))
-+        return AVERROR(ENOMEM);
++    int16x8_t r0ox8, g0ox8, b0ox8;
++    uint16x8_t y0ox8;
++    int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4;
++    int32x4_t y0oax4, y0obx4;
 +
-+    for (i = 0; i < 32768; i++) {
-+        double v = (i - 2048.0f) / 28672.0f;
-+        double sig = linearize(v, trc_src);
-+        float mapped = mapsig(s->tonemap, sig, peak, s->param);
-+        s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? mapped / sig : 0.0f;
-+    }
++    int16x8_t r1ox8, g1ox8, b1ox8;
++    uint16x8_t y1ox8;
++    int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
++    int32x4_t y1oax4, y1obx4;
++    int32x4_t uvoax4, uvobx4;
++    int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
++    int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
++    int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
++    int32x4_t out_rndx4 = vdupq_n_s32(out_rnd);
++    int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2);
++    int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset);
++    int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2);
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
++        for (int xx = 0; xx < width >> 3; xx++) {
++            int x = xx << 3;
 +
-+    return 0;
-+}
++            y0x8 = vld1q_u16(srcy + x);
++            y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x));
++            uvx8 = vld1q_u16(srcuv + x);
++            if (in_depth == 10) {
++                // shift 10-bit input down into the low 10 bits
++                // the shift amount has to be a compile-time constant
++                y0x8 = vshrq_n_u16(y0x8, 6);
++                y1x8 = vshrq_n_u16(y1x8, 6);
++                uvx8 = vshrq_n_u16(uvx8, 6);
++            }
++            y0x8 = vsubq_u16(y0x8, in_yuv_offx8);
++            y1x8 = vsubq_u16(y1x8, in_yuv_offx8);
++            uvx8 = vsubq_u16(uvx8, in_uv_offx8);
 +
-+static int compute_yuv_coeffs(TonemapxContext *s,
-+                              const AVLumaCoefficients *coeffs,
-+                              const AVLumaCoefficients *ocoeffs,
-+                              const AVPixFmtDescriptor *idesc,
-+                              const AVPixFmtDescriptor *odesc,
-+                              enum AVColorRange irng,
-+                              enum AVColorRange orng)
-+{
-+    double rgb2yuv[3][3], yuv2rgb[3][3];
-+    int res;
-+    int y_rng, uv_rng;
++            ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2);
++            vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2);
++            ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2);
++            vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2);
 +
-+    res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng,
-+                           irng, idesc->comp[0].depth);
-+    if (res < 0) {
-+        av_log(s, AV_LOG_ERROR,
-+               "Unsupported input color range %d (%s)\n",
-+               irng, av_color_range_name(irng));
-+        return res;
-+    }
++            ux8 = vcombine_u16(ux2a, ux2b);
++            vx8 = vcombine_u16(vx2a, vx2b);
 +
-+    ff_fill_rgb2yuv_table(coeffs, rgb2yuv);
-+    ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
-+    ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv);
++            r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted);
++            r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted);
++            r0x8 = vaddq_u16(r0x8, rndx8);
 +
-+    ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth,
-+                      y_rng, uv_rng, 1);
++            g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted);
++            g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted);
++            g0x8 = vaddq_u16(g0x8, rndx8);
 +
-+    res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng,
-+                           orng, odesc->comp[0].depth);
-+    if (res < 0) {
-+        av_log(s, AV_LOG_ERROR,
-+               "Unsupported output color range %d (%s)\n",
-+               orng, av_color_range_name(orng));
-+        return res;
-+    }
++            b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted);
++            b0x8 = vaddq_u16(b0x8, rndx8);
 +
-+    ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth,
-+                      y_rng, uv_rng, 0);
++            r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted);
++            r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted);
++            r1x8 = vaddq_u16(r1x8, rndx8);
 +
-+    return 0;
-+}
++            g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted);
++            g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted);
++            g1x8 = vaddq_u16(g1x8, rndx8);
 +
-+static int compute_rgb_coeffs(TonemapxContext *s,
-+                              enum AVColorPrimaries iprm,
-+                              enum AVColorPrimaries oprm)
-+{
-+    double rgb2xyz[3][3], xyz2rgb[3][3];
-+    const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm);
-+    const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm);
++            b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted);
++            b1x8 = vaddq_u16(b1x8, rndx8);
 +
-+    if (!iprm_desc) {
-+        av_log(s, AV_LOG_ERROR,
-+               "Unsupported input color primaries %d (%s)\n",
-+               iprm, av_color_primaries_name(iprm));
-+        return AVERROR(EINVAL);
-+    }
-+    if (!oprm_desc) {
-+        av_log(s, AV_LOG_ERROR,
-+               "Unsupported output color primaries %d (%s)\n",
-+               oprm, av_color_primaries_name(oprm));
-+        return AVERROR(EINVAL);
-+    }
++            tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b,
++                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                 params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                 params->rgb2rgb_passthrough);
++            tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1,
++                                 params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                 params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                 params->rgb2rgb_passthrough);
 +
-+    ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz);
-+    ff_matrix_invert_3x3(rgb2xyz, xyz2rgb);
-+    ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz);
-+    ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb);
++            r0ox8 = vld1q_s16(r);
++            g0ox8 = vld1q_s16(g);
++            b0ox8 = vld1q_s16(b);
 +
-+    return 0;
-+}
++            r0oax4 = vmovl_s16(vget_low_s16(r0ox8));
++            g0oax4 = vmovl_s16(vget_low_s16(g0ox8));
++            b0oax4 = vmovl_s16(vget_low_s16(b0ox8));
 +
-+static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in,
-+                          int16_t *r_out, int16_t *g_out, int16_t *b_out,
-+                          float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
-+                          const AVLumaCoefficients *coeffs,
-+                          const AVLumaCoefficients *ocoeffs, double desat,
-+                          double (*rgb2rgb)[3][3],
-+                          int rgb2rgb_passthrough)
-+{
-+    int16_t sig;
-+    float mapval, r_lin, g_lin, b_lin;
++            r0obx4 = vmovl_s16(vget_high_s16(r0ox8));
++            g0obx4 = vmovl_s16(vget_high_s16(g0ox8));
++            b0obx4 = vmovl_s16(vget_high_s16(b0ox8));
 +
-+    /* load values */
-+    *r_out = r_in;
-+    *g_out = g_in;
-+    *b_out = b_in;
++            y0oax4 = vmulq_n_s32(r0oax4, cry);
++            y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy);
++            y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby);
++            y0oax4 = vaddq_s32(y0oax4, out_rndx4);
 +
-+    /* pick the brightest component, reducing the value range as necessary
-+     * to keep the entire signal in range and preventing discoloration due to
-+     * out-of-bounds clipping */
-+    sig = FFMAX3(r_in, g_in, b_in);
++            y0obx4 = vmulq_n_s32(r0obx4, cry);
++            y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy);
++            y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby);
++            y0obx4 = vaddq_s32(y0obx4, out_rndx4);
 +
-+    mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)];
++            r1ox8 = vld1q_s16(r1);
++            g1ox8 = vld1q_s16(g1);
++            b1ox8 = vld1q_s16(b1);
 +
-+    r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)];
-+    g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)];
-+    b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)];
++            r1oax4 = vmovl_s16(vget_low_s16(r1ox8));
++            g1oax4 = vmovl_s16(vget_low_s16(g1ox8));
++            b1oax4 = vmovl_s16(vget_low_s16(b1ox8));
 +
-+    if (!rgb2rgb_passthrough) {
-+        r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin;
-+        g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin;
-+        b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin;
-+    }
++            r1obx4 = vmovl_s16(vget_high_s16(r1ox8));
++            g1obx4 = vmovl_s16(vget_high_s16(g1ox8));
++            b1obx4 = vmovl_s16(vget_high_s16(b1ox8));
 +
-+#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a)
-+    /* desaturate to prevent unnatural colors */
-+    if (desat > 0) {
-+        float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin;
-+        float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS);
-+        r_lin = MIX(r_lin, luma, overbright);
-+        g_lin = MIX(g_lin, luma, overbright);
-+        b_lin = MIX(b_lin, luma, overbright);
-+    }
++            y1oax4 = vmulq_n_s32(r1oax4, cry);
++            y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy);
++            y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby);
++            y1oax4 = vaddq_s32(y1oax4, out_rndx4);
 +
-+    r_lin *= mapval;
-+    g_lin *= mapval;
-+    b_lin *= mapval;
-+#undef MIX
++            y1obx4 = vmulq_n_s32(r1obx4, cry);
++            y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy);
++            y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby);
++            y1obx4 = vaddq_s32(y1obx4, out_rndx4);
 +
-+    *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)];
-+    *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)];
-+    *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)];
-+}
++            ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4));
++            ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4));
++            ravgx4 = vcombine_s32(ravgax2, ravgbx2);
++            ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4));
++            ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4));
++            ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2));
++            ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4);
++            ravgx4 = vshrq_n_s32(ravgx4, 2);
 +
-+// See also libavfilter/colorspacedsp_template.c
-+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv,
-+                                    const uint16_t *srcy, const uint16_t *srcuv,
-+                                    const int *dstlinesize, const int *srclinesize,
-+                                    int dstdepth, int srcdepth,
-+                                    int width, int height,
-+                                    const struct TonemapIntParams *params)
-+{
-+    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
-+    const int in_sh2 = 16 - in_depth;
-+
-+    const int out_depth = dstdepth;
-+    const int out_uv_offset = 128 << (out_depth - 8);
-+    const int out_sh = 29 - out_depth;
-+    const int out_rnd = 1 << (out_sh - 1);
-+
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
-+
-+    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
-+    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
-+    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
-+    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
-+    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
-+    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
-+    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
-+    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++            gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4));
++            gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4));
++            gavgx4 = vcombine_s32(gavgax2, gavgbx2);
++            gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4));
++            gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4));
++            gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2));
++            gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4);
++            gavgx4 = vshrq_n_s32(gavgx4, 2);
 +
-+    int r00, g00, b00;
-+    int r01, g01, b01;
-+    int r10, g10, b10;
-+    int r11, g11, b11;
++            bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4));
++            bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4));
++            bavgx4 = vcombine_s32(bavgax2, bavgbx2);
++            bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4));
++            bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4));
++            bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2));
++            bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4);
++            bavgx4 = vshrq_n_s32(bavgx4, 2);
 +
-+    int16_t r[4], g[4], b[4];
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
-+        for (int x = 0; x < width; x += 2) {
-+            int y00 = (srcy[x]                          >> in_sh2) - params->in_yuv_off;
-+            int y01 = (srcy[x + 1]                      >> in_sh2) - params->in_yuv_off;
-+            int y10 = (srcy[srclinesize[0] / 2 + x]     >> in_sh2) - params->in_yuv_off;
-+            int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off;
-+            int u = (srcuv[x]     >> in_sh2) - in_uv_offset;
-+            int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset;
++            uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru);
++            uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu);
++            uox4 = vmlaq_n_s32(uox4, bavgx4, cburv);
 +
-+            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
-+            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
-+            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
-+            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++            vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv);
++            vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv);
++            vox4 = vmlaq_n_s32(vox4, bavgx4, cbv);
 +
-+            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            switch(out_depth) {
++                default:
++                case 10:
++                    y0oax4 = vshrq_n_s32(y0oax4, 19);
++                    y0obx4 = vshrq_n_s32(y0obx4, 19);
++                    y1oax4 = vshrq_n_s32(y1oax4, 19);
++                    y1obx4 = vshrq_n_s32(y1obx4, 19);
++                    uox4 = vshrq_n_s32(uox4, 19);
++                    vox4 = vshrq_n_s32(vox4, 19);
++                    break;
++                case 16:
++                    y0oax4 = vshrq_n_s32(y0oax4, 13);
++                    y0obx4 = vshrq_n_s32(y0obx4, 13);
++                    y1oax4 = vshrq_n_s32(y1oax4, 13);
++                    y1obx4 = vshrq_n_s32(y1obx4, 13);
++                    uox4 = vshrq_n_s32(uox4, 13);
++                    vox4 = vshrq_n_s32(vox4, 13);
++                    break;
++            }
 +
-+            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++            y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4);
++            y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4);
++            y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4);
++            y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4);
++            uox4 = vaddq_s32(uox4, out_uv_offsetx4);
++            vox4 = vaddq_s32(vox4, out_uv_offsetx4);
 +
-+            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4));
++            y0ox8 = vshlq_u16(y0ox8, out_sh2x8);
++            vst1q_u16(&dsty[x], y0ox8);
 +
-+            r00 = r[0], g00 = g[0], b00 = b[0];
-+            r01 = r[1], g01 = g[1], b01 = b[1];
-+            r10 = r[2], g10 = g[2], b10 = b[2];
-+            r11 = r[3], g11 = g[3], b11 = b[3];
++            y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4));
++            y1ox8 = vshlq_u16(y1ox8, out_sh2x8);
++            vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8);
 +
-+            dsty[x]                      = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh));
-+            dsty[x + 1]                  = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh));
-+            dsty[dstlinesize[0] + x]     = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh));
-+            dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh));
++            uvoax4 = vzip1q_s32(uox4, vox4);
++            uvobx4 = vzip2q_s32(uox4, vox4);
 +
-+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
-+            dstuv[x]     = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh));
-+            dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh));
-+#undef AVG
++            vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8));
 +        }
 +    }
++
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff8;
++        rdsty += offset;
++        rdstuv += offset;
++        rsrcy += offset;
++        rsrcuv += offset;
++        tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv,
++                                            rsrcy, rsrcuv,
++                                            dstlinesize, srclinesize,
++                                            dstdepth, srcdepth,
++                                            remainw, rheight, params);
++    }
++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
 +}
+Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h
+@@ -0,0 +1,68 @@
++/*
++ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
 +
-+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                 const int *dstlinesize, const int *srclinesize,
-+                                 int dstdepth, int srcdepth,
-+                                 int width, int height,
-+                                 const struct TonemapIntParams *params)
-+{
-+    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
++#ifndef AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
++#define AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
 +
-+    const int out_depth = dstdepth;
-+    const int out_uv_offset = 128 << (out_depth - 8);
-+    const int out_sh = 29 - out_depth;
-+    const int out_rnd = 1 << (out_sh - 1);
++#include "libavfilter/vf_tonemapx.h"
 +
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++void tonemap_frame_dovi_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                    const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                    const int *dstlinesize, const int *srclinesize,
++                                    int dstdepth, int srcdepth,
++                                    int width, int height,
++                                    const struct TonemapIntParams *params);
 +
-+    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
-+    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
-+    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
-+    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
-+    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
-+    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
-+    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
-+    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                      const int *dstlinesize, const int *srclinesize,
++                                      int dstdepth, int srcdepth,
++                                      int width, int height,
++                                      const struct TonemapIntParams *params);
 +
-+    int r00, g00, b00;
-+    int r01, g01, b01;
-+    int r10, g10, b10;
-+    int r11, g11, b11;
++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv,
++                                         const uint16_t *srcy, const uint16_t *srcuv,
++                                         const int *dstlinesize, const int *srclinesize,
++                                         int dstdepth, int srcdepth,
++                                         int width, int height,
++                                         const struct TonemapIntParams *params);
 +
-+    int16_t r[4], g[4], b[4];
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
-+                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
-+        for (int x = 0; x < width; x += 2) {
-+            int y00 = (srcy[x]                         ) - params->in_yuv_off;
-+            int y01 = (srcy[x + 1]                     ) - params->in_yuv_off;
-+            int y10 = (srcy[srclinesize[0] / 2 + x]    ) - params->in_yuv_off;
-+            int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off;
-+            int u = (srcu[x >> 1]) - in_uv_offset;
-+            int v = (srcv[x >> 1]) - in_uv_offset;
++void tonemap_frame_dovi_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                      const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                      const int *dstlinesize, const int *srclinesize,
++                                      int dstdepth, int srcdepth,
++                                      int width, int height,
++                                      const struct TonemapIntParams *params);
 +
-+            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
-+            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
-+            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
-+            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                        const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                        const int *dstlinesize, const int *srclinesize,
++                                        int dstdepth, int srcdepth,
++                                        int width, int height,
++                                        const struct TonemapIntParams *params);
 +
-+            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+
-+            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv,
++                                              const uint16_t *srcy, const uint16_t *srcuv,
++                                              const int *dstlinesize, const int *srclinesize,
++                                              int dstdepth, int srcdepth,
++                                              int width, int height,
++                                              const struct TonemapIntParams *params);
 +
-+            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++#endif // AVFILTER_AARCH64_TONEMAPX_INTRIN_NEON_H
+Index: FFmpeg/libavfilter/allfilters.c
+===================================================================
+--- FFmpeg.orig/libavfilter/allfilters.c
++++ FFmpeg/libavfilter/allfilters.c
+@@ -498,6 +498,7 @@ extern const AVFilter ff_vf_tmedian;
+ extern const AVFilter ff_vf_tmidequalizer;
+ extern const AVFilter ff_vf_tmix;
+ extern const AVFilter ff_vf_tonemap;
++extern const AVFilter ff_vf_tonemapx;
+ extern const AVFilter ff_vf_tonemap_cuda;
+ extern const AVFilter ff_vf_tonemap_opencl;
+ extern const AVFilter ff_vf_tonemap_vaapi;
+Index: FFmpeg/libavfilter/colorspace.c
+===================================================================
+--- FFmpeg.orig/libavfilter/colorspace.c
++++ FFmpeg/libavfilter/colorspace.c
+@@ -17,6 +17,7 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "libavutil/avassert.h"
+ #include "libavutil/frame.h"
+ #include "libavutil/mastering_display_metadata.h"
+ #include "libavutil/pixdesc.h"
+@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) {
+ float inverse_eotf_bt1886(float x) {
+     return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f;
+ }
 +
-+            r00 = r[0], g00 = g[0], b00 = b[0];
-+            r01 = r[1], g01 = g[1], b01 = b[1];
-+            r10 = r[2], g10 = g[2], b10 = b[2];
-+            r11 = r[3], g11 = g[3], b11 = b[3];
++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, /* outputs: offset, y range and uv range for rng at the given depth */
++                     enum AVColorRange rng, int depth) /* returns 0, or AVERROR(EINVAL) for any other rng */
++{
++    switch (rng) {
++    case AVCOL_RANGE_UNSPECIFIED: /* falls through: handled the same as AVCOL_RANGE_MPEG */
++    case AVCOL_RANGE_MPEG:
++        *off = 16 << (depth - 8); /* 16, 219 and 224 are scaled up by (depth - 8) bits */
++        *y_rng = 219 << (depth - 8); /* NOTE(review): depth < 8 would make these shift counts negative - confirm callers */
++        *uv_rng = 224 << (depth - 8);
++        break;
++    case AVCOL_RANGE_JPEG:
++        *off = 0;
++        *y_rng = *uv_rng = (256 << (depth - 8)) - 1; /* equals 2^depth - 1 for depth >= 8 */
++        break;
++    default:
++        return AVERROR(EINVAL); /* off, y_rng and uv_rng are left unwritten on this path */
++    }
 +
-+            dsty[x]                      = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh));
-+            dsty[x + 1]                  = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh));
-+            dsty[dstlinesize[0] + x]     = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh));
-+            dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh));
++    return 0;
++}
 +
-+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
-+            dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh));
-+            dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh));
-+#undef AVG
++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], /* fills out with rounded fixed-point coefficients computed from table */
++                       int depth, int y_rng, int uv_rng, int yuv2rgb) /* yuv2rgb != 0 selects the yuv2rgb scaling and index order */
++{
++#define N (yuv2rgb ? m : n) /* first index: m when yuv2rgb, else n */
++#define M (yuv2rgb ? n : m) /* second index: n when yuv2rgb, else m; n is the index that selects rng */
++    int rng, n, m, o;
++    int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); /* fixed-point scale: 2^(depth-1) for yuv2rgb, else 2^(29-depth) */
++    for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { /* n == 0 uses y_rng; n == 1 and n == 2 use uv_rng */
++        for (m = 0; m < 3; m++) { /* yuv2rgb scales table by 28672/rng, otherwise by rng/28672 */
++            out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672));
++            for (o = 1; o < 8; o++) /* replicate the coefficient into the remaining 7 slots */
++                out[N][M][o] = out[N][M][0];
 +        }
 +    }
++#undef N
++#undef M
++
++    if (yuv2rgb) { /* av_assert2 checks on the structure the computed matrix is expected to have */
++        av_assert2(out[0][1][0] == 0);
++        av_assert2(out[2][2][0] == 0);
++        av_assert2(out[0][0][0] == out[1][0][0]); /* all three entries in column 0 must be equal */
++        av_assert2(out[0][0][0] == out[2][0][0]);
++    } else {
++        av_assert2(out[1][2][0] == out[2][0][0]);
++    }
 +}
+Index: FFmpeg/libavfilter/colorspace.h
+===================================================================
+--- FFmpeg.orig/libavfilter/colorspace.h
++++ FFmpeg/libavfilter/colorspace.h
+@@ -85,4 +85,8 @@ float eotf_arib_b67(float x);
+ float inverse_eotf_arib_b67(float x);
+ float inverse_eotf_bt1886(float x);
+ 
++int ff_get_range_off(int *off, int *y_rng, int *uv_rng,
++                     enum AVColorRange rng, int depth);
++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3],
++                       int depth, int y_rng, int uv_rng, int yuv2rgb);
+ #endif
+Index: FFmpeg/libavfilter/vf_tonemapx.c
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/vf_tonemapx.c
+@@ -0,0 +1,1778 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
 +
-+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                   const int *dstlinesize, const int *srclinesize,
-+                                   int dstdepth, int srcdepth,
-+                                   int width, int height,
-+                                   const struct TonemapIntParams *params)
-+{
-+    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
++/**
++ * @file
++ * tonemap algorithms
++ */
 +
-+    const int out_depth = dstdepth;
-+    const int out_uv_offset = 128 << (out_depth - 8);
-+    const int out_sh = 29 - out_depth;
-+    const int out_rnd = 1 << (out_sh - 1);
++#include <float.h>
++#include <string.h>
 +
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++#include "libavutil/avassert.h"
++#include "libavutil/imgutils.h"
++#include "libavutil/internal.h"
++#include "libavutil/mem_internal.h"
++#include "libavutil/opt.h"
++#include "libavutil/cpu.h"
 +
-+    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
-+    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
-+    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
-+    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
-+    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
-+    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
-+    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
-+    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++#include "vf_tonemapx.h"
 +
-+    int r00, g00, b00;
-+    int r01, g01, b01;
-+    int r10, g10, b10;
-+    int r11, g11, b11;
++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS
++#    if ARCH_AARCH64
++#        if HAVE_INTRINSICS_NEON
++#            include "libavutil/aarch64/cpu.h"
++#            include "aarch64/vf_tonemapx_intrin_neon.h"
++#        endif
++#    endif // ARCH_AARCH64
++#    if ARCH_X86
++#        include "libavutil/x86/cpu.h"
++#        if HAVE_INTRINSICS_SSE42
++#            include "x86/vf_tonemapx_intrin_sse.h"
++#        endif
++#        if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3
++#            include "x86/vf_tonemapx_intrin_avx.h"
++#        endif
++#    endif // ARCH_X86
++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS
 +
-+    int16_t r[4], g[4], b[4];
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
-+        for (int x = 0; x < width; x += 2) {
-+            int y00 = (srcy[x]                         ) - params->in_yuv_off;
-+            int y01 = (srcy[x + 1]                     ) - params->in_yuv_off;
-+            int y10 = (srcy[srclinesize[0] / 2 + x]    ) - params->in_yuv_off;
-+            int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off;
-+            int u = (srcu[x >> 1]) - in_uv_offset;
-+            int v = (srcv[x >> 1]) - in_uv_offset;
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
 +
-+            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
-+            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
-+            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
-+            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++enum TonemapAlgorithm { // type of the TonemapxContext.tonemap field
++    TONEMAP_NONE,
++    TONEMAP_LINEAR,
++    TONEMAP_GAMMA,
++    TONEMAP_CLIP,
++    TONEMAP_REINHARD,
++    TONEMAP_HABLE,
++    TONEMAP_MOBIUS,
++    TONEMAP_BT2390,
++    TONEMAP_MAX, // last enumerator; presumably an upper bound rather than a real algorithm -- TODO confirm
++};
 +
-+            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++typedef struct TonemapxContext { // filter state; update_dovi_buf() obtains it from ctx->priv
++    const AVClass *class;
 +
-+            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++    enum TonemapAlgorithm tonemap;
++    enum AVColorTransferCharacteristic trc;
++    enum AVColorSpace spc;
++    enum AVColorPrimaries pri;
++    enum AVColorRange range;
++    enum AVPixelFormat format;
++    char *format_str; // presumably the textual form of `format` -- TODO confirm where it is parsed
++    double param;
++    double desat;
++    double peak;
++    int apply_dovi;
 +
-+            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++    const AVLumaCoefficients *coeffs, *ocoeffs;
 +
-+            r00 = r[0], g00 = g[0], b00 = b[0];
-+            r01 = r[1], g01 = g[1], b01 = b[1];
-+            r10 = r[2], g10 = g[2], b10 = b[2];
-+            r11 = r[3], g11 = g[3], b11 = b[3];
++    double lut_peak;
++    float *lin_lut;
++    float *tonemap_lut;
++    uint16_t *delin_lut;
++    int in_yuv_off, out_yuv_off;
 +
-+            dsty[x]                          = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16);
-+            dsty[x + 1]                      = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16);
-+            dsty[dstlinesize[0] / 2 + x]     = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16);
-+            dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16);
++    struct DoviMetadata *dovi; // update_dovi_buf() reads the per-component reshaping data from dovi->comp[]
 +
-+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
-+            dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16);
-+            dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16);
-+#undef AVG
-+        }
-+    }
-+}
++    DECLARE_ALIGNED(16, float,   dovi_pbuf)[3*(params_sz+pivots_sz+coeffs_sz+mmr_sz)]; // packed params/pivots/coeffs/mmr sections, written by update_dovi_buf()
++    DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; // same shape as the `out` parameter of ff_get_yuv_coeffs()
++    DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; // same shape as the `out` parameter of ff_get_yuv_coeffs()
++    DECLARE_ALIGNED(16, double,  rgb2rgb_coeffs)[3][3];
++    DECLARE_ALIGNED(16, double,  lms2rgb_matrix)[3][3];
++    DECLARE_ALIGNED(16, float,   ycc_offset)[3];
 +
-+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv,
-+                                         const uint16_t *srcy, const uint16_t *srcuv,
-+                                         const int *dstlinesize, const int *srclinesize,
-+                                         int dstdepth, int srcdepth,
-+                                         int width, int height,
-+                                         const struct TonemapIntParams *params)
-+{
-+    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
-+    const int in_sh2 = 16 - in_depth;
++    int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); // per-slice worker taking a job index and job count
 +
-+    const int out_depth = dstdepth;
-+    const int out_uv_offset = 128 << (out_depth - 8);
-+    const int out_sh = 29 - out_depth;
-+    const int out_rnd = 1 << (out_sh - 1);
-+    const int out_sh2 = 16 - out_depth;
++    void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, // uint8_t Y and UV destination pointers, uint16_t sources
++                                    const uint16_t *srcy, const uint16_t *srcuv,
++                                    const int *dstlinesize, const int *srclinesize,
++                                    int dstdepth, int srcdepth,
++                                    int width, int height,
++                                    const struct TonemapIntParams *params);
 +
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++    void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, // uint8_t Y, U and V destination pointers, uint16_t sources
++                                  const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                  const int *dstlinesize, const int *srclinesize,
++                                  int dstdepth, int srcdepth,
++                                  int width, int height,
++                                  const struct TonemapIntParams *params);
 +
-+    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
-+    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
-+    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
-+    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
-+    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
-+    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
-+    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
-+    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++    void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, // uint16_t Y and UV destination pointers, uint16_t sources
++                                     const uint16_t *srcy, const uint16_t *srcuv,
++                                     const int *dstlinesize, const int *srclinesize,
++                                     int dstdepth, int srcdepth,
++                                     int width, int height,
++                                     const struct TonemapIntParams *params);
 +
-+    int r00, g00, b00;
-+    int r01, g01, b01;
-+    int r10, g10, b10;
-+    int r11, g11, b11;
++    void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, // uint16_t Y, U and V destination pointers, uint16_t sources
++                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                   const int *dstlinesize, const int *srclinesize,
++                                   int dstdepth, int srcdepth,
++                                   int width, int height,
++                                   const struct TonemapIntParams *params);
 +
-+    int16_t r[4], g[4], b[4];
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
-+        for (int x = 0; x < width; x += 2) {
-+            int y00 = (srcy[x]                          >> in_sh2) - params->in_yuv_off;
-+            int y01 = (srcy[x + 1]                      >> in_sh2) - params->in_yuv_off;
-+            int y10 = (srcy[srclinesize[0] / 2 + x]     >> in_sh2) - params->in_yuv_off;
-+            int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off;
-+            int u = (srcuv[x]     >> in_sh2) - in_uv_offset;
-+            int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset;
++    void (*tonemap_func_dovi8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, // same signature as tonemap_func_planar8
++                                const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                const int *dstlinesize, const int *srclinesize,
++                                int dstdepth, int srcdepth,
++                                int width, int height,
++                                const struct TonemapIntParams *params);
 +
-+            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
-+            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
-+            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
-+            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++    void (*tonemap_func_dovi10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, // same signature as tonemap_func_planar10
++                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                 const int *dstlinesize, const int *srclinesize,
++                                 int dstdepth, int srcdepth,
++                                 int width, int height,
++                                 const struct TonemapIntParams *params);
 +
-+            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++} TonemapxContext;
 +
-+            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
-+            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++typedef struct ThreadData { // NOTE(review): presumably the `arg` handed to the filter_slice callbacks -- confirm against the caller
++    AVFrame *in, *out;
++    const AVPixFmtDescriptor *desc, *odesc; // presumably the format descriptors of `in` and `out` respectively -- TODO confirm
++    double peak;
++} ThreadData;
 +
-+            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
-+            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
-+                          params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++static const enum AVPixelFormat in_pix_fmts[] = { // presumably the accepted input formats -- confirm against query_formats
++    AV_PIX_FMT_YUV420P10,
++    AV_PIX_FMT_P010,
++    AV_PIX_FMT_P016,
++    AV_PIX_FMT_NONE, // list terminator
++};
 +
-+            r00 = r[0], g00 = g[0], b00 = b[0];
-+            r01 = r[1], g01 = g[1], b01 = b[1];
-+            r10 = r[2], g10 = g[2], b10 = b[2];
-+            r11 = r[3], g11 = g[3], b11 = b[3];
++static const enum AVPixelFormat out_pix_fmts[] = { // NOTE(review): unlike in_pix_fmts there is no AV_PIX_FMT_NONE terminator; users must bound iteration by element count -- confirm
++    AV_PIX_FMT_YUV420P,
++    AV_PIX_FMT_YUV420P10,
++    AV_PIX_FMT_NV12,
++    AV_PIX_FMT_P010,
++    AV_PIX_FMT_P016,
++};
 +
-+            dsty[x]                          = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
-+            dsty[x + 1]                      = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
-+            dsty[dstlinesize[0] / 2 + x]     = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
-+            dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
++const double dovi_lms2rgb_matrix[3][3] = // NOTE(review): file-scope definition without `static`, so the symbol has external linkage -- confirm that is intended
++    {
++        { 3.06441879, -2.16597676,  0.10155818},
++        {-0.65612108,  1.78554118, -0.12943749},
++        { 0.01736321, -0.04725154,  1.03004253},
++    };
 +
-+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
-+            dstuv[x]     = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16);
-+            dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16);
-+#undef AVG
++static void update_dovi_buf(AVFilterContext *ctx) // packs the reshaping data of s->dovi->comp[0..2] into s->dovi_pbuf
++{
++    TonemapxContext *s = ctx->priv;
++    float coeffs_data[8][4] = {0}; // one 4-float row per piece; slot 3 holds the order (0 for polynomial pieces)
++    float mmr_packed_data[8*6][4] = {0}; // MMR weights, two 4-float rows per (piece, order)
++    int c, i, j, k;
++
++    for (c = 0; c < 3; c++) {
++        int has_poly = 0, has_mmr = 0, mmr_single = 1;
++        int mmr_idx = 0, min_order = 3, max_order = 1;
++        const struct ReshapeData *comp = &s->dovi->comp[c];
++        if (!comp->num_pivots) // nothing is written to dovi_pbuf for a component without pivots
++            continue;
++        av_assert0(comp->num_pivots >= 2 && comp->num_pivots <= 9);
++
++        memset(coeffs_data, 0, sizeof(coeffs_data)); // only coeffs_data is cleared per component; mmr_packed_data keeps earlier contents
++        for (i = 0; i < comp->num_pivots - 1; i++) { // one piece per pair of adjacent pivots
++            switch (comp->method[i]) {
++                case 0: // polynomial
++                    has_poly = 1;
++                    coeffs_data[i][3] = 0.0f; // order=0 signals polynomial
++                    for (k = 0; k < 3; k++)
++                        coeffs_data[i][k] = comp->poly_coeffs[i][k];
++                    break;
++                case 1: // piece described by mmr_order / mmr_constant / mmr_coeffs
++                    min_order = FFMIN(min_order, comp->mmr_order[i]);
++                    max_order = FFMAX(max_order, comp->mmr_order[i]);
++                    mmr_single = !has_mmr; // remains 1 only while this is the first MMR piece seen
++                    has_mmr = 1;
++                    coeffs_data[i][3] = (float)comp->mmr_order[i];
++                    coeffs_data[i][0] = comp->mmr_constant[i];
++                    coeffs_data[i][1] = (float)mmr_idx; // row index of this piece's first weights in mmr_packed_data
++                    for (j = 0; j < comp->mmr_order[i]; j++) {
++                        // store weights per order as two packed vec4s
++                        float *mmr = &mmr_packed_data[mmr_idx][0];
++                        mmr[0] = comp->mmr_coeffs[i][j][0];
++                        mmr[1] = comp->mmr_coeffs[i][j][1];
++                        mmr[2] = comp->mmr_coeffs[i][j][2];
++                        mmr[3] = 0.0f; // unused
++                        mmr[4] = comp->mmr_coeffs[i][j][3];
++                        mmr[5] = comp->mmr_coeffs[i][j][4];
++                        mmr[6] = comp->mmr_coeffs[i][j][5];
++                        mmr[7] = comp->mmr_coeffs[i][j][6];
++                        mmr_idx += 2; // each order consumed two rows
++                    }
++                    break;
++                default:
++                    av_assert0(0); // any other method value aborts
++            }
 +        }
-+    }
-+}
 +
-+#define LOAD_TONEMAP_PARAMS     TonemapxContext *s = ctx->priv; \
-+ThreadData *td = arg;                                           \
-+AVFrame *in = td->in;                                           \
-+AVFrame *out = td->out;                                         \
-+const AVPixFmtDescriptor *desc  = td->desc;                     \
-+const AVPixFmtDescriptor *odesc = td->odesc;                    \
-+const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \
-+const int slice_start = (in->height / ss *  jobnr     ) / nb_jobs * ss; \
-+const int slice_end   = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \
-+TonemapIntParams params = {                                     \
-+.lut_peak            = s->lut_peak,                             \
-+.lin_lut             = s->lin_lut,                              \
-+.tonemap_lut         = s->tonemap_lut,                          \
-+.delin_lut           = s->delin_lut,                            \
-+.in_yuv_off          = s->in_yuv_off,                           \
-+.out_yuv_off         = s->out_yuv_off,                          \
-+.yuv2rgb_coeffs      = &s->yuv2rgb_coeffs,                      \
-+.rgb2yuv_coeffs      = &s->rgb2yuv_coeffs,                      \
-+.rgb2rgb_coeffs      = &s->rgb2rgb_coeffs,                      \
-+.rgb2rgb_passthrough = in->color_primaries == out->color_primaries,   \
-+.coeffs              = s->coeffs,                               \
-+.ocoeffs             = s->ocoeffs,                              \
-+.desat               = s->desat,                                \
-+};
++        av_assert0(has_poly || has_mmr);
 +
-+static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
-+{
-+    LOAD_TONEMAP_PARAMS
-+    av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++        if (has_mmr)
++            av_assert0(min_order <= max_order);
 +
-+    s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start,
-+                            out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h),
-+                            out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h),
-+                            (void*)(in->data[0] + in->linesize[0] * slice_start),
-+                            (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                            (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                            out->linesize, in->linesize,
-+                            odesc->comp[0].depth, desc->comp[0].depth,
-+                            out->width, slice_end - slice_start,
-+                            &params);
++        // dovi_params
++        {
++            float params[8] = { // same field order that reshape_dovi_yuv() reads back from dovi_params[0..7]
++                comp->num_pivots, !!has_mmr, !!has_poly,
++                mmr_single, min_order, max_order,
++                comp->pivots[0], comp->pivots[comp->num_pivots - 1]
++            };
++            memcpy(s->dovi_pbuf + c*params_cnt, params, params_sz);
++        }
 +
-+    return 0;
++        // dovi_pivots
++        if (c == 0 && comp->num_pivots > 2) { // pivots are stored only for component 0, and only if it has interior pivots
++            // Skip the (irrelevant) lower and upper bounds
++            float pivots_data[7+1] = {0};
++            memcpy(pivots_data, comp->pivots + 1,
++                   (comp->num_pivots - 2) * sizeof(pivots_data[0]));
++            // Fill the remainder with a quasi-infinite sentinel pivot
++            for (i = comp->num_pivots - 2; i < FF_ARRAY_ELEMS(pivots_data); i++)
++                pivots_data[i] = 1e9f;
++            memcpy(s->dovi_pbuf + 3*params_cnt + c*pivots_cnt, pivots_data, pivots_sz);
++        }
++
++        // dovi_coeffs
++        memcpy(s->dovi_pbuf + 3*(params_cnt+pivots_cnt) + c*coeffs_cnt, &coeffs_data[0], coeffs_sz);
++
++        // dovi_mmr
++        if (has_mmr) // the MMR section is left untouched for components without MMR pieces
++            memcpy(s->dovi_pbuf + 3*(params_cnt+pivots_cnt+coeffs_cnt) + c*mmr_cnt, &mmr_packed_data[0], mmr_sz);
++    }
 +}
 +
-+static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++inline static float dot(const float* x, const float* y, int len) // returns the sum of x[i] * y[i] for i in [0, len)
 +{
-+    LOAD_TONEMAP_PARAMS
-+    av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
-+
-+    s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start,
-+                              out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h),
-+                              (void*)(in->data[0] + in->linesize[0] * slice_start),
-+                              (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                              out->linesize, in->linesize,
-+                              odesc->comp[0].depth, desc->comp[0].depth,
-+                              out->width, slice_end - slice_start,
-+                              &params);
++    int i;
++    float result = 0; // stays 0 when len <= 0
++    for (i = 0; i < len; i++) {
++        result += x[i] * y[i];
++    }
++    return result;
++}
 +
-+    return 0;
++inline static float reshape_poly(float s, float* coeffs) { // evaluates coeffs[0] + coeffs[1]*s + coeffs[2]*s*s; coeffs is only read
++    return (coeffs[2] * s + coeffs[1]) * s + coeffs[0]; // Horner form
 +}
 +
-+static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++inline static float reshape_mmr(const float* sig, const float* coeffs, const float* mmr, // evaluates one MMR piece on the 3-component signal sig[0..2]
++                                int mmr_single, int min_order, int max_order)
 +{
-+    LOAD_TONEMAP_PARAMS
-+    av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    int mmr_idx = mmr_single ? 0 : (int)coeffs[1]; // NOTE(review): update_dovi_buf() stores this as a 4-float row index, yet it is added below as a plain float offset (not scaled by 4) -- confirm
++    int order = (int)coeffs[3]; // order of this piece
++    float s = coeffs[0]; // accumulator starts from coeffs[0]
++    float sigX[7+1] = {sig[0], sig[1], sig[2], 0, // first-order terms; slot 3 is zero padding
++                       sig[0]*sig[1], sig[0]*sig[2], sig[1]*sig[2], sig[0]*sig[1]*sig[2]}; // pairwise products and the triple product
++
++    s += dot(&mmr[mmr_idx + 0*4], sigX, 7+1); // first-order weights: 8 floats
++    if (max_order >= 2 && (min_order >= 2 || order >= 2)) {
++        float sigX2[7+1] = {sig[0]*sig[0], sig[1]*sig[1], sig[2]*sig[2], 0, // element-wise squares of sigX
++                            sigX[4]*sigX[4], sigX[5]*sigX[5], sigX[6]*sigX[6], sigX[7]*sigX[7]};
++        s += dot(&mmr[mmr_idx + 2*4], sigX2, 7+1); // second-order weights start 8 floats further on
++
++        if (max_order == 3 && (min_order == 3 || order >= 3)) {
++            float sigX3[7+1] = {sig[0]*sig[0]*sig[0], sig[1]*sig[1]*sig[1], sig[2]*sig[2]*sig[2], 0, // element-wise cubes of sigX
++                                sigX2[4]*sigX[4], sigX2[5]*sigX[5], sigX2[6]*sigX[6], sigX2[7]*sigX[7]};
++            s += dot(&mmr[mmr_idx + 4*4], sigX3, 7+1); // third-order weights start 16 floats after the first-order ones
++        }
++    }
 +
-+    s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start),
-+                             (uint16_t *) (out->data[1] +
-+                                           out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
-+                             (uint16_t *) (out->data[2] +
-+                                           out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
-+                             (void*)(in->data[0] + in->linesize[0] * slice_start),
-+                             (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                             (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                             out->linesize, in->linesize,
-+                             odesc->comp[0].depth, desc->comp[0].depth,
-+                             out->width, slice_end - slice_start,
-+                             &params);
++    return s;
++}
 +
-+    return 0;
++inline static void ycc2rgb(float* dest, float y, float cb, float cr, const double nonlinear[3][3], const float ycc_offset[3]) // dest = nonlinear * (y, cb, cr) - ycc_offset
++{
++    dest[0] = (y * (float)nonlinear[0][0] + cb * (float)nonlinear[0][1] + cr * (float)nonlinear[0][2]) - ycc_offset[0]; // matrix entries are narrowed to float before multiplying
++    dest[1] = (y * (float)nonlinear[1][0] + cb * (float)nonlinear[1][1] + cr * (float)nonlinear[1][2]) - ycc_offset[1];
++    dest[2] = (y * (float)nonlinear[2][0] + cb * (float)nonlinear[2][1] + cr * (float)nonlinear[2][2]) - ycc_offset[2];
 +}
 +
-+static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++// This implementation skips the costly linearization and de-linearization steps for performance reasons,
++// so the output color accuracy is reduced as a result.
++inline static void lms2rgb(float* dest, float l, float m, float s, const double linear[3][3], const double lms2rgb_matrix[3][3]) // dest = lms2rgb_matrix * (l, m, s); `linear` is not used in the body
 +{
-+    LOAD_TONEMAP_PARAMS
-+    av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    dest[0] = l * (float)lms2rgb_matrix[0][0] + m * (float)lms2rgb_matrix[0][1] + s * (float)lms2rgb_matrix[0][2]; // matrix entries are narrowed to float before multiplying
++    dest[1] = l * (float)lms2rgb_matrix[1][0] + m * (float)lms2rgb_matrix[1][1] + s * (float)lms2rgb_matrix[1][2];
++    dest[2] = l * (float)lms2rgb_matrix[2][0] + m * (float)lms2rgb_matrix[2][1] + s * (float)lms2rgb_matrix[2][2];
++}
 +
-+    s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start),
-+                               (uint16_t *) (out->data[1] +
-+                                             out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
-+                               (void*)(in->data[0] + in->linesize[0] * slice_start),
-+                               (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
-+                               out->linesize, in->linesize,
-+                               odesc->comp[0].depth, desc->comp[0].depth,
-+                               out->width, slice_end - slice_start,
-+                               &params);
++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) // limits a to the range [b, c]; presumably evaluates its arguments more than once -- avoid side effects
++inline static void reshape_dovi_yuv(float* dest, float* src, const TonemapIntParams *ctx) /* reshapes the triplet src[0..2] into dest[0..2] using the tables in ctx->dovi_pbuf; dest is written only after the loop, so it may alias src */
++{
++    int i;
++    float s;
++    float coeffs[4] = {0, 0, 0, 0};
++    float sig_arr[3] = {src[0],src[1],src[2]}; /* receives the clamped result of each component as the loop advances */
++
++    int dovi_num_pivots, dovi_has_mmr, dovi_has_poly;
++    int dovi_mmr_single, dovi_min_order, dovi_max_order;
++    int has_mmr_poly;
++    float dovi_lo, dovi_hi;
++    float *dovi_params;
++    float *dovi_pivots;
++    float *dovi_coeffs, *dovi_mmr; //float4*
++
++    float *src_dovi_params = ctx->dovi_pbuf; /* 8 floats per component */
++    float *src_dovi_pivots = ctx->dovi_pbuf + 24; /* 8 floats per component */
++    float *src_dovi_coeffs = ctx->dovi_pbuf + 48; //float4*, 8 per component
++    float *src_dovi_mmr = ctx->dovi_pbuf + 144; //float4*, 48 per component
++
++    for (i = 0; i < 3; i++) {
++        dovi_params = src_dovi_params + i*8;
++        dovi_pivots = src_dovi_pivots + i*8;
++        dovi_coeffs = src_dovi_coeffs + i*8*4; //float4*
++        dovi_mmr = src_dovi_mmr + i*48*4; //float4*
++        dovi_num_pivots = dovi_params[0];
++        dovi_has_mmr = dovi_params[1];
++        dovi_has_poly = dovi_params[2];
++        dovi_mmr_single = dovi_params[3];
++        dovi_min_order = dovi_params[4];
++        dovi_max_order = dovi_params[5];
++        dovi_lo = dovi_params[6];
++        dovi_hi = dovi_params[7];
++
++        s = sig_arr[i];
++        coeffs[0] = dovi_coeffs[0*4+0];
++        coeffs[1] = dovi_coeffs[0*4+1];
++        coeffs[2] = dovi_coeffs[0*4+2];
++        coeffs[3] = dovi_coeffs[0*4+3];
++
++#define mix(x, y, a) ((x) + ((y) - (x)) * (a))
++        if (i == 0 && dovi_num_pivots > 2) { /* only component 0 picks among the 8 coefficient sets; t0..t6 are 0/1 flags, so mix() acts as a select */
++            int t0 = s >= dovi_pivots[0], t1 = s >= dovi_pivots[1];
++            int t2 = s >= dovi_pivots[2], t3 = s >= dovi_pivots[3];
++            int t4 = s >= dovi_pivots[4], t5 = s >= dovi_pivots[5], t6 = s >= dovi_pivots[6];
++
++            float m01[4] = { mix(dovi_coeffs[0*4+0], dovi_coeffs[1*4+0], t0),
++                             mix(dovi_coeffs[0*4+1], dovi_coeffs[1*4+1], t0),
++                             mix(dovi_coeffs[0*4+2], dovi_coeffs[1*4+2], t0),
++                             mix(dovi_coeffs[0*4+3], dovi_coeffs[1*4+3], t0) };
++            float m23[4] = { mix(dovi_coeffs[2*4+0], dovi_coeffs[3*4+0], t2),
++                             mix(dovi_coeffs[2*4+1], dovi_coeffs[3*4+1], t2),
++                             mix(dovi_coeffs[2*4+2], dovi_coeffs[3*4+2], t2),
++                             mix(dovi_coeffs[2*4+3], dovi_coeffs[3*4+3], t2) };
++            float m0123[4] = { mix(m01[0], m23[0], t1),
++                               mix(m01[1], m23[1], t1),
++                               mix(m01[2], m23[2], t1),
++                               mix(m01[3], m23[3], t1) };
++            float m45[4] = { mix(dovi_coeffs[4*4+0], dovi_coeffs[5*4+0], t4),
++                             mix(dovi_coeffs[4*4+1], dovi_coeffs[5*4+1], t4),
++                             mix(dovi_coeffs[4*4+2], dovi_coeffs[5*4+2], t4),
++                             mix(dovi_coeffs[4*4+3], dovi_coeffs[5*4+3], t4) };
++            float m67[4] = { mix(dovi_coeffs[6*4+0], dovi_coeffs[7*4+0], t6),
++                             mix(dovi_coeffs[6*4+1], dovi_coeffs[7*4+1], t6),
++                             mix(dovi_coeffs[6*4+2], dovi_coeffs[7*4+2], t6),
++                             mix(dovi_coeffs[6*4+3], dovi_coeffs[7*4+3], t6) };
++            float m4567[4] = { mix(m45[0], m67[0], t5),
++                               mix(m45[1], m67[1], t5),
++                               mix(m45[2], m67[2], t5),
++                               mix(m45[3], m67[3], t5) };
++
++            coeffs[0] = mix(m0123[0], m4567[0], t3);
++            coeffs[1] = mix(m0123[1], m4567[1], t3);
++            coeffs[2] = mix(m0123[2], m4567[2], t3);
++            coeffs[3] = mix(m0123[3], m4567[3], t3);
++        }
 +
-+    return 0;
++        has_mmr_poly = dovi_has_mmr && dovi_has_poly;
++
++        if ((has_mmr_poly && coeffs[3] == 0.0f) || (!has_mmr_poly && dovi_has_poly))
++            s = reshape_poly(s, coeffs);
++        else
++            s = reshape_mmr(src, coeffs, dovi_mmr, /* original signal: sig_arr[] already holds reshaped earlier components */
++                            dovi_mmr_single, dovi_min_order, dovi_max_order);
++
++        sig_arr[i] = CLAMP(s, dovi_lo, dovi_hi);
++    }
++
++    dest[0] = sig_arr[0];
++    dest[1] = sig_arr[1];
++    dest[2] = sig_arr[2];
 +}
 +
-+static int filter_frame(AVFilterLink *link, AVFrame *in)
++static int out_format_is_supported(enum AVPixelFormat fmt) /* returns 1 when fmt is listed in out_pix_fmts, 0 otherwise */
 +{
-+    AVFilterContext *ctx = link->dst;
-+    TonemapxContext *s = ctx->priv;
-+    AVFilterLink *outlink = ctx->outputs[0];
-+    AVFrame *out;
-+    const AVPixFmtDescriptor *desc;
-+    const AVPixFmtDescriptor *odesc;
-+    int ret;
-+    double peak = s->peak;
-+    const AVLumaCoefficients *coeffs;
-+    ThreadData td;
++    int i;
 +
-+    desc = av_pix_fmt_desc_get(link->format);
-+    odesc = av_pix_fmt_desc_get(outlink->format);
-+    if (!desc || !odesc) {
-+        av_frame_free(&in);
-+        return AVERROR_BUG;
-+    }
++    for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++) /* linear scan over every array element */
++        if (out_pix_fmts[i] == fmt)
++            return 1;
++    return 0;
++}
 +
-+    switch (odesc->comp[2].plane) {
-+        case 1: // biplanar
-+            if (odesc->comp[0].depth == 8) {
-+                s->filter_slice = filter_slice_biplanar8;
-+            } else {
-+                s->filter_slice = filter_slice_biplanar10;
-+            }
-+            break;
-+        default:
-+        case 2: // planar
-+            if (odesc->comp[0].depth == 8) {
-+                s->filter_slice = filter_slice_planar8;
-+            } else {
-+                s->filter_slice = filter_slice_planar10;
-+            }
-+            break;
-+    }
++static float hable(float in)
++{   /* rational curve (x(Ax + BC) + DE) / (x(Ax + B) + DF) - E/F with fixed constants */
++    const float A = 0.15f, B = 0.50f, C = 0.10f, D = 0.20f, E = 0.02f, F = 0.30f;
++    return (in * (in * A + B * C) + D * E) / (in * (in * A + B) + D * F) - E / F;
++}
 +
-+    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
-+    if (!out) {
-+        av_frame_free(&in);
-+        return AVERROR(ENOMEM);
-+    }
++static float mobius(float in, float j, double peak) /* identity for in <= j; above j a rational curve in (in + a) / (in + b) scaled by a constant */
++{
++    float a, b;
 +
-+    if ((ret = av_frame_copy_props(out, in)) < 0)
-+        goto fail;
++    if (in <= j)
++        return in;
 +
-+    /* read peak from side data if not passed in */
-+    if (!peak) {
-+        peak = ff_determine_signal_peak(in);
-+        av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak);
-+    }
++    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
++    b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); /* divisor is floored at FLOAT_EPS */
 +
-+    out->color_trc = s->trc;
-+    out->colorspace = s->spc;
-+    out->color_primaries = s->pri;
-+    out->color_range = s->range;
++    return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b);
++}
 +
-+    if (in->color_trc == AVCOL_TRC_UNSPECIFIED)
-+        in->color_trc = AVCOL_TRC_SMPTE2084;
-+    if (out->color_trc == AVCOL_TRC_UNSPECIFIED)
-+        out->color_trc = AVCOL_TRC_BT709;
++static float bt2390(float s, float peak) /* works on inverse_eotf_st2084() output rescaled so that peak maps to 1, then converts back with eotf_st2084() */
++{
++    float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT);
++    float scale = 1.0f / peak_pq;
 +
-+    if (in->colorspace == AVCOL_SPC_UNSPECIFIED)
-+        in->colorspace = AVCOL_SPC_BT2020_NCL;
-+    if (out->colorspace == AVCOL_SPC_UNSPECIFIED)
-+        out->colorspace = AVCOL_SPC_BT709;
++    // SDR peak
++    float dst_peak = 1.0f;
++    float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale;
++    float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale;
 +
-+    if (in->color_primaries == AVCOL_PRI_UNSPECIFIED)
-+        in->color_primaries = AVCOL_PRI_BT2020;
-+    if (out->color_primaries == AVCOL_PRI_UNSPECIFIED)
-+        out->color_primaries = AVCOL_PRI_BT709;
++    float ks = 1.5f * maxLum - 0.5f; /* knee start; NOTE(review): ks reaches 1 when maxLum is 1, making the tb divisor zero - confirm callers never hit that on the pb path */
++    float tb = (s_pq - ks) / (1.0f - ks);
++    float tb2 = tb * tb;
++    float tb3 = tb2 * tb;
++    float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + /* cubic Hermite basis: value ks at tb = 0 ... */
++               (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + /* ... start tangent 1 - ks ... */
++               (-2.0f * tb3 + 3.0f * tb2) * maxLum; /* ... value maxLum at tb = 1 */
++    float sig = (s_pq < ks) ? s_pq : pb; /* below the knee the signal is kept as is; above it the cubic takes over */
 +
-+    if (in->color_range == AVCOL_RANGE_UNSPECIFIED)
-+        in->color_range = AVCOL_RANGE_MPEG;
-+    if (out->color_range == AVCOL_RANGE_UNSPECIFIED)
-+        out->color_range = AVCOL_RANGE_MPEG;
++    return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); /* undo the 1 / peak_pq normalisation before converting back */
++}
 +
-+    if (!s->lin_lut || !s->delin_lut) {
-+        if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0)
-+            goto fail;
++static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) /* applies the curve selected by alg to sig; peak and param are only read by the curves that use them */
++{
++    switch(alg) {
++    default: /* unknown values share the TONEMAP_NONE branch and leave sig unchanged */
++    case TONEMAP_NONE:
++        // do nothing
++        break;
++    case TONEMAP_LINEAR:
++        sig = sig * param / peak;
++        break;
++    case TONEMAP_GAMMA:
++        sig = sig > 0.05f /* power curve above 0.05, straight line through the origin below it */
++              ? pow(sig / peak, 1.0f / param)
++              : sig * pow(0.05f / peak, 1.0f / param) / 0.05f;
++        break;
++    case TONEMAP_CLIP:
++        sig = av_clipf(sig * param, 0, 1.0f);
++        break;
++    case TONEMAP_HABLE:
++        sig = hable(sig) / hable(peak); /* normalised so that sig == peak maps to 1 */
++        break;
++    case TONEMAP_REINHARD:
++        sig = sig / (sig + param) * (peak + param) / peak; /* scaled so that sig == peak maps to 1 */
++        break;
++    case TONEMAP_MOBIUS:
++        sig = mobius(sig, param, peak); /* param is passed as the knee j */
++        break;
++    case TONEMAP_BT2390:
++        sig = bt2390(sig, peak); /* param is not used by this curve */
++        break;
 +    }
 +
-+    if (!s->tonemap_lut || s->lut_peak != peak) {
-+        s->lut_peak = peak;
-+        if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0)
-+            goto fail;
-+    }
++    return sig;
++}
 +
-+    coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace);
-+    if (s->coeffs != coeffs) {
-+        s->coeffs = coeffs;
-+        s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace);
-+        if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc,
-+             in->color_range, out->color_range)) < 0)
-+            goto fail;
-+        if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0)
-+            goto fail;
-+    }
++/* Decodes x with eotf_st2084() or eotf_arib_b67() depending on trc_src; any other transfer returns x as is. */
++static float linearize(float x, enum AVColorTransferCharacteristic trc_src)
++{
++    switch (trc_src) {
++    case AVCOL_TRC_SMPTE2084:    return eotf_st2084(x, REFERENCE_WHITE_ALT);
++    case AVCOL_TRC_ARIB_STD_B67: return eotf_arib_b67(x);
++    default:                     return x;
++    }
++}
 +
-+    /* do the tonemap */
-+    td.in    = in;
-+    td.out   = out;
-+    td.desc  = desc;
-+    td.odesc = odesc;
-+    td.peak  = peak;
-+    ff_filter_execute(ctx, s->filter_slice, &td, NULL,
-+                      FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx)));
++/* Encodes x with inverse_eotf_bt1886() for BT.709 / BT.2020-10 targets; any other transfer returns x as is. */
++static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst)
++{
++    const int use_bt1886 = trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10;
++
++    return use_bt1886 ? inverse_eotf_bt1886(x) : x;
++}
 +
-+    av_frame_free(&in);
++static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, /* (re)fills s->lin_lut and s->delin_lut, allocating them on first use */
++                            enum AVColorTransferCharacteristic trc_dst) /* returns 0, or AVERROR(ENOMEM) if an allocation fails */
++{
++    int i;
 +
-+    av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
-+    av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
++    if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) /* existing buffers are reused */
++        return AVERROR(ENOMEM);
++    if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t))))
++        return AVERROR(ENOMEM);
 +
-+    return ff_filter_frame(outlink, out);
-+fail:
-+    av_frame_free(&in);
-+    av_frame_free(&out);
-+    return ret;
++    for (i = 0; i < 32768; i++) {
++        double v1 = (i - 2048.0f) / 28672.0f; /* lin_lut index i stands for the value (i - 2048) / 28672 */
++        double v2 = i / 32767.0f; /* delin_lut index i stands for i / 32767 */
++        s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); /* negative results are stored as 0 */
++        s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); /* stored at the same 28672 scale, clipped to the int16 range */
++    }
++
++    return 0;
 +}
 +
-+static void uninit(AVFilterContext *ctx)
++static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) /* (re)fills s->tonemap_lut for s->lut_peak; returns 0 or AVERROR(ENOMEM) */
 +{
-+    TonemapxContext *s = ctx->priv;
++    int i;
++    double peak = s->lut_peak;
 +
-+    av_freep(&s->lin_lut);
-+    av_freep(&s->delin_lut);
-+    av_freep(&s->tonemap_lut);
++    if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) /* an existing buffer is reused */
++        return AVERROR(ENOMEM);
++
++    for (i = 0; i < 32768; i++) {
++        double v = (i - 2048.0f) / 28672.0f; /* index i stands for the value (i - 2048) / 28672 */
++        double sig = linearize(v, trc_src);
++        float mapped = mapsig(s->tonemap, sig, peak, s->param);
++        s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? mapped / sig : 0.0f; /* stores the gain mapped / sig, not the mapped value; 0 when either is not positive */
++    }
++
++    return 0;
 +}
 +
-+static int query_formats(AVFilterContext *ctx)
++static int compute_yuv_coeffs(TonemapxContext *s,
++                              const AVLumaCoefficients *coeffs,
++                              const AVLumaCoefficients *ocoeffs,
++                              const AVPixFmtDescriptor *idesc,
++                              const AVPixFmtDescriptor *odesc,
++                              enum AVColorRange irng,
++                              enum AVColorRange orng)
 +{
-+    enum AVPixelFormat valid_in_pix_fmts[4];
-+    AVFilterFormats *formats;
-+    const AVPixFmtDescriptor *desc;
-+    TonemapxContext *s = ctx->priv;
++    double rgb2yuv[3][3], yuv2rgb[3][3];
++    int res;
++    int y_rng, uv_rng;
 +
-+    if (!strcmp(s->format_str, "same")) {
-+        int res;
-+        formats = ff_make_format_list(in_pix_fmts);
-+        res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats);
-+        if (res < 0)
-+            return res;
-+        s->format = AV_PIX_FMT_NONE;
-+    } else {
-+        int i, j = 0;
-+        int res;
-+        formats = ff_make_format_list(in_pix_fmts);
-+        res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats);
-+        if (res < 0)
-+            return res;
-+        if (s->format == AV_PIX_FMT_NONE) {
-+            av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
-+            return AVERROR(EINVAL);
-+        }
-+        s->format = av_get_pix_fmt(s->format_str);
-+        // Check again in case of the string is invalid
-+        if (s->format == AV_PIX_FMT_NONE) {
-+            av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
-+            return AVERROR(EINVAL);
-+        }
-+        desc = av_pix_fmt_desc_get(s->format);
-+        // Filter out the input formats for requested output formats
-+        // The input and output must have the same planar format, either planar or bi-planar packed
-+        for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) {
-+            const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]);
-+            if (tdesc->comp[2].plane == desc->comp[2].plane) {
-+                valid_in_pix_fmts[j] = in_pix_fmts[i];
-+                j++;
-+            }
-+        }
-+        valid_in_pix_fmts[j] = AV_PIX_FMT_NONE;
-+        formats = ff_make_format_list(valid_in_pix_fmts);
-+        res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats);
-+        if (res < 0)
-+            return res;
-+        if (out_format_is_supported(s->format)) {
-+            formats = NULL;
-+            res = ff_add_format(&formats, s->format);
-+            if (res < 0)
-+                return res;
-+        } else {
-+            av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
-+                   av_get_pix_fmt_name(s->format));
-+            return AVERROR(ENOSYS);
-+        }
++    res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng,
++                           irng, idesc->comp[0].depth);
++    if (res < 0) {
++        av_log(s, AV_LOG_ERROR,
++               "Unsupported input color range %d (%s)\n",
++               irng, av_color_range_name(irng));
++        return res;
 +    }
 +
-+    return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats);
++    ff_fill_rgb2yuv_table(coeffs, rgb2yuv); /* input matrix, only needed to derive its inverse */
++    ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
++    ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); /* rgb2yuv is reused: from here on it holds the output matrix */
++
++    ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, /* y_rng / uv_rng still hold the input-range values here */
++                      y_rng, uv_rng, 1);
++
++    res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, /* y_rng / uv_rng are overwritten with the output-range values */
++                           orng, odesc->comp[0].depth);
++    if (res < 0) {
++        av_log(s, AV_LOG_ERROR,
++               "Unsupported output color range %d (%s)\n",
++               orng, av_color_range_name(orng));
++        return res;
++    }
++
++    ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth,
++                      y_rng, uv_rng, 0);
++
++    return 0;
 +}
 +
-+static av_cold int init(AVFilterContext *ctx)
++static int compute_rgb_coeffs(TonemapxContext *s,
++                              enum AVColorPrimaries iprm,
++                              enum AVColorPrimaries oprm)
 +{
-+    TonemapxContext *s = ctx->priv;
-+    enum SIMDVariant active_simd = SIMD_NONE;
-+    av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n",
-+           s->format_str);
++    double rgb2xyz[3][3], xyz2rgb[3][3];
++    const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm);
++    const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm);
 +
-+#if ARCH_AARCH64
-+#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
-+    {
-+        int cpu_flags = av_get_cpu_flags();
-+        if (have_neon(cpu_flags)) {
-+            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon;
-+            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon;
-+            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon;
-+            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon;
-+            active_simd = SIMD_NEON;
-+        }
-+    }
-+#else
-+    av_log(s, AV_LOG_WARNING, "NEON optimization disabled at compile time\n");
-+#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
-+#elif ARCH_X86
-+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
-+    {
-+        int cpu_flags = av_get_cpu_flags();
-+        if (X86_SSE42(cpu_flags)) {
-+            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse;
-+            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse;
-+            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse;
-+            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse;
-+            active_simd = SIMD_SSE;
-+        }
++    if (!iprm_desc) {
++        av_log(s, AV_LOG_ERROR,
++               "Unsupported input color primaries %d (%s)\n",
++               iprm, av_color_primaries_name(iprm));
++        return AVERROR(EINVAL);
 +    }
-+#else
-+    av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n");
-+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
-+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
-+    {
-+        int cpu_flags = av_get_cpu_flags();
-+        if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) {
-+            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx;
-+            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx;
-+            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx;
-+            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx;
-+            active_simd = SIMD_AVX;
-+        }
++    if (!oprm_desc) {
++        av_log(s, AV_LOG_ERROR,
++               "Unsupported output color primaries %d (%s)\n",
++               oprm, av_color_primaries_name(oprm));
++        return AVERROR(EINVAL);
 +    }
-+#else
-+    av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n");
-+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
-+#endif // ARCH_X86/ARCH_AARCH64
 +
-+#if !defined(ENABLE_TONEMAPX_NEON_INTRINSICS) && \
-+    !defined(ENABLE_TONEMAPX_SSE_INTRINSICS) && \
-+    !defined(ENABLE_TONEMAPX_AVX_INTRINSICS)
-+    av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n");
-+#endif
++    ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); /* output primaries first, only to derive the inverse */
++    ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); /* xyz2rgb: XYZ to output RGB */
++    ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); /* rgb2xyz is reused: it now holds input RGB to XYZ */
++    ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); /* presumably yields input RGB to output RGB - confirm the operand order of ff_matrix_mul_3x3 */
 +
-+    if (!s->tonemap_func_biplanar8) {
-+        s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12;
-+    }
++    return 0;
++}
 +
-+    if (!s->tonemap_func_biplanar10) {
-+        s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010;
-+    }
++__attribute__((always_inline))
++static inline void dovi2rgb(int y00, int y01, int y10, int y11, int u, int v, /* four luma samples sharing one u/v pair */
++                            const struct TonemapIntParams *params,
++                            const float in_rng, /* divisor that brings the integer samples to [0, 1] before clamping */
++                            int16_t r[4], int16_t g[4], int16_t b[4]) /* outputs, index order y00, y01, y10, y11 */
++{
++    float yuv1[3], yuv2[3], yuv3[3], yuv4[3];
++    float c1[3], c2[3], c3[3], c4[3];
++
++    yuv1[0] = CLAMP(y00 / in_rng, 0.0f, 1.0f);
++    yuv2[0] = CLAMP(y01 / in_rng, 0.0f, 1.0f);
++    yuv3[0] = CLAMP(y10 / in_rng, 0.0f, 1.0f);
++    yuv4[0] = CLAMP(y11 / in_rng, 0.0f, 1.0f);
++    yuv1[1] = yuv2[1] = yuv3[1] = yuv4[1] = CLAMP(u / in_rng, 0.0f, 1.0f); /* the same chroma value is used for all four pixels */
++    yuv1[2] = yuv2[2] = yuv3[2] = yuv4[2] = CLAMP(v / in_rng, 0.0f, 1.0f);
++
++    reshape_dovi_yuv(yuv1, yuv1, params); /* in place: reshape_dovi_yuv() writes dest only after it has finished reading src */
++    reshape_dovi_yuv(yuv2, yuv2, params);
++    reshape_dovi_yuv(yuv3, yuv3, params);
++    reshape_dovi_yuv(yuv4, yuv4, params);
++
++    ycc2rgb(c1, yuv1[0], yuv1[1], yuv1[2], params->dovi->nonlinear, *params->ycc_offset);
++    ycc2rgb(c2, yuv2[0], yuv2[1], yuv2[2], params->dovi->nonlinear, *params->ycc_offset);
++    ycc2rgb(c3, yuv3[0], yuv3[1], yuv3[2], params->dovi->nonlinear, *params->ycc_offset);
++    ycc2rgb(c4, yuv4[0], yuv4[1], yuv4[2], params->dovi->nonlinear, *params->ycc_offset);
++
++    lms2rgb(c1, c1[0], c1[1], c1[2], params->dovi->linear, *params->lms2rgb_matrix); /* inputs are passed by value, so writing back into c1 is safe */
++    lms2rgb(c2, c2[0], c2[1], c2[2], params->dovi->linear, *params->lms2rgb_matrix);
++    lms2rgb(c3, c3[0], c3[1], c3[2], params->dovi->linear, *params->lms2rgb_matrix);
++    lms2rgb(c4, c4[0], c4[1], c4[2], params->dovi->linear, *params->lms2rgb_matrix);
++
++    r[0] = av_clip_int16(c1[0] * 28672); /* scale by 28672 and clip to the int16 range */
++    r[1] = av_clip_int16(c2[0] * 28672);
++    r[2] = av_clip_int16(c3[0] * 28672);
++    r[3] = av_clip_int16(c4[0] * 28672);
++
++    g[0] = av_clip_int16(c1[1] * 28672);
++    g[1] = av_clip_int16(c2[1] * 28672);
++    g[2] = av_clip_int16(c3[1] * 28672);
++    g[3] = av_clip_int16(c4[1] * 28672);
++
++    b[0] = av_clip_int16(c1[2] * 28672);
++    b[1] = av_clip_int16(c2[2] * 28672);
++    b[2] = av_clip_int16(c3[2] * 28672);
++    b[3] = av_clip_int16(c4[2] * 28672);
++}
 +
-+    if (!s->tonemap_func_planar8) {
-+        s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p;
-+    }
++inline static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, /* tone-maps one RGB triplet through the three LUTs */
++                          int16_t *r_out, int16_t *g_out, int16_t *b_out,
++                          float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
++                          const AVLumaCoefficients *coeffs,
++                          const AVLumaCoefficients *ocoeffs, double desat, /* ocoeffs is not read in this function; desat <= 0 skips desaturation */
++                          double (*rgb2rgb)[3][3],
++                          int rgb2rgb_passthrough) /* non-zero skips the rgb2rgb matrix */
++{
++    int16_t sig;
++    float mapval, r_lin, g_lin, b_lin;
 +
-+    if (!s->tonemap_func_planar10) {
-+        s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10;
-+    }
++    /* load values */
++    *r_out = r_in;
++    *g_out = g_in;
++    *b_out = b_in;
 +
-+    switch (active_simd) {
-+        case SIMD_NEON:
-+            av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n");
-+            break;
-+        case SIMD_SSE:
-+            av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n");
-+            break;
-+        case SIMD_AVX:
-+            av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n");
-+            break;
-+        default:
-+        case SIMD_NONE:
-+            av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n");
-+            break;
++    /* pick the brightest component, reducing the value range as necessary
++     * to keep the entire signal in range and preventing discoloration due to
++     * out-of-bounds clipping */
++    sig = FFMAX3(r_in, g_in, b_in);
++    mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; /* LUT index: value offset by 2048, clipped to 15 bits */
++
++    r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)];
++    g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)];
++    b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)];
++
++    if (!rgb2rgb_passthrough) {
++        const float r_src = r_lin, g_src = g_lin, b_src = b_lin; /* snapshot: every matrix row must read the unconverted channels */
++        r_lin = (*rgb2rgb)[0][0] * r_src + (*rgb2rgb)[0][1] * g_src + (*rgb2rgb)[0][2] * b_src;
++        g_lin = (*rgb2rgb)[1][0] * r_src + (*rgb2rgb)[1][1] * g_src + (*rgb2rgb)[1][2] * b_src;
++        b_lin = (*rgb2rgb)[2][0] * r_src + (*rgb2rgb)[2][1] * g_src + (*rgb2rgb)[2][2] * b_src;
 +    }
 +
-+    switch (s->tonemap) {
-+        case TONEMAP_GAMMA:
-+            if (isnan(s->param))
-+                s->param = 1.8f;
-+            break;
-+        case TONEMAP_REINHARD:
-+            if (!isnan(s->param))
-+                s->param = (1.0f - s->param) / s->param;
-+            break;
-+        case TONEMAP_MOBIUS:
-+            if (isnan(s->param))
-+                s->param = 0.3f;
-+            break;
++#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a)
++    /* desaturate to prevent unnatural colors */
++    if (desat > 0) {
++        float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin;
++        float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); /* blend weight towards luma; both terms are floored at FLOAT_EPS */
++        r_lin = MIX(r_lin, luma, overbright);
++        g_lin = MIX(g_lin, luma, overbright);
++        b_lin = MIX(b_lin, luma, overbright);
 +    }
 +
-+    if (isnan(s->param))
-+        s->param = 1.0f;
++    r_lin *= mapval; /* one gain, looked up from the brightest input channel, scales all three */
++    g_lin *= mapval;
++    b_lin *= mapval;
++#undef MIX
 +
-+    return 0;
++    *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; /* round to the nearest of the 32768 delin_lut slots */
++    *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)];
++    *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)];
 +}
 +
-+#define OFFSET(x) offsetof(TonemapxContext, x)
-+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
-+static const AVOption tonemapx_options[] = {
-+    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, .unit = "tonemap" },
-+    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, .unit = "tonemap" },
-+    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, .unit = "tonemap" },
-+    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, .unit = "tonemap" },
-+    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, .unit = "tonemap" },
-+    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, .unit = "tonemap" },
-+    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, .unit = "tonemap" },
-+    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, .unit = "tonemap" },
-+    {     "bt2390",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390},            0, 0, FLAGS, .unit = "tonemap" },
-+    { "transfer",     "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" },
-+    { "t",            "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" },
-+    {     "bt709",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},           0, 0, FLAGS, .unit = "transfer" },
-+    {     "bt2020",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},       0, 0, FLAGS, .unit = "transfer" },
-+    { "matrix",       "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" },
-+    { "m",            "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" },
-+    {     "bt709",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},           0, 0, FLAGS, .unit = "matrix" },
-+    {     "bt2020",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},      0, 0, FLAGS, .unit = "matrix" },
++// See also libavfilter/colorspacedsp_template.c
++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, /* 8-bit output: luma plane plus interleaved chroma plane */
++                                    const uint16_t *srcy, const uint16_t *srcuv, /* 16-bit input words: luma plane plus interleaved chroma plane */
++                                    const int *dstlinesize, const int *srclinesize,
++                                    int dstdepth, int srcdepth,
++                                    int width, int height,
++                                    const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++    const int in_sh2 = 16 - in_depth; /* every source word is shifted right by this amount before use */
++
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; /* one coefficient used both as the B weight of U and the R weight of V */
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2, /* two luma rows per pass; a trailing odd row is not processed */
++                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { /* srcy is uint16_t*, so adding srclinesize[0] skips two rows (srclinesize[0] / 2 elements is used as one row below) */
++        for (int x = 0; x < width; x += 2) { /* one 2x2 luma quad and its u/v pair per step; reads x + 1, so an even width is assumed */
++            int y00 = (srcy[x]                          >> in_sh2) - params->in_yuv_off;
++            int y01 = (srcy[x + 1]                      >> in_sh2) - params->in_yuv_off;
++            int y10 = (srcy[srclinesize[0] / 2 + x]     >> in_sh2) - params->in_yuv_off;
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off;
++            int u = (srcuv[x]     >> in_sh2) - in_uv_offset;
++            int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset;
++
++            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
++            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
++            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
++            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++
++            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++
++            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
++            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
++            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
++            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], /* in place: inputs are passed by value, outputs overwrite the same slots */
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0]; /* widen to int for the output multiplies below */
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++
++            dsty[x]                      = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh));
++            dsty[x + 1]                  = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x]     = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh));
++
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstuv[x]     = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); /* chroma comes from the rounded mean of the quad */
++            dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh));
++#undef AVG
++        }
++    }
++}
++/* Scalar path: 16-bit planar source run through dovi2rgb() and tonemap_int16(), written as 8-bit planar Y/U/V, one 2x2 block per step. */
++void tonemap_frame_dovi_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                               const int *dstlinesize, const int *srclinesize,
++                               int dstdepth, int srcdepth,
++                               int width, int height,
++                               const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    /* rgb2yuv matrix entries; only element [0] of the innermost dimension is read by this scalar path. */
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++    /* Largest code value at the source bit depth, as a float; handed to dovi2rgb(). */
++    const float in_rng = (float)((1 << in_depth) - 1);
++    /* Each outer pass consumes two source rows: row 1 sits srclinesize[0] / 2 elements after row 0, and srcy advances by srclinesize[0]. */
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int x = 0; x < width; x += 2) {
++            int y00 = (srcy[x]                         );
++            int y01 = (srcy[x + 1]                     );
++            int y10 = (srcy[srclinesize[0] / 2 + x]    );
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1]);
++            int u = (srcu[x >> 1]);
++            int v = (srcv[x >> 1]);
++            /* dovi2rgb() is defined elsewhere; r/g/b are not initialized here, so it is expected to fill all four entries. */
++            dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b);
++            /* Tone-map each pixel in place: r/g/b[i] are passed both as inputs and as output pointers. */
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0];
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++            /* Luma is written per pixel: row 0 at dsty[x], row 1 dstlinesize[0] bytes further on. */
++            dsty[x]                      = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh));
++            dsty[x + 1]                  = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x]     = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh));
++            /* Chroma: one U and one V sample per 2x2 block, derived from the rounded mean of the four RGB results. */
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh));
++            dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh));
++#undef AVG
++        }
++    }
++}
++/* Scalar path: 16-bit planar source run through dovi2rgb() and tonemap_int16(), written as 16-bit planar Y/U/V, one 2x2 block per step. */
++void tonemap_frame_dovi_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                 const int *dstlinesize, const int *srclinesize,
++                                 int dstdepth, int srcdepth,
++                                 int width, int height,
++                                 const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    /* rgb2yuv matrix entries; only element [0] of the innermost dimension is read by this scalar path. */
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++    /* Largest code value at the source bit depth, as a float; handed to dovi2rgb(). */
++    const float in_rng = (float)((1 << in_depth) - 1);
++    /* Two rows per outer pass. Each chroma pointer advances by its own plane's stride: index 1 for U, index 2 for V. */
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[2] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int x = 0; x < width; x += 2) {
++            int y00 = (srcy[x]                         );
++            int y01 = (srcy[x + 1]                     );
++            int y10 = (srcy[srclinesize[0] / 2 + x]    );
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1]);
++            int u = (srcu[x >> 1]);
++            int v = (srcv[x >> 1]);
++            /* dovi2rgb() is defined elsewhere; r/g/b are not initialized here, so it is expected to fill all four entries. */
++            dovi2rgb(y00, y01, y10, y11, u, v, params, in_rng, r, g, b);
++            /* Tone-map each pixel in place: r/g/b[i] are passed both as inputs and as output pointers. */
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0];
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++            /* NOTE(review): results are clipped to 16 bits, not to out_depth - confirm that is intended for this planar output. */
++            dsty[x]                          = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16);
++            dsty[x + 1]                      = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16);
++            dsty[dstlinesize[0] / 2 + x]     = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16);
++            dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16);
++            /* Chroma: one U and one V sample per 2x2 block, derived from the rounded mean of the four RGB results. */
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16);
++            dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16);
++#undef AVG
++        }
++    }
++}
++/* Scalar path: 16-bit planar 4:2:0 source to 8-bit planar 4:2:0, via fixed-point YUV->RGB, tonemap_int16() and RGB->YUV, per 2x2 block. */
++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                 const int *dstlinesize, const int *srclinesize,
++                                 int dstdepth, int srcdepth,
++                                 int width, int height,
++                                 const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++    /* Output-side counterparts: chroma bias, post-multiply right shift and its rounding term. */
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    /* yuv2rgb entries used below: Y gain, V->R, U->G, V->G and U->B. */
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++    /* Each outer pass consumes two source rows: row 1 sits srclinesize[0] / 2 elements after row 0, and srcy advances by srclinesize[0]. */
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int x = 0; x < width; x += 2) {
++            int y00 = (srcy[x]                         ) - params->in_yuv_off;
++            int y01 = (srcy[x + 1]                     ) - params->in_yuv_off;
++            int y10 = (srcy[srclinesize[0] / 2 + x]    ) - params->in_yuv_off;
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off;
++            int u = (srcu[x >> 1]) - in_uv_offset;
++            int v = (srcv[x >> 1]) - in_uv_offset;
++            /* YUV -> RGB in fixed point: add in_rnd, shift right by in_sh, saturate to int16_t. The four pixels share one u/v pair. */
++            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
++            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
++            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
++            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++
++            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++
++            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
++            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
++            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
++            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++            /* Tone-map each pixel in place: r/g/b[i] are passed both as inputs and as output pointers. */
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0];
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++            /* Luma is written per pixel: row 0 at dsty[x], row 1 dstlinesize[0] bytes further on. */
++            dsty[x]                      = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh));
++            dsty[x + 1]                  = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x]     = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh));
++            dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh));
++            /* Chroma: one U and one V sample per 2x2 block, derived from the rounded mean of the four RGB results. */
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh));
++            dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh));
++#undef AVG
++        }
++    }
++}
++/* Scalar path: 16-bit planar 4:2:0 source to 16-bit planar 4:2:0, via fixed-point YUV->RGB, tonemap_int16() and RGB->YUV, per 2x2 block. */
++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                   const int *dstlinesize, const int *srclinesize,
++                                   int dstdepth, int srcdepth,
++                                   int width, int height,
++                                   const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++    /* Output-side counterparts: chroma bias, post-multiply right shift and its rounding term. */
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    /* yuv2rgb entries used below: Y gain, V->R, U->G, V->G and U->B. */
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++    /* Two rows per outer pass. Each chroma pointer advances by its own plane's stride: index 1 for U, index 2 for V. */
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[2] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int x = 0; x < width; x += 2) {
++            int y00 = (srcy[x]                         ) - params->in_yuv_off;
++            int y01 = (srcy[x + 1]                     ) - params->in_yuv_off;
++            int y10 = (srcy[srclinesize[0] / 2 + x]    ) - params->in_yuv_off;
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off;
++            int u = (srcu[x >> 1]) - in_uv_offset;
++            int v = (srcv[x >> 1]) - in_uv_offset;
++            /* YUV -> RGB in fixed point: add in_rnd, shift right by in_sh, saturate to int16_t. The four pixels share one u/v pair. */
++            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
++            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
++            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
++            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++
++            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++
++            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
++            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
++            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
++            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++            /* Tone-map each pixel in place: r/g/b[i] are passed both as inputs and as output pointers. */
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0];
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++            /* NOTE(review): results are clipped to 16 bits, not to out_depth - confirm that is intended for this planar output. */
++            dsty[x]                          = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16);
++            dsty[x + 1]                      = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16);
++            dsty[dstlinesize[0] / 2 + x]     = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16);
++            dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16);
++            /* Chroma: one U and one V sample per 2x2 block, derived from the rounded mean of the four RGB results. */
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16);
++            dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16);
++#undef AVG
++        }
++    }
++}
++/* Scalar path: 16-bit luma plane plus interleaved U/V plane in, same layout out; YUV->RGB, tonemap_int16(), RGB->YUV per 2x2 block. */
++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv,
++                                         const uint16_t *srcy, const uint16_t *srcuv,
++                                         const int *dstlinesize, const int *srclinesize,
++                                         int dstdepth, int srcdepth,
++                                         int width, int height,
++                                         const struct TonemapIntParams *params)
++{
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++    const int in_sh2 = 16 - in_depth;
++    /* in_sh2 / out_sh2: samples are shifted right by (16 - depth) when read and left by (16 - depth) when written. */
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++    const int out_sh2 = 16 - out_depth;
++    /* yuv2rgb entries used below: Y gain, V->R, U->G, V->G and U->B. */
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int r00, g00, b00;
++    int r01, g01, b01;
++    int r10, g10, b10;
++    int r11, g11, b11;
++    /* Each outer pass consumes two luma rows and one row of the interleaved chroma plane. */
++    int16_t r[4], g[4], b[4];
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
++        for (int x = 0; x < width; x += 2) {
++            int y00 = (srcy[x]                          >> in_sh2) - params->in_yuv_off;
++            int y01 = (srcy[x + 1]                      >> in_sh2) - params->in_yuv_off;
++            int y10 = (srcy[srclinesize[0] / 2 + x]     >> in_sh2) - params->in_yuv_off;
++            int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off;
++            int u = (srcuv[x]     >> in_sh2) - in_uv_offset;
++            int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset;
++            /* YUV -> RGB in fixed point: add in_rnd, shift right by in_sh, saturate to int16_t. The four pixels share one u/v pair. */
++            r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh);
++            r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh);
++            r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh);
++            r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh);
++
++            g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++
++            b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh);
++            b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh);
++            b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh);
++            b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh);
++            /* Tone-map each pixel in place: r/g/b[i] are passed both as inputs and as output pointers. */
++            tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++            tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3],
++                          params->lin_lut, params->tonemap_lut, params->delin_lut,
++                          params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough);
++
++            r00 = r[0], g00 = g[0], b00 = b[0];
++            r01 = r[1], g01 = g[1], b01 = b[1];
++            r10 = r[2], g10 = g[2], b10 = b[2];
++            r11 = r[3], g11 = g[3], b11 = b[3];
++            /* Luma: the out_depth-scale value is shifted left by out_sh2 and then clipped to 16 bits. */
++            dsty[x]                          = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
++            dsty[x + 1]                      = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
++            dsty[dstlinesize[0] / 2 + x]     = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
++            dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16);
++            /* Chroma: U goes to dstuv[x] and V to dstuv[x + 1], both from the rounded mean of the four RGB results. */
++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
++            dstuv[x]     = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16);
++            dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16);
++#undef AVG
++        }
++    }
++}
++/* Declares the per-job locals s, td, in, out, desc, odesc, ss, slice_start, slice_end and params; needs ctx, arg, jobnr and nb_jobs in scope. */
++#define LOAD_TONEMAP_PARAMS     TonemapxContext *s = ctx->priv; \
++ThreadData *td = arg;                                           \
++AVFrame *in = td->in;                                           \
++AVFrame *out = td->out;                                         \
++const AVPixFmtDescriptor *desc  = td->desc;                     \
++const AVPixFmtDescriptor *odesc = td->odesc;                    \
++const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \
++const int slice_start = (in->height / ss *  jobnr     ) / nb_jobs * ss; \
++const int slice_end   = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \
++TonemapIntParams params = {                                     \
++.lut_peak            = s->lut_peak,                             \
++.lin_lut             = s->lin_lut,                              \
++.tonemap_lut         = s->tonemap_lut,                          \
++.delin_lut           = s->delin_lut,                            \
++.in_yuv_off          = s->in_yuv_off,                           \
++.out_yuv_off         = s->out_yuv_off,                          \
++.yuv2rgb_coeffs      = &s->yuv2rgb_coeffs,                      \
++.rgb2yuv_coeffs      = &s->rgb2yuv_coeffs,                      \
++.rgb2rgb_coeffs      = &s->rgb2rgb_coeffs,                      \
++.rgb2rgb_passthrough = in->color_primaries == out->color_primaries,   \
++.coeffs              = s->coeffs,                               \
++.ocoeffs             = s->ocoeffs,                              \
++.desat               = s->desat,                                \
++.dovi = s->dovi,                                                \
++.dovi_pbuf = s->dovi_pbuf,                                      \
++.lms2rgb_matrix = &s->lms2rgb_matrix,                            \
++.ycc_offset = &s->ycc_offset                                     \
++};
++/* Per-job worker: runs s->tonemap_func_planar8 on rows [slice_start, slice_end) of the frame pair carried in arg. Always returns 0. */
++static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++{
++    LOAD_TONEMAP_PARAMS
++    av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    /* Chroma row offsets: output planes use the output descriptor's log2_chroma_h, input planes use the input descriptor's. */
++    s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start,
++                            out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h),
++                            out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h),
++                            (void*)(in->data[0] + in->linesize[0] * slice_start),
++                            (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                            (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                            out->linesize, in->linesize,
++                            odesc->comp[0].depth, desc->comp[0].depth,
++                            out->width, slice_end - slice_start,
++                            &params);
++
++    return 0;
++}
++/* Per-job worker: runs s->tonemap_func_biplanar8 on rows [slice_start, slice_end) of the frame pair carried in arg. Always returns 0. */
++static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++{
++    LOAD_TONEMAP_PARAMS
++    av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    /* Chroma row offsets: the output plane uses the output descriptor's log2_chroma_h, the input plane uses the input descriptor's. */
++    s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start,
++                              out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h),
++                              (void*)(in->data[0] + in->linesize[0] * slice_start),
++                              (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                              out->linesize, in->linesize,
++                              odesc->comp[0].depth, desc->comp[0].depth,
++                              out->width, slice_end - slice_start,
++                              &params);
++
++    return 0;
++}
++/* Per-job worker: runs s->tonemap_func_planar10 on rows [slice_start, slice_end) of the frame pair carried in arg. Always returns 0. */
++static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++{
++    LOAD_TONEMAP_PARAMS
++    av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    /* Chroma row offsets: output planes use the output descriptor's log2_chroma_h, input planes use the input descriptor's. */
++    s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start),
++                             (uint16_t *) (out->data[1] +
++                                           out->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
++                             (uint16_t *) (out->data[2] +
++                                           out->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
++                             (void*)(in->data[0] + in->linesize[0] * slice_start),
++                             (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                             (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                             out->linesize, in->linesize,
++                             odesc->comp[0].depth, desc->comp[0].depth,
++                             out->width, slice_end - slice_start,
++                             &params);
++
++    return 0;
++}
++
++static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
++{
++    LOAD_TONEMAP_PARAMS
++    av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth);
++    /* Slice worker: chroma row offsets use each frame's own descriptor (odesc = output, desc = input). */
++    s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start),
++                               (uint16_t *) (out->data[1] +
++                                             out->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)),
++                               (void*)(in->data[0] + in->linesize[0] * slice_start),
++                               (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)),
++                               out->linesize, in->linesize,
++                               odesc->comp[0].depth, desc->comp[0].depth,
++                               out->width, slice_end - slice_start,
++                               &params);
++
++    return 0;
++}
++
++/* Per-frame entry point: picks the slice worker, refreshes LUTs/coefficients/Dolby Vision state when needed, then tonemaps `in` into a new frame. */
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++    AVFilterContext *ctx = link->dst;
++    TonemapxContext *s = ctx->priv;
++    AVFilterLink *outlink = ctx->outputs[0];
++    AVFrame *out;
++    const AVPixFmtDescriptor *desc, *odesc;
++    int ret;
++    double peak = s->peak;
++    const AVLumaCoefficients *coeffs;
++    ThreadData td;
++
++    desc = av_pix_fmt_desc_get(link->format);
++    odesc = av_pix_fmt_desc_get(outlink->format);
++    if (!desc || !odesc) {
++        av_frame_free(&in);
++        return AVERROR_BUG;
++    }
++
++    switch (odesc->comp[2].plane) {
++        case 1: // biplanar
++            if (odesc->comp[0].depth == 8) {
++                s->filter_slice = filter_slice_biplanar8;
++            } else {
++                s->filter_slice = filter_slice_biplanar10;
++            }
++            break;
++        default:
++        case 2: // planar
++            if (odesc->comp[0].depth == 8) {
++                s->filter_slice = filter_slice_planar8;
++            } else {
++                s->filter_slice = filter_slice_planar10;
++            }
++            break;
++    }
++
++    out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
++    if (!out) {
++        av_frame_free(&in);
++        return AVERROR(ENOMEM);
++    }
++
++    if ((ret = av_frame_copy_props(out, in)) < 0)
++        goto fail;
++
++    /* read peak from side data if not passed in */
++    if (!peak) {
++        peak = ff_determine_signal_peak(in);
++        av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak);
++    }
++
++    out->color_trc = s->trc;
++    out->colorspace = s->spc;
++    out->color_primaries = s->pri;
++    if (s->range != -1) out->color_range = s->range;
++
++    if (in->color_trc == AVCOL_TRC_UNSPECIFIED)
++        in->color_trc = AVCOL_TRC_SMPTE2084;
++    if (out->color_trc == AVCOL_TRC_UNSPECIFIED)
++        out->color_trc = AVCOL_TRC_BT709;
++
++    if (in->colorspace == AVCOL_SPC_UNSPECIFIED)
++        in->colorspace = AVCOL_SPC_BT2020_NCL;
++    if (out->colorspace == AVCOL_SPC_UNSPECIFIED)
++        out->colorspace = AVCOL_SPC_BT709;
++
++    if (in->color_primaries == AVCOL_PRI_UNSPECIFIED)
++        in->color_primaries = AVCOL_PRI_BT2020;
++    if (out->color_primaries == AVCOL_PRI_UNSPECIFIED)
++        out->color_primaries = AVCOL_PRI_BT709;
++
++    if (in->color_range == AVCOL_RANGE_UNSPECIFIED)
++        in->color_range = AVCOL_RANGE_MPEG;
++    if (out->color_range == AVCOL_RANGE_UNSPECIFIED)
++        out->color_range = AVCOL_RANGE_MPEG;
++
++    if (!s->lin_lut || !s->delin_lut) {
++        if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0)
++            goto fail;
++    }
++
++    if (!s->tonemap_lut || s->lut_peak != peak) {
++        s->lut_peak = peak;
++        if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0)
++            goto fail;
++    }
++
++    coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace);
++    if (s->coeffs != coeffs) {
++        s->coeffs = coeffs;
++        s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace);
++        if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc,
++             in->color_range, out->color_range)) < 0)
++            goto fail;
++        if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0)
++            goto fail;
++    }
++
++    if (s->apply_dovi) {
++        AVFrameSideData *dovi_sd = av_frame_get_side_data(in, AV_FRAME_DATA_DOVI_METADATA);
++        if (dovi_sd) {
++            const AVDOVIMetadata *metadata = (AVDOVIMetadata *) dovi_sd->data;
++            const AVDOVIRpuDataHeader *rpu = av_dovi_get_header(metadata);
++            // only map dovi rpus that don't require an EL and have rpu profile == 0:
++            // for performance reasons reshaping is only done when absolutely needed (e.g. profile 5 videos)
++            // this could be wrong as there is no public documentation on vdr_rpu_profile
++            if (rpu->disable_residual_flag && rpu->vdr_rpu_profile == 0) {
++                av_freep(&s->dovi); // release the previous frame's mapping instead of leaking it
++                if (!(s->dovi = av_malloc(sizeof(*s->dovi)))) {
++                    ret = AVERROR(ENOMEM); // ret still holds a success code at this point
++                    goto fail;
++                }
++
++                ff_map_dovi_metadata(s->dovi, metadata);
++            }
++        }
++
++        if (s->dovi) {
++            if (desc->comp[2].plane == 1) {
++                av_log(s, AV_LOG_ERROR, "Input pixel format has to be yuv420p10 for Dolby Vision reshaping\n");
++                av_assert0(0);
++            }
++            update_dovi_buf(ctx);
++            ff_matrix_mul_3x3(s->lms2rgb_matrix, dovi_lms2rgb_matrix, s->dovi->linear);
++            s->ycc_offset[0] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[0][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[0][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[0][2];
++            s->ycc_offset[1] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[1][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[1][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[1][2];
++            s->ycc_offset[2] = s->dovi->nonlinear_offset[0] * (float)s->dovi->nonlinear[2][0] + s->dovi->nonlinear_offset[1] * (float)s->dovi->nonlinear[2][1] + s->dovi->nonlinear_offset[2] * (float)s->dovi->nonlinear[2][2];
++            s->tonemap_func_planar8 = s->tonemap_func_dovi8;
++            s->tonemap_func_planar10 = s->tonemap_func_dovi10;
++        }
++    }
++
++    /* do the tonemap */
++    td.in    = in;
++    td.out   = out;
++    td.desc  = desc;
++    td.odesc = odesc;
++    td.peak  = peak;
++    ff_filter_execute(ctx, s->filter_slice, &td, NULL,
++                      FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx)));
++
++    av_frame_free(&in);
++
++    av_frame_remove_side_data(out, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA);
++    av_frame_remove_side_data(out, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL);
++    av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_RPU_BUFFER);
++    av_frame_remove_side_data(out, AV_FRAME_DATA_DOVI_METADATA);
++
++    return ff_filter_frame(outlink, out);
++fail:
++    av_frame_free(&in);
++    av_frame_free(&out);
++    return ret;
++}
++
++/* Filter teardown: releases the LUTs and the Dolby Vision mapping held by the context. */
++static void uninit(AVFilterContext *ctx)
++{
++    TonemapxContext *s = ctx->priv;
++
++    /* av_freep() accepts a pointer to NULL, so none of these need a guard. */
++    av_freep(&s->lin_lut);
++    av_freep(&s->delin_lut);
++    av_freep(&s->tonemap_lut);
++    av_freep(&s->dovi);
++}
++
++/*
++ * Negotiate pixel formats. With format "same" the input and output share one
++ * list built from in_pix_fmts; otherwise the input list is narrowed to the
++ * entries whose plane layout matches the requested output format, which is
++ * then offered as the only output format.
++ */
++static int query_formats(AVFilterContext *ctx)
++{
++    enum AVPixelFormat valid_in_pix_fmts[4];
++    AVFilterFormats *formats;
++    const AVPixFmtDescriptor *desc;
++    TonemapxContext *s = ctx->priv;
++
++    if (!strcmp(s->format_str, "same")) {
++        int res;
++        formats = ff_make_format_list(in_pix_fmts);
++        res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats);
++        if (res < 0)
++            return res;
++        s->format = AV_PIX_FMT_NONE;
++    } else {
++        int i, j = 0;
++        int res;
++        // The input link is referenced exactly once, further down, with the
++        // narrowed list: referencing the full list first would be superseded
++        // by that second reference and leave the first list without an owner.
++        s->format = av_get_pix_fmt(s->format_str);
++        if (s->format == AV_PIX_FMT_NONE) {
++            av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
++            return AVERROR(EINVAL);
++        }
++        desc = av_pix_fmt_desc_get(s->format);
++        // Filter out the input formats for requested output formats
++        // The input and output must have the same planar format, either planar or bi-planar packed
++        for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) {
++            const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]);
++            if (tdesc->comp[2].plane == desc->comp[2].plane) {
++                valid_in_pix_fmts[j] = in_pix_fmts[i];
++                j++;
++            }
++        }
++        valid_in_pix_fmts[j] = AV_PIX_FMT_NONE;
++        formats = ff_make_format_list(valid_in_pix_fmts);
++        res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats);
++        if (res < 0)
++            return res;
++        if (out_format_is_supported(s->format)) {
++            formats = NULL;
++            res = ff_add_format(&formats, s->format);
++            if (res < 0)
++                return res;
++        } else {
++            av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n",
++                   av_get_pix_fmt_name(s->format));
++            return AVERROR(ENOSYS);
++        }
++    }
++
++    return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats);
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++    TonemapxContext *s = ctx->priv;
++    enum SIMDVariant active_simd = SIMD_NONE;
++    av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n",
++           s->format_str);
++    /* Install the best kernels that both the build and the running CPU support; slots left unset are filled further down. */
++#if ARCH_AARCH64
++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS
++    {
++        int cpu_flags = av_get_cpu_flags();
++        if (have_neon(cpu_flags)) {
++            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon;
++            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon;
++            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon;
++            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon;
++            s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_neon;
++            s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_neon;
++            active_simd = SIMD_NEON;
++        }
++    }
++#else
++    av_log(s, AV_LOG_WARNING, "NEON optimization disabled at compile time\n");
++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS
++#elif ARCH_X86
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++    {
++        int cpu_flags = av_get_cpu_flags();
++        if (X86_SSE42(cpu_flags)) {
++            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse;
++            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse;
++            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse;
++            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse;
++            s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_sse;
++            s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_sse;
++            active_simd = SIMD_SSE;
++        }
++    }
++#else
++    av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n");
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++    {
++        int cpu_flags = av_get_cpu_flags();
++        if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { // checked after SSE4.2, so these kernels replace the SSE ones
++            s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx;
++            s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx;
++            s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx;
++            s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx;
++            s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p_avx;
++            s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10_avx;
++            active_simd = SIMD_AVX;
++        }
++    }
++#else
++    av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n");
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++#endif // ARCH_X86/ARCH_AARCH64
++
++#if !defined(ENABLE_TONEMAPX_NEON_INTRINSICS) && \
++    !defined(ENABLE_TONEMAPX_SSE_INTRINSICS) && \
++    !defined(ENABLE_TONEMAPX_AVX_INTRINSICS)
++    av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n");
++#endif
++    /* Any kernel slot still unset falls back to the implementation without a SIMD suffix. */
++    if (!s->tonemap_func_biplanar8) {
++        s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12;
++    }
++
++    if (!s->tonemap_func_biplanar10) {
++        s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010;
++    }
++
++    if (!s->tonemap_func_planar8) {
++        s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p;
++    }
++
++    if (!s->tonemap_func_planar10) {
++        s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10;
++    }
++
++    if (!s->tonemap_func_dovi8) {
++        s->tonemap_func_dovi8 = tonemap_frame_dovi_2_420p;
++    }
++
++    if (!s->tonemap_func_dovi10) {
++        s->tonemap_func_dovi10 = tonemap_frame_dovi_2_420p10;
++    }
++
++    switch (active_simd) {
++        case SIMD_NEON:
++            av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n");
++            break;
++        case SIMD_SSE:
++            av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n");
++            break;
++        case SIMD_AVX:
++            av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n");
++            break;
++        default:
++        case SIMD_NONE:
++            av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n");
++            break;
++    }
++    /* Per-algorithm handling of "param": gamma and mobius substitute a default for NAN, reinhard remaps a non-NAN value. */
++    switch (s->tonemap) {
++        case TONEMAP_GAMMA:
++            if (isnan(s->param))
++                s->param = 1.8f;
++            break;
++        case TONEMAP_REINHARD:
++            if (!isnan(s->param))
++                s->param = (1.0f - s->param) / s->param;
++            break;
++        case TONEMAP_MOBIUS:
++            if (isnan(s->param))
++                s->param = 0.3f;
++            break;
++    }
++
++    if (isnan(s->param)) // every remaining case, including reinhard left at NAN
++        s->param = 1.0f;
++
++    return 0;
++}
++
++#define OFFSET(x) offsetof(TonemapxContext, x)
++#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
++static const AVOption tonemapx_options[] = {
++    { "tonemap",      "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, .unit = "tonemap" },
++    {     "none",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE},              0, 0, FLAGS, .unit = "tonemap" },
++    {     "linear",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR},            0, 0, FLAGS, .unit = "tonemap" },
++    {     "gamma",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA},             0, 0, FLAGS, .unit = "tonemap" },
++    {     "clip",     0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP},              0, 0, FLAGS, .unit = "tonemap" },
++    {     "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD},          0, 0, FLAGS, .unit = "tonemap" },
++    {     "hable",    0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE},             0, 0, FLAGS, .unit = "tonemap" },
++    {     "mobius",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS},            0, 0, FLAGS, .unit = "tonemap" },
++    {     "bt2390",   0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390},            0, 0, FLAGS, .unit = "tonemap" },
++    { "transfer",     "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" },
++    { "t",            "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, .unit = "transfer" },
++    {     "bt709",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709},           0, 0, FLAGS, .unit = "transfer" },
++    {     "bt2020",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10},       0, 0, FLAGS, .unit = "transfer" },
++    { "matrix",       "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" },
++    { "m",            "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, .unit = "matrix" },
++    {     "bt709",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709},           0, 0, FLAGS, .unit = "matrix" },
++    {     "bt2020",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL},      0, 0, FLAGS, .unit = "matrix" },
 +    { "primaries",    "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, .unit = "primaries" },
 +    { "p",            "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, .unit = "primaries" },
 +    {     "bt709",    0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709},           0, 0, FLAGS, .unit = "primaries" },
 +    {     "bt2020",   0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020},          0, 0, FLAGS, .unit = "primaries" },
-+    { "range",        "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, .unit = "range" },
-+    { "r",            "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, .unit = "range" },
++    { "range",        "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, .unit = "range" },
++    { "r",            "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = -1}, -1, INT_MAX, FLAGS, .unit = "range" },
 +    {     "tv",       0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},          0, 0, FLAGS, .unit = "range" },
 +    {     "pc",       0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG},          0, 0, FLAGS, .unit = "range" },
 +    {     "limited",  0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG},          0, 0, FLAGS, .unit = "range" },
@@ -2714,331 +4164,1865 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c
 +    { "param",        "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS },
 +    { "desat",        "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
 +    { "peak",         "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS },
++    { "apply_dovi",  "Apply Dolby Vision metadata if possible", OFFSET(apply_dovi), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
 +    { NULL }
 +};
 +
-+AVFILTER_DEFINE_CLASS(tonemapx);
++AVFILTER_DEFINE_CLASS(tonemapx);
++
++static const AVFilterPad tonemapx_inputs[] = {
++    {
++        .name         = "default",
++        .type         = AVMEDIA_TYPE_VIDEO,
++        .filter_frame = filter_frame, // per-frame callback of the single video input pad
++    },
++};
++
++AVFilter ff_vf_tonemapx = {
++    .name            = "tonemapx",
++    .description     = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"),
++    .init            = init,
++    .uninit          = uninit,
++    .priv_size       = sizeof(TonemapxContext),
++    .priv_class      = &tonemapx_class,
++    FILTER_INPUTS(tonemapx_inputs),
++    FILTER_OUTPUTS(ff_video_default_filterpad),
++    FILTER_QUERY_FUNC(query_formats),
++    .flags           = AVFILTER_FLAG_SLICE_THREADS, // filter_frame runs its slice workers through ff_filter_execute()
++};
+Index: FFmpeg/libavfilter/vf_tonemapx.h
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/vf_tonemapx.h
+@@ -0,0 +1,126 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVFILTER_TONEMAPX_H
++#define AVFILTER_TONEMAPX_H
++
++#include "config.h"
++#include "colorspace.h"
++
++#define X86_64_V2 __attribute__((target("sse4.2")))
++#define X86_64_V3 __attribute__((target("avx2,fma")))
++
++#if defined(__GNUC__) || defined(__clang__)
++#    if (__GNUC__ >= 9) || (__clang_major__ >= 11)
++#        define CC_SUPPORTS_TONEMAPX_INTRINSICS
++#    endif // (__GNUC__ >= 9) || (__clang_major__ >= 11)
++#endif // defined(__GNUC__) || defined(__clang__)
++
++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS
++#    if ARCH_AARCH64
++#        if HAVE_INTRINSICS_NEON
++#            define ENABLE_TONEMAPX_NEON_INTRINSICS
++#        endif
++#    endif // ARCH_AARCH64
++#    if ARCH_X86
++#        if HAVE_INTRINSICS_SSE42
++#           define ENABLE_TONEMAPX_SSE_INTRINSICS
++#        endif
++#        if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3
++#            define ENABLE_TONEMAPX_AVX_INTRINSICS
++#        endif
++#    endif // ARCH_X86
++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS
++
++#define params_cnt 8 /* element counts, in floats; expansions are parenthesized so they stay intact inside larger expressions */
++#define pivots_cnt (7+1)
++#define coeffs_cnt (8*4)
++#define mmr_cnt (8*6*4)
++#define params_sz (params_cnt*sizeof(float)) /* matching sizes in bytes */
++#define pivots_sz (pivots_cnt*sizeof(float))
++#define coeffs_sz (coeffs_cnt*sizeof(float))
++#define mmr_sz (mmr_cnt*sizeof(float))
++
++typedef struct TonemapIntParams {
++    double lut_peak;
++    float *lin_lut;
++    float *tonemap_lut;
++    uint16_t *delin_lut;
++    int in_yuv_off, out_yuv_off;
++    int16_t (*yuv2rgb_coeffs)[3][3][8];
++    int16_t (*rgb2yuv_coeffs)[3][3][8];
++    double  (*rgb2rgb_coeffs)[3][3];
++    int rgb2rgb_passthrough;
++    const AVLumaCoefficients *coeffs, *ocoeffs;
++    double desat;
++    struct DoviMetadata *dovi;
++    float *dovi_pbuf;
++    double (*lms2rgb_matrix)[3][3];
++    float (*ycc_offset)[3];
++} TonemapIntParams;
++
++enum SIMDVariant {
++    SIMD_NONE = -1, // no SIMD kernels selected; init() reports "No CPU SIMD extension available"
++    SIMD_NEON,      // aarch64 NEON kernels
++    SIMD_SSE,       // x86 kernels selected when SSE4.2 is reported
++    SIMD_AVX        // x86 kernels selected when both AVX2 and FMA3 are reported
++};
++
++void tonemap_frame_dovi_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                               const int *dstlinesize, const int *srclinesize,
++                               int dstdepth, int srcdepth,
++                               int width, int height,
++                               const struct TonemapIntParams *params);
++
++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                 const int *dstlinesize, const int *srclinesize,
++                                 int dstdepth, int srcdepth,
++                                 int width, int height,
++                                 const struct TonemapIntParams *params);
++
++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv,
++                                    const uint16_t *srcy, const uint16_t *srcuv,
++                                    const int *dstlinesize, const int *srclinesize,
++                                    int dstdepth, int srcdepth,
++                                    int width, int height,
++                                    const struct TonemapIntParams *params);
++
++void tonemap_frame_dovi_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                 const int *dstlinesize, const int *srclinesize,
++                                 int dstdepth, int srcdepth,
++                                 int width, int height,
++                                 const struct TonemapIntParams *params);
++
++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                   const int *dstlinesize, const int *srclinesize,
++                                   int dstdepth, int srcdepth,
++                                   int width, int height,
++                                   const struct TonemapIntParams *params);
++
++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv,
++                                         const uint16_t *srcy, const uint16_t *srcuv,
++                                         const int *dstlinesize, const int *srclinesize,
++                                         int dstdepth, int srcdepth,
++                                         int width, int height,
++                                         const struct TonemapIntParams *params);
++
++#endif // AVFILTER_TONEMAPX_H
+Index: FFmpeg/libavfilter/x86/Makefile
+===================================================================
+--- FFmpeg.orig/libavfilter/x86/Makefile
++++ FFmpeg/libavfilter/x86/Makefile
+@@ -34,6 +34,8 @@ OBJS-$(CONFIG_STEREO3D_FILTER)
+ OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
+ OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
+ OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
++OBJS-$(CONFIG_TONEMAPX_FILTER)               += x86/vf_tonemapx_intrin_sse.o \
++                                                x86/vf_tonemapx_intrin_avx.o
+ OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
+ OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
+ OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
+Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
+@@ -0,0 +1,2276 @@
++/*
++ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "vf_tonemapx_intrin_avx.h"
++
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++#    include <immintrin.h>
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a)
++{
++    /* Per 32-bit lane: the value fits in int16 iff ((a + 0x8000) & ~0xFFFF) == 0. */
++    const __m256i biased   = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U));
++    const __m256i overflow = _mm256_and_si256(biased, _mm256_set1_epi32(~0xFFFF));
++    const __m256i in_range = _mm256_cmpeq_epi32(overflow, _mm256_setzero_si256());
++    /* Saturated value: (a >> 31) ^ 0x7FFF is 0x7FFF for a >= 0 and -0x8000 for a < 0. */
++    const __m256i sign = _mm256_srai_epi32(a, 31);
++    const __m256i sat  = _mm256_xor_si256(sign, _mm256_set1_epi32(0x7FFF));
++    /* Select a where it is in range, the saturated value elsewhere. */
++    return _mm256_or_si256(_mm256_and_si256(in_range, a), _mm256_andnot_si256(in_range, sat));
++}
++
++X86_64_V3 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a)
++{
++    /* Per-lane linear interpolation x + a * (y - x), evaluated with one fused multiply-add. */
++    const __m128 delta = _mm_sub_ps(y, x);
++    return _mm_fmadd_ps(a, delta, x);
++}
++
++X86_64_V3 inline static float reduce_floatx4(__m128 x) {
++    const __m128 pairs = _mm_hadd_ps(x, x);         /* {x0+x1, x2+x3, x0+x1, x2+x3} */
++    const __m128 total = _mm_hadd_ps(pairs, pairs); /* every lane: (x0+x1) + (x2+x3) */
++    return _mm_cvtss_f32(total);
++}
++
++X86_64_V3 inline static float reduce_floatx8(__m256 x) {
++    const __m256 swapped = _mm256_permute2f128_ps(x, x, 1); /* exchange the two 128-bit halves */
++    __m256 acc = _mm256_add_ps(x, swapped);                 /* lane i holds x[i] + x[i ^ 4] */
++    acc = _mm256_hadd_ps(acc, acc);
++    acc = _mm256_hadd_ps(acc, acc);
++    return _mm256_cvtss_f32(acc);                           /* lane 0 now holds the sum of all eight inputs */
++}
++
++X86_64_V3 static inline float reshape_poly(float s, __m128 coeffs)
++{
++    /* Sums coeffs[0]*1 + coeffs[1]*s + coeffs[2]*s*s + coeffs[3]*0 across the four lanes. */
++    const __m128 powers = _mm_setr_ps(1.0f, s, s * s, 0.0f);
++    return reduce_floatx4(_mm_mul_ps(powers, coeffs));
++}
++
++X86_64_V3 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr,
++                                          int mmr_single, int min_order, int max_order)
++{
++    float s = _mm_cvtss_f32(coeffs);
++    int mmr_idx = 0;
++    int order = 0;
++    /* Returns coeffs lane 0 plus, for each enabled order, the dot product of 8 signal terms with 8 consecutive floats of mmr. */
++    __m256 sigX, mmr_coeffs, ps;
++    __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]}
++    __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]}
++    __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]}
++    __m128 sigX0 = sigX01; // sig[0]*sig[1] now positioned at 0
++
++    sigX0 = _mm_insert_ps(sigX0, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1
++    sigX0 = _mm_insert_ps(sigX0, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2
++    sigX0 = _mm_insert_ps(sigX0, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3
++    /* sigX packs sig into the low 128 bits and the cross terms (sigX0) into the high 128 bits. */
++    sigX = _mm256_set_m128(sigX0, sig);
++    /* coeffs lane 1 supplies the mmr index (forced to 0 when mmr_single), lane 3 the order. */
++    mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1)));
++    order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3)));
++
++    // dot first order
++    mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 0*4]);
++    ps = _mm256_mul_ps(sigX, mmr_coeffs);
++    s += reduce_floatx8(ps);
++
++    if (max_order >= 2 && (min_order >= 2 || order >= 2)) {
++        __m256 sigX2 = _mm256_mul_ps(sigX, sigX); // second order: element-wise squares of the 8 terms
++        mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 2*4]);
++        ps = _mm256_mul_ps(sigX2, mmr_coeffs);
++        s += reduce_floatx8(ps);
++
++        if (max_order == 3 && (min_order == 3 || order >= 3)) {
++            __m256 sigX3 = _mm256_mul_ps(sigX2, sigX); // third order: element-wise cubes of the 8 terms
++            mmr_coeffs = _mm256_loadu_ps(&mmr[mmr_idx + 4*4]);
++            ps = _mm256_mul_ps(sigX3, mmr_coeffs);
++            s += reduce_floatx8(ps);
++        }
++    }
++
++    return s;
++}
++
++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c)))
++
++/*
++ * Reshape a single component value.
++ *
++ * s      - component value to reshape
++ * sig    - vector handed to reshape_mmr() when the MMR branch is taken
++ * coeffs - float4 coefficient set selected for this component
++ * params - 8 floats: [1] has_mmr, [2] has_poly, [3] mmr_single,
++ *          [4] min_order, [5] max_order, [6] lower clamp, [7] upper clamp
++ * mmr    - MMR table of this component
++ *
++ * Returns the reshaped value clamped to [params[6], params[7]].
++ */
++X86_64_V3 inline static float reshape_dovi_component(float s, __m128 sig, __m128 coeffs,
++                                                     const float *params, const float *mmr)
++{
++    const int has_mmr    = params[1];
++    const int has_poly   = params[2];
++    const int mmr_single = params[3];
++    const int min_order  = params[4];
++    const int max_order  = params[5];
++    const float lo       = params[6];
++    const float hi       = params[7];
++    const int has_mmr_poly = has_mmr && has_poly;
++
++    // With both methods present, a zero in coeffs lane 3 selects the polynomial.
++    if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && has_poly))
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(sig, coeffs, mmr, mmr_single, min_order, max_order);
++
++    return CLAMP(s, lo, hi);
++}
++
++/*
++ * Apply the Dolby Vision reshaping to one pixel held in lanes 0..2 of sig
++ * and return the reshaped vector (lane 3 is passed through).
++ *
++ * ctx->dovi_pbuf is read as: 3 x 8 params at +0, pivots at +24,
++ * 3 x 8 float4 coefficient sets at +48, 3 x 48 float4 MMR entries at +144.
++ * Only the first component selects its coefficient set through the pivots;
++ * the other two always use their first set.
++ *
++ * NOTE(review): for the 2nd and 3rd component the scalar is taken from the
++ * original sig, but the vector given to reshape_mmr() is the partially
++ * reshaped result (component 0 already replaced). Confirm this matches the
++ * scalar reference implementation.
++ */
++X86_64_V3 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx)
++{
++    const float *params_base = ctx->dovi_pbuf;
++    const float *pivots      = ctx->dovi_pbuf + 24;
++    const float *coeffs_base = ctx->dovi_pbuf + 48;  // float4 entries
++    const float *mmr_base    = ctx->dovi_pbuf + 144; // float4 entries
++
++    const int num_pivots = params_base[0];
++    __m128 out = sig;
++    __m128 piece;
++    float v;
++
++    // component 0 (I): pick one of 8 coefficient sets with a binary tree of pivot tests
++    v = _mm_cvtss_f32(sig);
++    if (num_pivots > 2) {
++        const __m128 c01 = mix_float32x4(_mm_loadu_ps(coeffs_base), _mm_loadu_ps(coeffs_base + 4), _mm_set1_ps(v >= pivots[0]));
++        const __m128 c23 = mix_float32x4(_mm_loadu_ps(coeffs_base + 2*4), _mm_loadu_ps(coeffs_base + 3*4), _mm_set1_ps(v >= pivots[2]));
++        const __m128 c0123 = mix_float32x4(c01, c23, _mm_set1_ps(v >= pivots[1]));
++        const __m128 c45 = mix_float32x4(_mm_loadu_ps(coeffs_base + 4*4), _mm_loadu_ps(coeffs_base + 5*4), _mm_set1_ps(v >= pivots[4]));
++        const __m128 c67 = mix_float32x4(_mm_loadu_ps(coeffs_base + 6*4), _mm_loadu_ps(coeffs_base + 7*4), _mm_set1_ps(v >= pivots[6]));
++        const __m128 c4567 = mix_float32x4(c45, c67, _mm_set1_ps(v >= pivots[5]));
++        piece = mix_float32x4(c0123, c4567, _mm_set1_ps(v >= pivots[3]));
++    } else {
++        piece = _mm_loadu_ps(coeffs_base);
++    }
++    v = reshape_dovi_component(v, out, piece, params_base + 0*8, mmr_base + 0 * 48 * 4);
++    out = _mm_insert_ps(out, _mm_set1_ps(v), _MM_MK_INSERTPS_NDX(0, 0, 0));
++
++    // component 1 (P)
++    v = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1)));
++    piece = _mm_loadu_ps(coeffs_base + 1*8 * 4);
++    v = reshape_dovi_component(v, out, piece, params_base + 1*8, mmr_base + 1*48 * 4);
++    out = _mm_insert_ps(out, _mm_set1_ps(v), _MM_MK_INSERTPS_NDX(0, 1, 0));
++
++    // component 2 (T)
++    v = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2)));
++    piece = _mm_loadu_ps(coeffs_base + 2*8 * 4);
++    v = reshape_dovi_component(v, out, piece, params_base + 2*8, mmr_base + 2*48 * 4);
++    out = _mm_insert_ps(out, _mm_set1_ps(v), _MM_MK_INSERTPS_NDX(0, 2, 0));
++
++    return out;
++}
++
++/*
++ * Multiply 8 (y, cb, cr) triples by the 3x3 matrix `nonlinear` and subtract
++ * ycc_offset[row] from each resulting row. Row 0 goes to *dy, row 1 to *dcb,
++ * row 2 to *dcr. Matrix entries are narrowed to float before use.
++ */
++X86_64_V3 inline static void ycc2rgbx8(__m256* dy, __m256* dcb, __m256* dcr,
++                                       __m256 y, __m256 cb, __m256 cr,
++                                       const double nonlinear[3][3], const float ycc_offset[3])
++{
++    __m256 *const dst[3] = { dy, dcb, dcr };
++
++    for (int row = 0; row < 3; row++) {
++        __m256 acc = _mm256_mul_ps(y, _mm256_set1_ps((float)nonlinear[row][0]));
++        acc = _mm256_fmadd_ps(cb, _mm256_set1_ps((float)nonlinear[row][1]), acc);
++        acc = _mm256_fmadd_ps(cr, _mm256_set1_ps((float)nonlinear[row][2]), acc);
++        *dst[row] = _mm256_sub_ps(acc, _mm256_set1_ps(ycc_offset[row]));
++    }
++}
++
++/*
++ * Multiply 8 (l, m, s) triples by the 3x3 matrix lms2rgb_matrix.
++ * Row 0 goes to *dl, row 1 to *dm, row 2 to *ds. Matrix entries are
++ * narrowed to float before use.
++ */
++X86_64_V3 inline static void lms2rgbx8(__m256* dl, __m256* dm, __m256* ds,
++                                       __m256 l, __m256 m, __m256 s,
++                                       const double lms2rgb_matrix[3][3])
++{
++    __m256 *const dst[3] = { dl, dm, ds };
++
++    for (int row = 0; row < 3; row++) {
++        __m256 acc = _mm256_mul_ps(l, _mm256_set1_ps((float)lms2rgb_matrix[row][0]));
++        acc = _mm256_fmadd_ps(m, _mm256_set1_ps((float)lms2rgb_matrix[row][1]), acc);
++        acc = _mm256_fmadd_ps(s, _mm256_set1_ps((float)lms2rgb_matrix[row][2]), acc);
++        *dst[row] = acc;
++    }
++}
++
++/*
++ * Turn three planar 8-wide vectors into eight per-pixel vectors
++ * {y[k], u[k], v[k], 0} and run reshape_dovi_iptpqc2() on each of them.
++ * Pixel k is written to *ipt<k>.
++ */
++X86_64_V3 inline static void reshapeiptx8(__m128* ipt0, __m128* ipt1, __m128* ipt2, __m128* ipt3,
++                                          __m128* ipt4, __m128* ipt5, __m128* ipt6, __m128* ipt7,
++                                          __m256 yx8, __m256 ux8, __m256 vx8,
++                                          const struct TonemapIntParams *params)
++{
++    const __m128 zero = _mm_setzero_ps();
++
++    // pixels 0..3 live in the low 128 bits, pixels 4..7 in the high 128 bits
++    const __m128 y_lo = _mm256_extractf128_ps(yx8, 0);
++    const __m128 y_hi = _mm256_extractf128_ps(yx8, 1);
++    const __m128 u_lo = _mm256_extractf128_ps(ux8, 0);
++    const __m128 u_hi = _mm256_extractf128_ps(ux8, 1);
++    const __m128 v_lo = _mm256_extractf128_ps(vx8, 0);
++    const __m128 v_hi = _mm256_extractf128_ps(vx8, 1);
++
++    __m128 yu01, yu23, v01, v23;
++
++    yu01 = _mm_unpacklo_ps(y_lo, u_lo); // y0 u0 y1 u1
++    yu23 = _mm_unpackhi_ps(y_lo, u_lo); // y2 u2 y3 u3
++    v01  = _mm_unpacklo_ps(v_lo, zero); // v0 0  v1 0
++    v23  = _mm_unpackhi_ps(v_lo, zero); // v2 0  v3 0
++
++    *ipt0 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu01, v01, _MM_SHUFFLE(1, 0, 1, 0)), params);
++    *ipt1 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu01, v01, _MM_SHUFFLE(3, 2, 3, 2)), params);
++    *ipt2 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu23, v23, _MM_SHUFFLE(1, 0, 1, 0)), params);
++    *ipt3 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu23, v23, _MM_SHUFFLE(3, 2, 3, 2)), params);
++
++    yu01 = _mm_unpacklo_ps(y_hi, u_hi); // y4 u4 y5 u5
++    yu23 = _mm_unpackhi_ps(y_hi, u_hi); // y6 u6 y7 u7
++    v01  = _mm_unpacklo_ps(v_hi, zero); // v4 0  v5 0
++    v23  = _mm_unpackhi_ps(v_hi, zero); // v6 0  v7 0
++
++    *ipt4 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu01, v01, _MM_SHUFFLE(1, 0, 1, 0)), params);
++    *ipt5 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu01, v01, _MM_SHUFFLE(3, 2, 3, 2)), params);
++    *ipt6 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu23, v23, _MM_SHUFFLE(1, 0, 1, 0)), params);
++    *ipt7 = reshape_dovi_iptpqc2(_mm_shuffle_ps(yu23, v23, _MM_SHUFFLE(3, 2, 3, 2)), params);
++}
++
++/*
++ * Inverse of the per-pixel interleave: gather lane 0 of ipt0..ipt7 into *ix8,
++ * lane 1 into *px8 and lane 2 into *tx8 (pixel k ends up in element k).
++ * Lane 3 of the inputs is dropped.
++ */
++X86_64_V3 inline static void transpose_ipt8x4(__m128 ipt0, __m128 ipt1, __m128 ipt2, __m128 ipt3,
++                                              __m128 ipt4, __m128 ipt5, __m128 ipt6, __m128 ipt7,
++                                              __m256* ix8, __m256* px8, __m256* tx8)
++{
++    // Pair pixel k (low half) with pixel k+4 (high half) so both halves transpose in lockstep.
++    const __m256 px04 = _mm256_set_m128(ipt4, ipt0);
++    const __m256 px15 = _mm256_set_m128(ipt5, ipt1);
++    const __m256 px26 = _mm256_set_m128(ipt6, ipt2);
++    const __m256 px37 = _mm256_set_m128(ipt7, ipt3);
++
++    const __m256 ip_01 = _mm256_unpacklo_ps(px04, px15); // I0 I1 P0 P1 | I4 I5 P4 P5
++    const __m256 t_01  = _mm256_unpackhi_ps(px04, px15); // T0 T1 .  .  | T4 T5 .  .
++    const __m256 ip_23 = _mm256_unpacklo_ps(px26, px37); // I2 I3 P2 P3 | I6 I7 P6 P7
++    const __m256 t_23  = _mm256_unpackhi_ps(px26, px37); // T2 T3 .  .  | T6 T7 .  .
++
++    *ix8 = _mm256_shuffle_ps(ip_01, ip_23, _MM_SHUFFLE(1, 0, 1, 0));
++    *px8 = _mm256_shuffle_ps(ip_01, ip_23, _MM_SHUFFLE(3, 2, 3, 2));
++    *tx8 = _mm256_shuffle_ps(t_01, t_23, _MM_SHUFFLE(1, 0, 1, 0));
++}
++
++/*
++ * Tone-map 8 RGB pixels.
++ *
++ * r_in/g_in/b_in - 8 x int32 per channel; 2048 is added and the sum is clamped
++ *                  to [0, 32767] to form indices into lin_lut
++ * r_out/g_out/b_out - receive 8 results per channel, read from delin_lut
++ * lin_lut        - indexed by the clamped input value
++ * tonemap_lut    - indexed by the clamped max(r, g, b); its value scales all channels
++ * delin_lut      - indexed by the scaled, rounded and clamped result
++ * coeffs         - luma coefficients used by the desaturation step
++ * ocoeffs        - unused here
++ * desat          - desaturation threshold; the step is skipped when <= 0
++ * rgb2rgb        - 3x3 matrix applied to the linearized values
++ * rgb2rgb_passthrough - non-zero skips the matrix
++ */
++X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in,
++                                                 int16_t *r_out, int16_t *g_out, int16_t *b_out,
++                                                 float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
++                                                 const AVLumaCoefficients *coeffs,
++                                                 const AVLumaCoefficients *ocoeffs, double desat,
++                                                 double (*rgb2rgb)[3][3],
++                                                 int rgb2rgb_passthrough)
++{
++    __m256i sig8;
++    __m256 mapvalx8, r_linx8, g_linx8, b_linx8;
++    __m256 offset = _mm256_set1_ps(0.5f);
++    __m256i zerox8 = _mm256_setzero_si256();
++    __m256i input_lut_offset = _mm256_set1_epi32(2048);
++    __m256i upper_bound = _mm256_set1_epi32(32767);
++    __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f);
++    __m256i r, g, b, rx8, gx8, bx8;
++
++    float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8];
++
++    // The tone-mapping gain is looked up from the brightest channel of each pixel.
++    sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in));
++    sig8 = _mm256_add_epi32(sig8, input_lut_offset);
++    sig8 = _mm256_min_epi32(sig8, upper_bound);
++    sig8 = _mm256_max_epi32(sig8, zerox8);
++
++    r = _mm256_add_epi32(r_in, input_lut_offset);
++    r = _mm256_min_epi32(r, upper_bound);
++    r = _mm256_max_epi32(r, zerox8);
++    g = _mm256_add_epi32(g_in, input_lut_offset);
++    g = _mm256_min_epi32(g, upper_bound);
++    g = _mm256_max_epi32(g, zerox8);
++    b = _mm256_add_epi32(b_in, input_lut_offset);
++    b = _mm256_min_epi32(b, upper_bound);
++    b = _mm256_max_epi32(b, zerox8);
++
++    // _mm256_extract_epi32 needs a compile-time lane index, hence the unrolled macro.
++#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \
++r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)];                             \
++g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)];                             \
++b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)];
++
++    LOAD_LUT(0)
++    LOAD_LUT(1)
++    LOAD_LUT(2)
++    LOAD_LUT(3)
++    LOAD_LUT(4)
++    LOAD_LUT(5)
++    LOAD_LUT(6)
++    LOAD_LUT(7)
++
++#undef LOAD_LUT
++
++    mapvalx8 = _mm256_loadu_ps(mapval8);
++    r_linx8 = _mm256_loadu_ps(r_lin8);
++    g_linx8 = _mm256_loadu_ps(g_lin8);
++    b_linx8 = _mm256_loadu_ps(b_lin8);
++
++    if (!rgb2rgb_passthrough) {
++        // Every output row must be computed from the untransformed r, g and b.
++        // Writing the rows back in place would feed the new r into the g row
++        // and the new r/g into the b row, so snapshot the inputs first.
++        // NOTE(review): if the scalar fallback applies the matrix in place as
++        // well, it needs the same fix to stay consistent with this path.
++        const __m256 r_src = r_linx8;
++        const __m256 g_src = g_linx8;
++        const __m256 b_src = b_linx8;
++
++        r_linx8 = _mm256_mul_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[0][0]));
++        r_linx8 = _mm256_fmadd_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8);
++        r_linx8 = _mm256_fmadd_ps(b_src, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8);
++
++        g_linx8 = _mm256_mul_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[1][1]));
++        g_linx8 = _mm256_fmadd_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8);
++        g_linx8 = _mm256_fmadd_ps(b_src, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8);
++
++        b_linx8 = _mm256_mul_ps(b_src, _mm256_set1_ps((float)(*rgb2rgb)[2][2]));
++        b_linx8 = _mm256_fmadd_ps(r_src, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8);
++        b_linx8 = _mm256_fmadd_ps(g_src, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8);
++    }
++
++    if (desat > 0) {
++        __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS);
++        __m256 desat8 = _mm256_set1_ps((float)desat);
++        __m256 luma8 = _mm256_set1_ps(0);
++        __m256 overbright8;
++
++        luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8);
++        luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8);
++        luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8);
++        // overbright = max(luma - desat, eps) / max(luma, eps)
++        overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8));
++        // channel = channel - channel * overbright + luma * overbright
++        r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8);
++        r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8);
++        g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8);
++        g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8);
++        b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8);
++        b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8);
++    }
++
++    r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8);
++    g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8);
++    b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8);
++
++    // Scale to the delin_lut index range; +0.5 followed by truncation rounds to nearest.
++    r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset);
++    g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset);
++    b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset);
++
++    rx8 = _mm256_cvttps_epi32(r_linx8);
++    rx8 = _mm256_min_epi32(rx8, upper_bound);
++    rx8 = _mm256_max_epi32(rx8, zerox8);
++
++    gx8 = _mm256_cvttps_epi32(g_linx8);
++    gx8 = _mm256_min_epi32(gx8, upper_bound);
++    gx8 = _mm256_max_epi32(gx8, zerox8);
++
++    bx8 = _mm256_cvttps_epi32(b_linx8);
++    bx8 = _mm256_min_epi32(bx8, upper_bound);
++    bx8 = _mm256_max_epi32(bx8, zerox8);
++
++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \
++g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)];                       \
++b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)];
++
++    SAVE_COLOR(0)
++    SAVE_COLOR(1)
++    SAVE_COLOR(2)
++    SAVE_COLOR(3)
++    SAVE_COLOR(4)
++    SAVE_COLOR(5)
++    SAVE_COLOR(6)
++    SAVE_COLOR(7)
++
++#undef SAVE_COLOR
++}
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++
++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                             const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                             const int *dstlinesize, const int *srclinesize,
++                                             int dstdepth, int srcdepth,
++                                             int width, int height,
++                                             const struct TonemapIntParams *params)
++{
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++    uint8_t *rdsty = dsty;
++    uint8_t *rdstu = dstu;
++    uint8_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height;
++    // not zero when not divisible by 16
++    // intentionally leave last pixel emtpy when input is odd
++    int remainw = width & 14;
++
++    const int in_depth = srcdepth;
++    const float in_rng = (float)((1 << in_depth) - 1);
++
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int16_t r[16], g[16], b[16];
++    int16_t r1[16], g1[16], b1[16];
++
++    __m256i ux8, vx8;
++    __m256i y0x16, y1x16;
++    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
++    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
++    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
++
++    __m256i r0ox16, g0ox16, b0ox16;
++    __m256i y0ox16;
++    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
++    __m256i yoax8, yobx8;
++
++    __m256i r1ox16, g1ox16, b1ox16;
++    __m256i y1ox16;
++    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
++    __m256i y1oax8, y1obx8;
++    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
++
++    __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7;
++    __m256 ix8, px8, tx8;
++    __m256 lx8, mx8, sx8;
++    __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b;
++    __m256 y0x8af, y0x8bf, y1x8af, y1x8bf, ux8af, ux8bf, vx8af, vx8bf;
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int xx = 0; xx < width >> 4; xx++) {
++            int x = xx << 4;
++
++            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
++            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
++            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
++            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
++
++            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
++            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
++            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
++            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
++
++            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++
++            y0x8af = _mm256_cvtepi32_ps(y0x8a);
++            y0x8bf = _mm256_cvtepi32_ps(y0x8b);
++            y1x8af = _mm256_cvtepi32_ps(y1x8a);
++            y1x8bf = _mm256_cvtepi32_ps(y1x8b);
++            ux8af = _mm256_cvtepi32_ps(ux8a);
++            ux8bf = _mm256_cvtepi32_ps(ux8b);
++            vx8af = _mm256_cvtepi32_ps(vx8a);
++            vx8bf = _mm256_cvtepi32_ps(vx8b);
++
++            y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng));
++            y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng));
++            y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng));
++            y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng));
++            ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng));
++            ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng));
++            vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng));
++            vx8bf = _mm256_div_ps(vx8bf, _mm256_set1_ps(in_rng));
++
++            // Reshape y0x8a
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y0x8af, ux8af, vx8af, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f));
++            gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f));
++            bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f));
++
++            r0x8a = _mm256_cvtps_epi32(rx8a);
++            g0x8a = _mm256_cvtps_epi32(gx8a);
++            b0x8a = _mm256_cvtps_epi32(bx8a);
++
++            // Reshape y1x8a
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y1x8af, ux8af, vx8af, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f));
++            gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f));
++            bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f));
++
++            r1x8a = _mm256_cvtps_epi32(rx8a);
++            g1x8a = _mm256_cvtps_epi32(gx8a);
++            b1x8a = _mm256_cvtps_epi32(bx8a);
++
++            // Reshape y0x8b
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y0x8bf, ux8bf, vx8bf, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f));
++            gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f));
++            bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f));
++
++            r0x8b = _mm256_cvtps_epi32(rx8b);
++            g0x8b = _mm256_cvtps_epi32(gx8b);
++            b0x8b = _mm256_cvtps_epi32(bx8b);
++
++            // Reshape y1x8b
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y1x8bf, ux8bf, vx8bf, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f));
++            gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f));
++            bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f));
++
++            r1x8b = _mm256_cvtps_epi32(rx8b);
++            g1x8b = _mm256_cvtps_epi32(gx8b);
++            b1x8b = _mm256_cvtps_epi32(bx8b);
++
++            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++
++            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r);
++            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
++            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
++
++            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
++            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
++            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
++
++            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
++            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
++            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
++
++            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
++            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
++            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y0ox16 = _mm256_packs_epi32(yoax8, yobx8);
++            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++
++            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
++            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
++            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
++
++            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
++            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
++            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
++
++            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
++            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
++            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
++
++            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
++            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
++            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8);
++            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++
++            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8));
++            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2));
++            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
++
++            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
++            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
++            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
++
++            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
++            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
++            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++
++            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
++            uox8 = _mm256_srai_epi32(uox8, out_sh);
++            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++            uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256());
++            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0));
++            uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256());
++            _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8));
++
++            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
++            vox8 = _mm256_srai_epi32(vox8, out_sh);
++            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++            vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256());
++            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
++            vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256());
++            _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8));
++        }
++    }
++
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff0;
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv,
++                                  rsrcy, rsrcu, rsrcv,
++                                  dstlinesize, srclinesize,
++                                  dstdepth, srcdepth,
++                                  remainw, rheight, params);
++    }
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++}
++
++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                               const int *dstlinesize, const int *srclinesize,
++                                               int dstdepth, int srcdepth,
++                                               int width, int height,
++                                               const struct TonemapIntParams *params)
++{
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS // the whole body is compiled out when AVX intrinsics are disabled
++    uint16_t *rdsty = dsty; // untouched copies of the plane pointers, used by the scalar remainder pass below
++    uint16_t *rdstu = dstu;
++    uint16_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height; // the row loop below consumes 'height', so keep the original value
++    // non-zero when the width is not divisible by 16 (the SIMD loop handles 16 pixels per step)
++    // masking with 14 intentionally leaves the last pixel empty when the input width is odd
++    int remainw = width & 14;
++
++    const int in_depth = srcdepth;
++    const float in_rng = (float)((1 << in_depth) - 1); // largest code value at in_depth bits; divisor for the input samples
++
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8); // offset added to the U/V results
++    const int out_sh = 29 - out_depth; // right shift applied after the RGB -> YUV multiply-accumulate
++    const int out_rnd = 1 << (out_sh - 1); // rounding term: half of 1 << out_sh
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; // used as the B weight for U and as the R weight for V
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int16_t r[16], g[16], b[16]; // tonemap_int32x8_avx output for row 0 (two calls of 8 samples each)
++    int16_t r1[16], g1[16], b1[16]; // same for row 1
++
++    __m256i ux8, vx8;
++    __m256i y0x16, y1x16;
++    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
++    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
++    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
++
++    __m256i r0ox16, g0ox16, b0ox16;
++    __m256i y0ox16;
++    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
++    __m256i yoax8, yobx8;
++
++    __m256i r1ox16, g1ox16, b1ox16;
++    __m256i y1ox16;
++    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
++    __m256i y1oax8, y1obx8;
++    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
++
++    __m128 ipt0, ipt1, ipt2, ipt3, ipt4, ipt5, ipt6, ipt7;
++    __m256 ix8, px8, tx8;
++    __m256 lx8, mx8, sx8;
++    __m256 rx8a, gx8a, bx8a, rx8b, gx8b, bx8b;
++    __m256 y0x8af, y0x8bf, y1x8af, y1x8bf, ux8af, ux8bf, vx8af, vx8bf;
++    for (; height > 1; height -= 2, // two luma rows and one chroma row per pass
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { // NOTE(review): V pointers step by linesize[1], not [2] - confirm both chroma planes share a stride
++        for (int xx = 0; xx < width >> 4; xx++) { // 16 luma pixels per iteration
++            int x = xx << 4;
++
++            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
++            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); // second row; presumably linesize is in bytes, hence / 2 for uint16_t - confirm
++            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
++            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
++
++            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); // 'a' = pixels 0..7, 'b' = pixels 8..15
++            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
++            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
++            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
++
++            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); // repeat each chroma sample into two adjacent lanes
++            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++
++            y0x8af = _mm256_cvtepi32_ps(y0x8a);
++            y0x8bf = _mm256_cvtepi32_ps(y0x8b);
++            y1x8af = _mm256_cvtepi32_ps(y1x8a);
++            y1x8bf = _mm256_cvtepi32_ps(y1x8b);
++            ux8af = _mm256_cvtepi32_ps(ux8a);
++            ux8bf = _mm256_cvtepi32_ps(ux8b);
++            vx8af = _mm256_cvtepi32_ps(vx8a);
++            vx8bf = _mm256_cvtepi32_ps(vx8b);
++
++            y0x8af = _mm256_div_ps(y0x8af, _mm256_set1_ps(in_rng)); // scale the integer samples by 1 / in_rng
++            y0x8bf = _mm256_div_ps(y0x8bf, _mm256_set1_ps(in_rng));
++            y1x8af = _mm256_div_ps(y1x8af, _mm256_set1_ps(in_rng));
++            y1x8bf = _mm256_div_ps(y1x8bf, _mm256_set1_ps(in_rng));
++            ux8af = _mm256_div_ps(ux8af, _mm256_set1_ps(in_rng));
++            ux8bf = _mm256_div_ps(ux8bf, _mm256_set1_ps(in_rng));
++            vx8af = _mm256_div_ps(vx8af, _mm256_set1_ps(in_rng));
++            vx8bf = _mm256_div_ps(vx8bf, _mm256_set1_ps(in_rng));
++
++            // Reshape y0x8a (row 0, pixels 0..7)
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y0x8af, ux8af, vx8af, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f)); // NOTE(review): 28672 = 7 << 12; presumably the fixed-point scale tonemap_int32x8_avx expects - confirm
++            gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f));
++            bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f));
++
++            r0x8a = _mm256_cvtps_epi32(rx8a);
++            g0x8a = _mm256_cvtps_epi32(gx8a);
++            b0x8a = _mm256_cvtps_epi32(bx8a);
++
++            // Reshape y1x8a (row 1, pixels 0..7; reuses the same chroma as row 0)
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y1x8af, ux8af, vx8af, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8a, &gx8a, &bx8a, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8a = _mm256_mul_ps(rx8a, _mm256_set1_ps(28672.0f));
++            gx8a = _mm256_mul_ps(gx8a, _mm256_set1_ps(28672.0f));
++            bx8a = _mm256_mul_ps(bx8a, _mm256_set1_ps(28672.0f));
++
++            r1x8a = _mm256_cvtps_epi32(rx8a);
++            g1x8a = _mm256_cvtps_epi32(gx8a);
++            b1x8a = _mm256_cvtps_epi32(bx8a);
++
++            // Reshape y0x8b (row 0, pixels 8..15)
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y0x8bf, ux8bf, vx8bf, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f));
++            gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f));
++            bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f));
++
++            r0x8b = _mm256_cvtps_epi32(rx8b);
++            g0x8b = _mm256_cvtps_epi32(gx8b);
++            b0x8b = _mm256_cvtps_epi32(bx8b);
++
++            // Reshape y1x8b (row 1, pixels 8..15)
++            reshapeiptx8(&ipt0, &ipt1, &ipt2, &ipt3,
++                         &ipt4, &ipt5, &ipt6, &ipt7,
++                         y1x8bf, ux8bf, vx8bf, params);
++
++            transpose_ipt8x4(ipt0, ipt1, ipt2, ipt3,
++                             ipt4, ipt5, ipt6, ipt7,
++                             &ix8, &px8, &tx8);
++
++            ycc2rgbx8(&lx8, &mx8, &sx8, ix8, px8, tx8, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx8(&rx8b, &gx8b, &bx8b, lx8, mx8, sx8, *params->lms2rgb_matrix);
++
++            rx8b = _mm256_mul_ps(rx8b, _mm256_set1_ps(28672.0f));
++            gx8b = _mm256_mul_ps(gx8b, _mm256_set1_ps(28672.0f));
++            bx8b = _mm256_mul_ps(bx8b, _mm256_set1_ps(28672.0f));
++
++            r1x8b = _mm256_cvtps_epi32(rx8b);
++            g1x8b = _mm256_cvtps_epi32(gx8b);
++            b1x8b = _mm256_cvtps_epi32(bx8b);
++
++            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++
++            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); // reload the 16 tonemapped row-0 samples
++            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
++            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
++
++            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
++            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
++            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
++
++            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
++            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
++            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
++
++            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); // Y = ((cry*R + cgy*G + cby*B + out_rnd) >> out_sh) + out_yuv_off
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
++            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
++            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y0ox16 = _mm256_packus_epi32(yoax8, yobx8); // saturating narrow to unsigned 16 bit, done per 128-bit lane
++            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); // put the 64-bit quarters back into pixel order
++            _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16);
++
++            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); // same sequence for row 1
++            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
++            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
++
++            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
++            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
++            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
++
++            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
++            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
++            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
++
++            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
++            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
++            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8);
++            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16);
++
++            ravgx8 = _mm256_hadd_epi32(roax8, robx8); // sums of horizontally adjacent pixel pairs in row 0
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); // plus the matching pair sums of row 1
++            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); // undo the per-lane interleave of hadd
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); // + 2 then >> 2: rounded mean of each 2x2 block
++            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
++
++            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
++            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
++            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
++
++            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
++            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
++            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++
++            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); // U = ((out_rnd + cru*R + ocgu*G + cburv*B) >> out_sh) + out_uv_offset
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
++            uox8 = _mm256_srai_epi32(uox8, out_sh);
++            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++            uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); // narrow to unsigned 16 bit; upper half of each lane is zero padding
++            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); // gather the 8 valid samples into the low 128 bits
++            _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8));
++
++            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); // V = ((out_rnd + cburv*R + ocgv*G + cbv*B) >> out_sh) + out_uv_offset
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
++            vox8 = _mm256_srai_epi32(vox8, out_sh);
++            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++            vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256());
++            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8));
++        }
++    }
++
++    // Process the remaining columns that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff0; // number of columns already handled by the SIMD loop
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv,
++                                    rsrcy, rsrcu, rsrcv,
++                                    dstlinesize, srclinesize,
++                                    dstdepth, srcdepth,
++                                    remainw, rheight, params);
++    }
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++}
++
++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                               const int *dstlinesize, const int *srclinesize,
++                                               int dstdepth, int srcdepth,
++                                               int width, int height,
++                                               const struct TonemapIntParams *params)
++{
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++    uint8_t *rdsty = dsty;
++    uint8_t *rdstu = dstu;
++    uint8_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height;
++    // not zero when not divisible by 16
++    // intentionally leave the last pixel empty when the input width is odd
++    int remainw = width & 14;
++
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++
++    int16_t r[16], g[16], b[16];
++    int16_t r1[16], g1[16], b1[16];
++    __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off);
++    __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset);
++    __m256i cyx8 = _mm256_set1_epi32(cy);
++    __m256i rndx8 = _mm256_set1_epi32(in_rnd);
++
++    __m256i ux8, vx8;
++    __m256i y0x16, y1x16;
++    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
++    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
++    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
++
++    __m256i r0ox16, g0ox16, b0ox16;
++    __m256i y0ox16;
++    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
++    __m256i yoax8, yobx8;
++
++    __m256i r1ox16, g1ox16, b1ox16;
++    __m256i y1ox16;
++    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
++    __m256i y1oax8, y1obx8;
++    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int xx = 0; xx < width >> 4; xx++) {
++            int x = xx << 4;
++
++            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
++            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
++            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
++            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
++
++            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
++            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
++            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
++            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
++
++            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
++            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
++            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
++            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
++            ux8 = _mm256_sub_epi32(ux8, in_uv_offx8);
++            vx8 = _mm256_sub_epi32(vx8, in_uv_offx8);
++
++            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++
++            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
++            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
++            r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
++            r0x8a = _mm256_add_epi32(r0x8a, rndx8);
++            r0x8a = _mm256_srai_epi32(r0x8a, in_sh);
++            r0x8a = av_clip_int16_avx(r0x8a);
++
++            r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8);
++            r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
++            r1x8a = _mm256_add_epi32(r1x8a, rndx8);
++            r1x8a = _mm256_srai_epi32(r1x8a, in_sh);
++            r1x8a = av_clip_int16_avx(r1x8a);
++
++            // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
++            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
++            g0x8a = _mm256_add_epi32(g0x8a, rndx8);
++            g0x8a = _mm256_srai_epi32(g0x8a, in_sh);
++            g0x8a = av_clip_int16_avx(g0x8a);
++
++            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
++            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
++            g1x8a = _mm256_add_epi32(g1x8a, rndx8);
++            g1x8a = _mm256_srai_epi32(g1x8a, in_sh);
++            g1x8a = av_clip_int16_avx(g1x8a);
++
++            // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh);
++            b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
++            b0x8a = _mm256_add_epi32(b0x8a, rndx8);
++            b0x8a = _mm256_srai_epi32(b0x8a, in_sh);
++            b0x8a = av_clip_int16_avx(b0x8a);
 +
-+static const AVFilterPad tonemapx_inputs[] = {
-+    {
-+        .name         = "default",
-+        .type         = AVMEDIA_TYPE_VIDEO,
-+        .filter_frame = filter_frame,
-+    },
-+};
++            b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
++            b1x8a = _mm256_add_epi32(b1x8a, rndx8);
++            b1x8a = _mm256_srai_epi32(b1x8a, in_sh);
++            b1x8a = av_clip_int16_avx(b1x8a);
 +
-+AVFilter ff_vf_tonemapx = {
-+    .name            = "tonemapx",
-+    .description     = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"),
-+    .init            = init,
-+    .uninit          = uninit,
-+    .priv_size       = sizeof(TonemapxContext),
-+    .priv_class      = &tonemapx_class,
-+    FILTER_INPUTS(tonemapx_inputs),
-+    FILTER_OUTPUTS(ff_video_default_filterpad),
-+    FILTER_QUERY_FUNC(query_formats),
-+    .flags           = AVFILTER_FLAG_SLICE_THREADS,
-+};
-Index: FFmpeg/libavfilter/vf_tonemapx.h
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/vf_tonemapx.h
-@@ -0,0 +1,99 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
++            r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8);
++            r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
++            r0x8b = _mm256_add_epi32(r0x8b, rndx8);
++            r0x8b = _mm256_srai_epi32(r0x8b, in_sh);
++            r0x8b = av_clip_int16_avx(r0x8b);
++
++            r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8);
++            r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
++            r1x8b = _mm256_add_epi32(r1x8b, rndx8);
++            r1x8b = _mm256_srai_epi32(r1x8b, in_sh);
++            r1x8b = av_clip_int16_avx(r1x8b);
++
++            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
++            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
++            g0x8b = _mm256_add_epi32(g0x8b, rndx8);
++            g0x8b = _mm256_srai_epi32(g0x8b, in_sh);
++            g0x8b = av_clip_int16_avx(g0x8b);
++
++            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
++            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
++            g1x8b = _mm256_add_epi32(g1x8b, rndx8);
++            g1x8b = _mm256_srai_epi32(g1x8b, in_sh);
++            g1x8b = av_clip_int16_avx(g1x8b);
++
++            b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
++            b0x8b = _mm256_add_epi32(b0x8b, rndx8);
++            b0x8b = _mm256_srai_epi32(b0x8b, in_sh);
++            b0x8b = av_clip_int16_avx(b0x8b);
++
++            b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
++            b1x8b = _mm256_add_epi32(b1x8b, rndx8);
++            b1x8b = _mm256_srai_epi32(b1x8b, in_sh);
++            b1x8b = av_clip_int16_avx(b1x8b);
++
++            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++
++            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r);
++            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
++            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
++
++            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
++            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
++            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
++
++            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
++            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
++            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
++
++            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
++            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
++            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y0ox16 = _mm256_packs_epi32(yoax8, yobx8);
++            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++
++            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
++            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
++            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
++
++            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
++            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
++            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
++
++            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
++            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
++            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
++
++            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
++            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
++            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
++
++            y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8);
++            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++
++            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8));
++            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2));
++            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
++
++            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
++            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
++            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
++
++            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
++            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
++            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++
++            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
++            uox8 = _mm256_srai_epi32(uox8, out_sh);
++            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++            uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256());
++            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0));
++            uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256());
++            _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8));
++
++            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
++            vox8 = _mm256_srai_epi32(vox8, out_sh);
++            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++            vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256());
++            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
++            vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256());
++            _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8));
++        }
++    }
++
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff0;
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv,
++                                    rsrcy, rsrcu, rsrcv,
++                                    dstlinesize, srclinesize,
++                                    dstdepth, srcdepth,
++                                    remainw, rheight, params);
++    }
++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++}
++
++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                                 const int *dstlinesize, const int *srclinesize,
++                                                 int dstdepth, int srcdepth,
++                                                 int width, int height,
++                                                 const struct TonemapIntParams *params)
++{
++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
++    uint16_t *rdsty = dsty;
++    uint16_t *rdstu = dstu;
++    uint16_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height;
++    // not zero when not divisible by 16
++    // intentionally leave last pixel empty when input is odd
++    int remainw = width & 14;
++
++    const int in_depth = srcdepth;
++    const int in_uv_offset = 128 << (in_depth - 8);
++    const int in_sh = in_depth - 1;
++    const int in_rnd = 1 << (in_sh - 1);
++
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
++
++    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
++    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
 +
-+#ifndef AVFILTER_TONEMAPX_H
-+#define AVFILTER_TONEMAPX_H
++    int16_t r[16], g[16], b[16];
++    int16_t r1[16], g1[16], b1[16];
++    __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off);
++    __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset);
++    __m256i cyx8 = _mm256_set1_epi32(cy);
++    __m256i rndx8 = _mm256_set1_epi32(in_rnd);
 +
-+#include "config.h"
-+#include "colorspace.h"
++    __m256i r0ox16, g0ox16, b0ox16;
++    __m256i y0ox16;
++    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
++    __m256i yoax8, yobx8;
++    __m256i ux8, vx8;
++    __m256i y0x16, y1x16;
++    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
++    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
++    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
 +
-+#define X86_64_V2 __attribute__((target("sse4.2")))
-+#define X86_64_V3 __attribute__((target("avx2,fma")))
++    __m256i r1ox16, g1ox16, b1ox16;
++    __m256i y1ox16;
++    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
++    __m256i y1oax8, y1obx8;
++    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
++        for (int xx = 0; xx < width >> 4; xx++) {
++            int x = xx << 4;
 +
-+#if defined(__GNUC__) || defined(__clang__)
-+#    if (__GNUC__ >= 9) || (__clang_major__ >= 11)
-+#        define CC_SUPPORTS_TONEMAPX_INTRINSICS
-+#    endif // (__GNUC__ >= 10) || (__clang_major__ >= 11)
-+#endif // defined(__GNUC__) || defined(__clang__)
++            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
++            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
++            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
++            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
 +
-+#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS
-+#    if ARCH_AARCH64
-+#        if HAVE_INTRINSICS_NEON
-+#            define ENABLE_TONEMAPX_NEON_INTRINSICS
-+#        endif
-+#    endif // ARCH_AARCH64
-+#    if ARCH_X86
-+#        if HAVE_INTRINSICS_SSE42
-+#           define ENABLE_TONEMAPX_SSE_INTRINSICS
-+#        endif
-+#        if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3
-+#            define ENABLE_TONEMAPX_AVX_INTRINSICS
-+#        endif
-+#    endif // ARCH_X86
-+#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS
++            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
++            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
++            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
++            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
 +
-+typedef struct TonemapIntParams {
-+    double lut_peak;
-+    float *lin_lut;
-+    float *tonemap_lut;
-+    uint16_t *delin_lut;
-+    int in_yuv_off, out_yuv_off;
-+    int16_t (*yuv2rgb_coeffs)[3][3][8];
-+    int16_t (*rgb2yuv_coeffs)[3][3][8];
-+    double  (*rgb2rgb_coeffs)[3][3];
-+    int rgb2rgb_passthrough;
-+    const AVLumaCoefficients *coeffs, *ocoeffs;
-+    double desat;
-+} TonemapIntParams;
++            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
++            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
++            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
++            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
++            ux8 = _mm256_sub_epi32(ux8, in_uv_offx8);
++            vx8 = _mm256_sub_epi32(vx8, in_uv_offx8);
 +
-+enum SIMDVariant {
-+    SIMD_NONE = -1,
-+    SIMD_NEON,
-+    SIMD_SSE,
-+    SIMD_AVX
-+};
++            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
++            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
 +
-+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                 const int *dstlinesize, const int *srclinesize,
-+                                 int dstdepth, int srcdepth,
-+                                 int width, int height,
-+                                 const struct TonemapIntParams *params);
++            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
++            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
++            r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
++            r0x8a = _mm256_add_epi32(r0x8a, rndx8);
++            r0x8a = _mm256_srai_epi32(r0x8a, in_sh);
++            r0x8a = av_clip_int16_avx(r0x8a);
 +
-+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv,
-+                                    const uint16_t *srcy, const uint16_t *srcuv,
-+                                    const int *dstlinesize, const int *srclinesize,
-+                                    int dstdepth, int srcdepth,
-+                                    int width, int height,
-+                                    const struct TonemapIntParams *params);
++            r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8);
++            r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
++            r1x8a = _mm256_add_epi32(r1x8a, rndx8);
++            r1x8a = _mm256_srai_epi32(r1x8a, in_sh);
++            r1x8a = av_clip_int16_avx(r1x8a);
 +
-+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                   const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                   const int *dstlinesize, const int *srclinesize,
-+                                   int dstdepth, int srcdepth,
-+                                   int width, int height,
-+                                   const struct TonemapIntParams *params);
++            // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
++            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
++            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
++            g0x8a = _mm256_add_epi32(g0x8a, rndx8);
++            g0x8a = _mm256_srai_epi32(g0x8a, in_sh);
++            g0x8a = av_clip_int16_avx(g0x8a);
 +
-+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv,
-+                                         const uint16_t *srcy, const uint16_t *srcuv,
-+                                         const int *dstlinesize, const int *srclinesize,
-+                                         int dstdepth, int srcdepth,
-+                                         int width, int height,
-+                                         const struct TonemapIntParams *params);
++            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
++            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
++            g1x8a = _mm256_add_epi32(g1x8a, rndx8);
++            g1x8a = _mm256_srai_epi32(g1x8a, in_sh);
++            g1x8a = av_clip_int16_avx(g1x8a);
 +
-+#endif // AVFILTER_TONEMAPX_H
-Index: FFmpeg/libavfilter/x86/Makefile
-===================================================================
---- FFmpeg.orig/libavfilter/x86/Makefile
-+++ FFmpeg/libavfilter/x86/Makefile
-@@ -34,6 +34,8 @@ OBJS-$(CONFIG_STEREO3D_FILTER)
- OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
- OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
- OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
-+OBJS-$(CONFIG_TONEMAPX_FILTER)               += x86/vf_tonemapx_intrin_sse.o \
-+                                                x86/vf_tonemapx_intrin_avx.o
- OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
- OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
- OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
-Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
-@@ -0,0 +1,1367 @@
-+/*
-+ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
++            // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh);
++            b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
++            b0x8a = _mm256_add_epi32(b0x8a, rndx8);
++            b0x8a = _mm256_srai_epi32(b0x8a, in_sh);
++            b0x8a = av_clip_int16_avx(b0x8a);
 +
-+#include "vf_tonemapx_intrin_avx.h"
++            b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
++            b1x8a = _mm256_add_epi32(b1x8a, rndx8);
++            b1x8a = _mm256_srai_epi32(b1x8a, in_sh);
++            b1x8a = av_clip_int16_avx(b1x8a);
 +
-+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
-+#    include <immintrin.h>
-+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++            r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8);
++            r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
++            r0x8b = _mm256_add_epi32(r0x8b, rndx8);
++            r0x8b = _mm256_srai_epi32(r0x8b, in_sh);
++            r0x8b = av_clip_int16_avx(r0x8b);
 +
-+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
-+X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a)
-+{
-+__m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U));
-+__m256i mask = _mm256_set1_epi32(~0xFFFF);
-+__m256i condition = _mm256_and_si256(add_result, mask);
-+__m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256());
++            r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8);
++            r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
++            r1x8b = _mm256_add_epi32(r1x8b, rndx8);
++            r1x8b = _mm256_srai_epi32(r1x8b, in_sh);
++            r1x8b = av_clip_int16_avx(r1x8b);
 +
-+__m256i shifted = _mm256_srai_epi32(a, 31);
-+__m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF));
++            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
++            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
++            g0x8b = _mm256_add_epi32(g0x8b, rndx8);
++            g0x8b = _mm256_srai_epi32(g0x8b, in_sh);
++            g0x8b = av_clip_int16_avx(g0x8b);
 +
-+return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result));
-+}
++            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
++            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
++            g1x8b = _mm256_add_epi32(g1x8b, rndx8);
++            g1x8b = _mm256_srai_epi32(g1x8b, in_sh);
++            g1x8b = av_clip_int16_avx(g1x8b);
 +
-+X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in,
-+                                                 int16_t *r_out, int16_t *g_out, int16_t *b_out,
-+                                                 float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
-+                                                 const AVLumaCoefficients *coeffs,
-+                                                 const AVLumaCoefficients *ocoeffs, double desat,
-+                                                 double (*rgb2rgb)[3][3],
-+                                                 int rgb2rgb_passthrough)
-+{
-+    __m256i sig8;
-+    __m256 mapvalx8, r_linx8, g_linx8, b_linx8;
-+    __m256 offset = _mm256_set1_ps(0.5f);
-+    __m256i zerox8 = _mm256_setzero_si256();
-+    __m256i input_lut_offset = _mm256_set1_epi32(2048);
-+    __m256i upper_bound = _mm256_set1_epi32(32767);
-+    __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f);
-+    __m256i r, g, b, rx8, gx8, bx8;
++            b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
++            b0x8b = _mm256_add_epi32(b0x8b, rndx8);
++            b0x8b = _mm256_srai_epi32(b0x8b, in_sh);
++            b0x8b = av_clip_int16_avx(b0x8b);
 +
-+    float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8];
++            b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
++            b1x8b = _mm256_add_epi32(b1x8b, rndx8);
++            b1x8b = _mm256_srai_epi32(b1x8b, in_sh);
++            b1x8b = av_clip_int16_avx(b1x8b);
 +
-+    sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in));
-+    sig8 = _mm256_add_epi32(sig8, input_lut_offset);
-+    sig8 = _mm256_min_epi32(sig8, upper_bound);
-+    sig8 = _mm256_max_epi32(sig8, zerox8);
++            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
 +
-+    r = _mm256_add_epi32(r_in, input_lut_offset);
-+    r = _mm256_min_epi32(r, upper_bound);
-+    r = _mm256_max_epi32(r, zerox8);
-+    g = _mm256_add_epi32(g_in, input_lut_offset);
-+    g = _mm256_min_epi32(g, upper_bound);
-+    g = _mm256_max_epi32(g, zerox8);
-+    b = _mm256_add_epi32(b_in, input_lut_offset);
-+    b = _mm256_min_epi32(b, upper_bound);
-+    b = _mm256_max_epi32(b, zerox8);
++            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r);
++            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
++            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
 +
-+#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \
-+r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)];                             \
-+g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)];                             \
-+b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)];
++            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
++            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
++            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
 +
-+    LOAD_LUT(0)
-+    LOAD_LUT(1)
-+    LOAD_LUT(2)
-+    LOAD_LUT(3)
-+    LOAD_LUT(4)
-+    LOAD_LUT(5)
-+    LOAD_LUT(6)
-+    LOAD_LUT(7)
++            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
++            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
++            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
 +
-+#undef LOAD_LUT
++            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
++            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
++            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
 +
-+    mapvalx8 = _mm256_loadu_ps(mapval8);
-+    r_linx8 = _mm256_loadu_ps(r_lin8);
-+    g_linx8 = _mm256_loadu_ps(g_lin8);
-+    b_linx8 = _mm256_loadu_ps(b_lin8);
++            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
++            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
++            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
 +
-+    if (!rgb2rgb_passthrough) {
-+        r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0]));
-+        r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8);
-+        r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8);
++            y0ox16 = _mm256_packus_epi32(yoax8, yobx8);
++            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16);
 +
-+        g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1]));
-+        g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8);
-+        g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8);
++            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
++            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
++            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
 +
-+        b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2]));
-+        b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8);
-+        b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8);
-+    }
++            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
++            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
++            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
 +
-+    if (desat > 0) {
-+        __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS);
-+        __m256 desat8 = _mm256_set1_ps((float)desat);
-+        __m256 luma8 = _mm256_set1_ps(0);
-+        __m256 overbright8;
++            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
++            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
++            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
 +
-+        luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8);
-+        luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8);
-+        luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8);
-+        overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8));
-+        r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8);
-+        r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8);
-+        g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8);
-+        g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8);
-+        b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8);
-+        b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8);
-+    }
++            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
++            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
++            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
 +
-+    r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8);
-+    g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8);
-+    b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8);
++            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
++            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
++            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
 +
-+    r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset);
-+    g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset);
-+    b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset);
++            y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8);
++            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16);
 +
-+    rx8 = _mm256_cvttps_epi32(r_linx8);
-+    rx8 = _mm256_min_epi32(rx8, upper_bound);
-+    rx8 = _mm256_max_epi32(rx8, zerox8);
++            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8));
++            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2));
++            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
 +
-+    gx8 = _mm256_cvttps_epi32(g_linx8);
-+    gx8 = _mm256_min_epi32(gx8, upper_bound);
-+    gx8 = _mm256_max_epi32(gx8, zerox8);
++            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
++            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
++            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
 +
-+    bx8 = _mm256_cvttps_epi32(b_linx8);
-+    bx8 = _mm256_min_epi32(bx8, upper_bound);
-+    bx8 = _mm256_max_epi32(bx8, zerox8);
++            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
++            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
++            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
 +
-+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \
-+g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)];                       \
-+b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)];
++            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
++            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
++            uox8 = _mm256_srai_epi32(uox8, out_sh);
++            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++            uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256());
++            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8));
 +
-+    SAVE_COLOR(0)
-+    SAVE_COLOR(1)
-+    SAVE_COLOR(2)
-+    SAVE_COLOR(3)
-+    SAVE_COLOR(4)
-+    SAVE_COLOR(5)
-+    SAVE_COLOR(6)
-+    SAVE_COLOR(7)
++            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
++            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
++            vox8 = _mm256_srai_epi32(vox8, out_sh);
++            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++            vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256());
++            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
++            _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8));
++        }
++    }
 +
-+#undef SAVE_COLOR
-+}
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff0;
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv,
++                                      rsrcy, rsrcu, rsrcv,
++                                      dstlinesize, srclinesize,
++                                      dstdepth, srcdepth,
++                                      remainw, rheight, params);
++    }
 +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++}
 +
-+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
-+                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                               const int *dstlinesize, const int *srclinesize,
-+                                               int dstdepth, int srcdepth,
-+                                               int width, int height,
-+                                               const struct TonemapIntParams *params)
++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv,
++                                                  const uint16_t *srcy, const uint16_t *srcuv,
++                                                  const int *dstlinesize, const int *srclinesize,
++                                                  int dstdepth, int srcdepth,
++                                                  int width, int height,
++                                                  const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
 +    uint8_t *rdsty = dsty;
-+    uint8_t *rdstu = dstu;
-+    uint8_t *rdstv = dstv;
++    uint8_t *rdstuv = dstuv;
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcu = srcu;
-+    const uint16_t *rsrcv = srcv;
++    const uint16_t *rsrcuv = srcuv;
 +    int rheight = height;
 +    // not zero when not divisible by 16
 +    // intentionally leave last pixel emtpy when input is odd
@@ -3076,7 +6060,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    __m256i cyx8 = _mm256_set1_epi32(cy);
 +    __m256i rndx8 = _mm256_set1_epi32(in_rnd);
 +
-+    __m256i ux8, vx8;
++    __m256i uvx16, uvx8a, uvx8b;
 +    __m256i y0x16, y1x16;
 +    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
 +    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
@@ -3090,35 +6074,42 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    __m256i r1ox16, g1ox16, b1ox16;
 +    __m256i y1ox16;
 +    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
-+    __m256i y1oax8, y1obx8;
++    __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16;
 +    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
 +    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
-+                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
 +        for (int xx = 0; xx < width >> 4; xx++) {
 +            int x = xx << 4;
 +
 +            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
 +            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
-+            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
-+            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
++            uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x));
++
++            if (in_depth == 10) {
++                // shift to low10bits for 10bit input
++                y0x16 = _mm256_srli_epi16(y0x16, 6);
++                y1x16 = _mm256_srli_epi16(y1x16, 6);
++                uvx16 = _mm256_srli_epi16(uvx16, 6);
++            }
 +
 +            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
 +            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
 +            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
 +            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
-+
++            uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0));
++            uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1));
 +            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
 +            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
 +            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
 +            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
-+            ux8 = _mm256_sub_epi32(ux8, in_uv_offx8);
-+            vx8 = _mm256_sub_epi32(vx8, in_uv_offx8);
++            uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8);
++            uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8);
 +
-+            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
-+            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
-+            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
-+            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0));
++            ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0));
++            vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1));
++            vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1));
 +
 +            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
 +            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
@@ -3291,20 +6282,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
 +            uox8 = _mm256_srai_epi32(uox8, out_sh);
 +            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
-+            uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256());
-+            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0));
-+            uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256());
-+            _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8));
 +
 +            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
 +            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
 +            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
 +            vox8 = _mm256_srai_epi32(vox8, out_sh);
 +            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
-+            vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256());
-+            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
-+            vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256());
-+            _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8));
++
++            uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
++            uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
++            uvox16 = _mm256_packs_epi32(uvoax8, uvobx8);
++            _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
 +        }
 +    }
 +
@@ -3312,34 +6300,30 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    if (remainw) {
 +        int offset = width & (int)0xfffffff0;
 +        rdsty += offset;
-+        rdstu += offset >> 1;
-+        rdstv += offset >> 1;
-+        rsrcy += offset;
-+        rsrcu += offset >> 1;
-+        rsrcv += offset >> 1;
-+        tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv,
-+                                    rsrcy, rsrcu, rsrcv,
-+                                    dstlinesize, srclinesize,
-+                                    dstdepth, srcdepth,
-+                                    remainw, rheight, params);
++        rdstuv += offset;
++        rsrcy += offset;
++        rsrcuv += offset;
++        tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv,
++                                       rsrcy, rsrcuv,
++                                       dstlinesize, srclinesize,
++                                       dstdepth, srcdepth,
++                                       remainw, rheight, params);
 +    }
 +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
 +}
 +
-+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                                 const int *dstlinesize, const int *srclinesize,
-+                                                 int dstdepth, int srcdepth,
-+                                                 int width, int height,
-+                                                 const struct TonemapIntParams *params)
++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv,
++                                                       const uint16_t *srcy, const uint16_t *srcuv,
++                                                       const int *dstlinesize, const int *srclinesize,
++                                                       int dstdepth, int srcdepth,
++                                                       int width, int height,
++                                                       const struct TonemapIntParams *params)
 +{
 +#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
 +    uint16_t *rdsty = dsty;
-+    uint16_t *rdstu = dstu;
-+    uint16_t *rdstv = dstv;
++    uint16_t *rdstuv = dstuv;
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcu = srcu;
-+    const uint16_t *rsrcv = srcv;
++    const uint16_t *rsrcuv = srcuv;
 +    int rheight = height;
 +    // not zero when not divisible by 8
 +    // intentionally leave last pixel emtpy when input is odd
@@ -3354,6 +6338,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    const int out_uv_offset = 128 << (out_depth - 8);
 +    const int out_sh = 29 - out_depth;
 +    const int out_rnd = 1 << (out_sh - 1);
++    const int out_sh2 = 16 - out_depth;
 +
 +    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
 +    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
@@ -3381,7 +6366,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    __m256i y0ox16;
 +    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
 +    __m256i yoax8, yobx8;
-+    __m256i ux8, vx8;
++    __m256i uvx16, uvx8a, uvx8b;
 +    __m256i y0x16, y1x16;
 +    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
 +    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
@@ -3390,35 +6375,42 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    __m256i r1ox16, g1ox16, b1ox16;
 +    __m256i y1ox16;
 +    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
-+    __m256i y1oax8, y1obx8;
++    __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16;
 +    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
 +    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
++                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
 +        for (int xx = 0; xx < width >> 4; xx++) {
 +            int x = xx << 4;
 +
 +            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
 +            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
-+            ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1))));
-+            vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1))));
++            uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x));
++
++            if (in_depth == 10) {
++                // shift to low10bits for 10bit input
++                y0x16 = _mm256_srli_epi16(y0x16, 6);
++                y1x16 = _mm256_srli_epi16(y1x16, 6);
++                uvx16 = _mm256_srli_epi16(uvx16, 6);
++            }
 +
 +            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
 +            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
 +            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
 +            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
-+
++            uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0));
++            uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1));
 +            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
 +            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
 +            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
 +            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
-+            ux8 = _mm256_sub_epi32(ux8, in_uv_offx8);
-+            vx8 = _mm256_sub_epi32(vx8, in_uv_offx8);
++            uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8);
++            uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8);
 +
-+            ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
-+            ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
-+            vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
-+            vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4));
++            ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0));
++            ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0));
++            vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1));
++            vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1));
 +
 +            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
 +            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
@@ -3536,6 +6528,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +
 +            y0ox16 = _mm256_packus_epi32(yoax8, yobx8);
 +            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2);
 +            _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16);
 +
 +            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
@@ -3566,6 +6559,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +
 +            y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8);
 +            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
++            y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2);
 +            _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16);
 +
 +            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
@@ -3591,18 +6585,18 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
 +            uox8 = _mm256_srai_epi32(uox8, out_sh);
 +            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
-+            uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256());
-+            uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0));
-+            _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8));
 +
 +            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
 +            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
 +            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
 +            vox8 = _mm256_srai_epi32(vox8, out_sh);
 +            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
-+            vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256());
-+            vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0));
-+            _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8));
++
++            uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
++            uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
++            uvox16 = _mm256_packus_epi32(uvoax8, uvobx8);
++            uvox16 = _mm256_slli_epi16(uvox16, out_sh2);
++            _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16);
 +        }
 +    }
 +
@@ -3610,354 +6604,513 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    if (remainw) {
 +        int offset = width & (int)0xfffffff0;
 +        rdsty += offset;
-+        rdstu += offset >> 1;
-+        rdstv += offset >> 1;
++        rdstuv += offset;
 +        rsrcy += offset;
-+        rsrcu += offset >> 1;
-+        rsrcv += offset >> 1;
-+        tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv,
-+                                      rsrcy, rsrcu, rsrcv,
-+                                      dstlinesize, srclinesize,
-+                                      dstdepth, srcdepth,
-+                                      remainw, rheight, params);
++        rsrcuv += offset;
++        tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv,
++                                            rsrcy, rsrcuv,
++                                            dstlinesize, srclinesize,
++                                            dstdepth, srcdepth,
++                                            remainw, rheight, params);
 +    }
 +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
 +}
+Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h
+@@ -0,0 +1,68 @@
++/*
++ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
++#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
++
++#include "libavfilter/vf_tonemapx.h"
++
++X86_64_V3 void tonemap_frame_dovi_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                             const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                             const int *dstlinesize, const int *srclinesize,
++                                             int dstdepth, int srcdepth,
++                                             int width, int height,
++                                             const struct TonemapIntParams *params);
++
++X86_64_V3 void tonemap_frame_dovi_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                               const int *dstlinesize, const int *srclinesize,
++                                               int dstdepth, int srcdepth,
++                                               int width, int height,
++                                               const struct TonemapIntParams *params);
++
++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                               const int *dstlinesize, const int *srclinesize,
++                                               int dstdepth, int srcdepth,
++                                               int width, int height,
++                                               const struct TonemapIntParams *params);
++
++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
++                                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                                 const int *dstlinesize, const int *srclinesize,
++                                                 int dstdepth, int srcdepth,
++                                                 int width, int height,
++                                                 const struct TonemapIntParams *params);
 +
 +X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv,
 +                                                  const uint16_t *srcy, const uint16_t *srcuv,
 +                                                  const int *dstlinesize, const int *srclinesize,
 +                                                  int dstdepth, int srcdepth,
 +                                                  int width, int height,
-+                                                  const struct TonemapIntParams *params)
++                                                  const struct TonemapIntParams *params);
++
++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv,
++                                                       const uint16_t *srcy, const uint16_t *srcuv,
++                                                       const int *dstlinesize, const int *srclinesize,
++                                                       int dstdepth, int srcdepth,
++                                                       int width, int height,
++                                                       const struct TonemapIntParams *params);
++
++#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
+Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
+===================================================================
+--- /dev/null
++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
+@@ -0,0 +1,2353 @@
++/*
++ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "vf_tonemapx_intrin_sse.h"
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++#    include <immintrin.h>
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++// GCC 10 and below do not implement _mm_storeu_si32 with the movd instruction;
++// as a workaround, cast the register to a float register and store its low lane with movss.
++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
++__attribute__((always_inline))
++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) {
++    _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a));
++    return;
++}
++#endif
++
++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) // clamps each 32-bit lane of a to [0, 0x7FFF]
 +{
-+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
-+    uint8_t *rdsty = dsty;
-+    uint8_t *rdstuv = dstuv;
-+    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcuv = srcuv;
-+    int rheight = height;
-+    // not zero when not divisible by 16
-+    // intentionally leave last pixel emtpy when input is odd
-+    int remainw = width & 14;
++    __m128i mask = _mm_set1_epi32(0x7FFF); // NOTE(review): upper bound is 0x7FFF, not 0xFFFF as the name suggests - confirm intended
++    __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); // isolate every bit above the low 15
 +
-+    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
++    __m128i zero = _mm_setzero_si128();
++    __m128i cmp = _mm_cmpeq_epi32(condition, zero); // all-ones where the lane is already within [0, 0x7FFF]
 +
-+    const int out_depth = dstdepth;
-+    const int out_uv_offset = 128 << (out_depth - 8);
-+    const int out_sh = 29 - out_depth;
-+    const int out_rnd = 1 << (out_sh - 1);
++    __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); // replacement value: 0 when a is negative, 0x7FFF otherwise
++    __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); // keep in-range lanes, substitute the replacement elsewhere
 +
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++    return result;
++}
 +
-+    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
-+    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
-+    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
-+    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
-+    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
-+    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
-+    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
-+    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) // clamps each 32-bit lane of a to [-32768, 32767]
++{
++    __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); // bias so in-range lanes land in [0, 0xFFFF]
++    __m128i mask = _mm_set1_epi32(~0xFFFF);
++    __m128i condition = _mm_and_si128(add_result, mask); // non-zero when the biased lane needs more than 16 bits
++    __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); // all-ones where the lane is already in range
 +
-+    int16_t r[16], g[16], b[16];
-+    int16_t r1[16], g1[16], b1[16];
-+    __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off);
-+    __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset);
-+    __m256i cyx8 = _mm256_set1_epi32(cy);
-+    __m256i rndx8 = _mm256_set1_epi32(in_rnd);
++    __m128i shifted = _mm_srai_epi32(a, 31); // 0 for non-negative lanes, -1 for negative lanes
++    __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); // saturation value: 0x7FFF or ~0x7FFF (-32768)
 +
-+    __m256i uvx16, uvx8a, uvx8b;
-+    __m256i y0x16, y1x16;
-+    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
-+    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
-+    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
++    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); // keep in-range lanes, saturate the rest
++}
 +
-+    __m256i r0ox16, g0ox16, b0ox16;
-+    __m256i y0ox16;
-+    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
-+    __m256i yoax8, yobx8;
++X86_64_V2 inline static __m128 mix_float32x4(__m128 x, __m128 y, __m128 a) // per-lane blend: returns x + (y - x) * a
++{
++    __m128 n = _mm_sub_ps(y, x); // n = y - x
++    n = _mm_mul_ps(n, a);        // n = (y - x) * a
++    n = _mm_add_ps(n, x);        // n = x + (y - x) * a
++    return n;
++}
 +
-+    __m256i r1ox16, g1ox16, b1ox16;
-+    __m256i y1ox16;
-+    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
-+    __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16;
-+    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1],
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
-+        for (int xx = 0; xx < width >> 4; xx++) {
-+            int x = xx << 4;
++X86_64_V2 inline static float reduce_floatx4(__m128 x) { // horizontal sum of the four float lanes of x
++    x = _mm_hadd_ps(x, x); // {x0+x1, x2+x3, x0+x1, x2+x3}
++    x = _mm_hadd_ps(x, x); // every lane now holds x0+x1+x2+x3
++    return _mm_cvtss_f32(x); // extract lane 0
++}
++
++X86_64_V2 static inline float reshape_poly(float s, __m128 coeffs) // evaluates coeffs[0] + coeffs[1]*s + coeffs[2]*s*s
++{
++    __m128 ps = _mm_set_ps(0.0f, s * s, s, 1.0f); // lanes 0..3 = {1, s, s*s, 0}; lane 3 of coeffs is multiplied by 0
++    ps = _mm_mul_ps(ps, coeffs);
++    return reduce_floatx4(ps); // sum of the per-lane products
++}
 +
-+            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
-+            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
-+            uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x));
++X86_64_V2 inline static float reshape_mmr(__m128 sig, __m128 coeffs, const float* mmr,
++                                          int mmr_single, int min_order, int max_order) // returns coeffs[0] plus dot products of sig-derived terms with 4-float rows of mmr
++{
++    float s = _mm_cvtss_f32(coeffs); // accumulator starts at lane 0 of coeffs
++    int mmr_idx = 0;
++    int order = 0;
++
++    __m128 mmr_coeffs, ps;
++    __m128 sigX01 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // {sig[0]*sig[1], sig[1]*sig[1], sig[2]*sig[1], sig[3]*sig[1]}
++    __m128 sigX02 = _mm_mul_ps(sig, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[2], sig[1]*sig[2], sig[2]*sig[2], sig[3]*sig[2]}
++    __m128 sigX12 = _mm_mul_ps(sigX01, _mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // {sig[0]*sig[1]*sig[2], sig[1]*sig[1]*sig[2], sig[2]*sig[1]*sig[2], sig[3]*sig[1]*sig[2]}
++    __m128 sigX = sigX01; // sig[0]*sig[1] now positioned at 0
++
++    sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(0, 1, 0)); // sig[0]*sig[2] at 1
++    sigX = _mm_insert_ps(sigX, sigX02, _MM_MK_INSERTPS_NDX(1, 2, 0)); // sig[1]*sig[2] at 2
++    sigX = _mm_insert_ps(sigX, sigX12, _MM_MK_INSERTPS_NDX(0, 3, 0)); // sig[0]*sig[1]*sig[2] at 3
++
++    mmr_idx = mmr_single ? 0 : (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 2, 0, 1))); // coeffs[1] truncated to int, unless mmr_single
++    order = (int)_mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1, 2, 0, 3))); // coeffs[3] truncated to int
++
++    // first order: dot(sig, row 0) + dot(sigX, row 1), rows being 4 consecutive floats of mmr
++    mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 0*4]); // NOTE(review): mmr_idx is added as a float offset (not scaled by 4) - confirm its unit
++    ps = _mm_mul_ps(sig, mmr_coeffs);
++    s += reduce_floatx4(ps);
++    mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 1*4]);
++    ps = _mm_mul_ps(sigX, mmr_coeffs);
++    s += reduce_floatx4(ps);
++
++    if (max_order >= 2 && (min_order >= 2 || order >= 2)) { // second order: squared terms against rows 2 and 3
++        __m128 sig2 = _mm_mul_ps(sig, sig);
++        __m128 sigX2 = _mm_mul_ps(sigX, sigX);
++
++        mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 2*4]);
++        ps = _mm_mul_ps(sig2, mmr_coeffs);
++        s += reduce_floatx4(ps);
++        mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 3*4]);
++        ps = _mm_mul_ps(sigX2, mmr_coeffs);
++        s += reduce_floatx4(ps);
++
++        if (max_order == 3 && (min_order == 3 || order >= 3)) { // third order: cubed terms against rows 4 and 5
++            __m128 sig3 = _mm_mul_ps(sig2, sig);
++            __m128 sigX3 = _mm_mul_ps(sigX2, sigX);
++
++            mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 4*4]);
++            ps = _mm_mul_ps(sig3, mmr_coeffs);
++            s += reduce_floatx4(ps);
++            mmr_coeffs = _mm_loadu_ps(&mmr[mmr_idx + 5*4]);
++            ps = _mm_mul_ps(sigX3, mmr_coeffs);
++            s += reduce_floatx4(ps);
++        }
++    }
 +
-+            if (in_depth == 10) {
-+                // shift to low10bits for 10bit input
-+                y0x16 = _mm256_srli_epi16(y0x16, 6);
-+                y1x16 = _mm256_srli_epi16(y1x16, 6);
-+                uvx16 = _mm256_srli_epi16(uvx16, 6);
-+            }
++    return s;
++}
 +
-+            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
-+            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
-+            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
-+            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
-+            uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0));
-+            uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1));
-+            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
-+            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
-+            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
-+            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
-+            uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8);
-+            uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8);
++#define CLAMP(a, b, c) (FFMIN(FFMAX((a), (b)), (c))) // clamp a into [b, c]: FFMAX against b first, then FFMIN against c
++X86_64_V2 inline static __m128 reshape_dovi_iptpqc2(__m128 sig, const TonemapIntParams *ctx) // Reshapes lanes 0..2 of sig (labelled I, P, T below) using the float table in ctx->dovi_pbuf; lane 3 of sig is returned unchanged.
++{
++    int has_mmr_poly; // recomputed per component: both the MMR flag and the polynomial flag are nonzero
++    float s; // scalar value of the component currently being reshaped
++
++    float *src_dovi_params = ctx->dovi_pbuf; // parameter records start at offset 0, 8 floats per component
++    float *src_dovi_pivots = ctx->dovi_pbuf + 24; // pivot table starts 24 floats into the buffer
++    float *src_dovi_coeffs = ctx->dovi_pbuf + 48; // coefficient vectors (4 floats each), 8 vectors per component
++    float *src_dovi_mmr = ctx->dovi_pbuf + 144; // MMR vectors (4 floats each), 48 vectors per component
++
++    float* dovi_params_i = src_dovi_params + 0*8; // component 0 (I)
++    float* dovi_pivots_i = src_dovi_pivots + 0*8;
++    float* dovi_coeffs_i = src_dovi_coeffs + 0 * 8 * 4; // stride is 8 vectors of 4 floats
++    float* dovi_mmr_i = src_dovi_mmr + 0 * 48 * 4; // stride is 48 vectors of 4 floats
++    int dovi_num_pivots_i = dovi_params_i[0]; // the record stores floats; integer fields are converted (truncated) on read
++    int dovi_has_mmr_i = dovi_params_i[1];
++    int dovi_has_poly_i = dovi_params_i[2];
++    int dovi_mmr_single_i = dovi_params_i[3];
++    int dovi_min_order_i = dovi_params_i[4];
++    int dovi_max_order_i = dovi_params_i[5];
++    float dovi_lo_i = dovi_params_i[6]; // lower clamp bound for the reshaped I value
++    float dovi_hi_i = dovi_params_i[7]; // upper clamp bound for the reshaped I value
++
++    float* dovi_params_p = src_dovi_params + 1*8; // component 1 (P); no pivot table or pivot count is read for it
++    float* dovi_coeffs_p = src_dovi_coeffs + 1*8 * 4; // only the first coefficient vector of this component is loaded below
++    float* dovi_mmr_p = src_dovi_mmr + 1*48 * 4; // MMR vectors of component 1
++    int dovi_has_mmr_p = dovi_params_p[1];
++    int dovi_has_poly_p = dovi_params_p[2];
++    int dovi_mmr_single_p = dovi_params_p[3];
++    int dovi_min_order_p = dovi_params_p[4];
++    int dovi_max_order_p = dovi_params_p[5];
++    float dovi_lo_p = dovi_params_p[6];
++    float dovi_hi_p = dovi_params_p[7];
++
++    float* dovi_params_t = src_dovi_params + 2*8; // component 2 (T); handled exactly like P
++    float* dovi_coeffs_t = src_dovi_coeffs + 2*8 * 4; // only the first coefficient vector of this component is loaded below
++    float* dovi_mmr_t = src_dovi_mmr + 2*48 * 4; // MMR vectors of component 2
++    int dovi_has_mmr_t = dovi_params_t[1];
++    int dovi_has_poly_t = dovi_params_t[2];
++    int dovi_mmr_single_t = dovi_params_t[3];
++    int dovi_min_order_t = dovi_params_t[4];
++    int dovi_max_order_t = dovi_params_t[5];
++    float dovi_lo_t = dovi_params_t[6];
++    float dovi_hi_t = dovi_params_t[7];
++
++    __m128 coeffs, result;
++
++    // reshape I
++    s = _mm_cvtss_f32(sig); // lane 0 of sig
++    result = sig; // start from the input so lanes that are not rewritten pass through
++    if (dovi_num_pivots_i > 2) { // pick a coefficient vector by comparing s against the pivots
++        __m128 m01 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i), _mm_loadu_ps(dovi_coeffs_i + 4), _mm_set1_ps(s >= dovi_pivots_i[0])); // each weight is 0.0f or 1.0f (result of the comparison)
++        __m128 m23 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 2*4), _mm_loadu_ps(dovi_coeffs_i + 3*4), _mm_set1_ps(s >= dovi_pivots_i[2]));
++        __m128 m0123 = mix_float32x4(m01, m23, _mm_set1_ps(s >= dovi_pivots_i[1]));
++        __m128 m45 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 4*4), _mm_loadu_ps(dovi_coeffs_i + 5*4), _mm_set1_ps(s >= dovi_pivots_i[4]));
++        __m128 m67 = mix_float32x4(_mm_loadu_ps(dovi_coeffs_i + 6*4), _mm_loadu_ps(dovi_coeffs_i + 7*4), _mm_set1_ps(s >= dovi_pivots_i[6]));
++        __m128 m4567 = mix_float32x4(m45, m67, _mm_set1_ps(s >= dovi_pivots_i[5]));
++        coeffs = mix_float32x4(m0123, m4567, _mm_set1_ps(s >= dovi_pivots_i[3])); // assumes mix_float32x4(x, y, a) blends x toward y by a, like the NEON helper - TODO confirm; with 0/1 weights the tree then selects one of the 8 vectors
++    } else {
++        coeffs = _mm_loadu_ps(dovi_coeffs_i); // otherwise the first coefficient vector is used as is
++    }
 +
-+            ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0));
-+            ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0));
-+            vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1));
-+            vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1));
++    has_mmr_poly = dovi_has_mmr_i && dovi_has_poly_i;
 +
-+            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
-+            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
-+            r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
-+            r0x8a = _mm256_add_epi32(r0x8a, rndx8);
-+            r0x8a = _mm256_srai_epi32(r0x8a, in_sh);
-+            r0x8a = av_clip_int16_avx(r0x8a);
++    if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_i)) // polynomial when both kinds exist and lane 3 of coeffs is 0, or when only the polynomial flag is set; MMR otherwise
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_i, // result still equals sig at this point
++                        dovi_mmr_single_i, dovi_min_order_i, dovi_max_order_i);
 +
-+            r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8);
-+            r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
-+            r1x8a = _mm256_add_epi32(r1x8a, rndx8);
-+            r1x8a = _mm256_srai_epi32(r1x8a, in_sh);
-+            r1x8a = av_clip_int16_avx(r1x8a);
++    result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_i, dovi_hi_i)), _MM_MK_INSERTPS_NDX(0, 0, 0)); // store the clamped value into lane 0 only
 +
-+            // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
-+            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
-+            g0x8a = _mm256_add_epi32(g0x8a, rndx8);
-+            g0x8a = _mm256_srai_epi32(g0x8a, in_sh);
-+            g0x8a = av_clip_int16_avx(g0x8a);
++    // reshape P
++    s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(1, 1, 1, 1))); // lane 1 of the original sig
++    coeffs = _mm_loadu_ps(dovi_coeffs_p);
++    has_mmr_poly = dovi_has_mmr_p && dovi_has_poly_p;
 +
-+            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
-+            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
-+            g1x8a = _mm256_add_epi32(g1x8a, rndx8);
-+            g1x8a = _mm256_srai_epi32(g1x8a, in_sh);
-+            g1x8a = av_clip_int16_avx(g1x8a);
++    if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_p)) // same poly/MMR decision as for I
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_p, // NOTE(review): result already holds the reshaped I in lane 0 while s is read from the original sig - confirm MMR is meant to see the updated lane
++                        dovi_mmr_single_p, dovi_min_order_p, dovi_max_order_p);
 +
-+            // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh);
-+            b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
-+            b0x8a = _mm256_add_epi32(b0x8a, rndx8);
-+            b0x8a = _mm256_srai_epi32(b0x8a, in_sh);
-+            b0x8a = av_clip_int16_avx(b0x8a);
++    result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_p, dovi_hi_p)), _MM_MK_INSERTPS_NDX(0, 1, 0)); // store the clamped value into lane 1 only
 +
-+            b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
-+            b1x8a = _mm256_add_epi32(b1x8a, rndx8);
-+            b1x8a = _mm256_srai_epi32(b1x8a, in_sh);
-+            b1x8a = av_clip_int16_avx(b1x8a);
++    // reshape T
++    s = _mm_cvtss_f32(_mm_shuffle_ps(sig, sig, _MM_SHUFFLE(2, 2, 2, 2))); // lane 2 of the original sig
++    coeffs = _mm_loadu_ps(dovi_coeffs_t);
++    has_mmr_poly = dovi_has_mmr_t && dovi_has_poly_t;
 +
-+            r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8);
-+            r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
-+            r0x8b = _mm256_add_epi32(r0x8b, rndx8);
-+            r0x8b = _mm256_srai_epi32(r0x8b, in_sh);
-+            r0x8b = av_clip_int16_avx(r0x8b);
++    if ((has_mmr_poly && _mm_cvtss_f32(_mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3, 3, 3, 3))) == 0.0f) || (!has_mmr_poly && dovi_has_poly_t)) // same poly/MMR decision as for I
++        s = reshape_poly(s, coeffs);
++    else
++        s = reshape_mmr(result, coeffs, dovi_mmr_t, // NOTE(review): result already holds the reshaped I and P in lanes 0 and 1 here - confirm MMR is meant to see the updated lanes
++                        dovi_mmr_single_t, dovi_min_order_t, dovi_max_order_t);
 +
-+            r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8);
-+            r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
-+            r1x8b = _mm256_add_epi32(r1x8b, rndx8);
-+            r1x8b = _mm256_srai_epi32(r1x8b, in_sh);
-+            r1x8b = av_clip_int16_avx(r1x8b);
++    result = _mm_insert_ps(result, _mm_set1_ps(CLAMP(s, dovi_lo_t, dovi_hi_t)), _MM_MK_INSERTPS_NDX(0, 2, 0)); // store the clamped value into lane 2 only
 +
-+            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
-+            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
-+            g0x8b = _mm256_add_epi32(g0x8b, rndx8);
-+            g0x8b = _mm256_srai_epi32(g0x8b, in_sh);
-+            g0x8b = av_clip_int16_avx(g0x8b);
++    return result; // lanes 0..2 reshaped and clamped, lane 3 as received
++}
 +
-+            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
-+            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
-+            g1x8b = _mm256_add_epi32(g1x8b, rndx8);
-+            g1x8b = _mm256_srai_epi32(g1x8b, in_sh);
-+            g1x8b = av_clip_int16_avx(g1x8b);
++X86_64_V2 inline static void ycc2rgbx4(__m128* dy, __m128* dcb, __m128* dcr, // Applies the 3x3 matrix `nonlinear` to four (y, cb, cr) triples at once and subtracts ycc_offset per output row.
++                                       __m128 y, __m128 cb, __m128 cr, // inputs: one vector per channel, four pixels each
++                                       const double nonlinear[3][3], const float ycc_offset[3]) // matrix entries are narrowed from double to float before use
++{
++    *dy = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[0][0])); // row 0: m00*y + m01*cb + m02*cr - ycc_offset[0]
++    *dy = _mm_add_ps(*dy, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[0][1])));
++    *dy = _mm_add_ps(*dy, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[0][2])));
++    *dy = _mm_sub_ps(*dy, _mm_set1_ps(ycc_offset[0])); // the offset is subtracted after the matrix product, not before
++
++    *dcb = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[1][0])); // row 1: m10*y + m11*cb + m12*cr - ycc_offset[1]
++    *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[1][1])));
++    *dcb = _mm_add_ps(*dcb, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[1][2])));
++    *dcb = _mm_sub_ps(*dcb, _mm_set1_ps(ycc_offset[1]));
++
++    *dcr = _mm_mul_ps(y, _mm_set1_ps((float)nonlinear[2][0])); // row 2: m20*y + m21*cb + m22*cr - ycc_offset[2]
++    *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cb, _mm_set1_ps((float)nonlinear[2][1])));
++    *dcr = _mm_add_ps(*dcr, _mm_mul_ps(cr, _mm_set1_ps((float)nonlinear[2][2])));
++    *dcr = _mm_sub_ps(*dcr, _mm_set1_ps(ycc_offset[2]));
++}
 +
-+            b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
-+            b0x8b = _mm256_add_epi32(b0x8b, rndx8);
-+            b0x8b = _mm256_srai_epi32(b0x8b, in_sh);
-+            b0x8b = av_clip_int16_avx(b0x8b);
++X86_64_V2 inline static void lms2rgbx4(__m128* dl, __m128* dm, __m128* ds, // Multiplies four (l, m, s) triples by the 3x3 lms2rgb_matrix; each output receives one matrix row.
++                                       __m128 l, __m128 m, __m128 s, // inputs are passed by value, so outputs never alias them inside this function
++                                       const double lms2rgb_matrix[3][3]) // entries are narrowed from double to float before use
++{
++    *dl = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[0][0])); // row 0: m00*l + m01*m + m02*s
++    *dl = _mm_add_ps(*dl, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[0][1])));
++    *dl = _mm_add_ps(*dl, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[0][2])));
 +
-+            b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
-+            b1x8b = _mm256_add_epi32(b1x8b, rndx8);
-+            b1x8b = _mm256_srai_epi32(b1x8b, in_sh);
-+            b1x8b = av_clip_int16_avx(b1x8b);
++    *dm = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[1][0])); // row 1: m10*l + m11*m + m12*s
++    *dm = _mm_add_ps(*dm, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[1][1])));
++    *dm = _mm_add_ps(*dm, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[1][2])));
 +
-+            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
-+                                params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
-+                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
-+                                params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
-+                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
-+                                params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
-+                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
-+                                params->lin_lut, params->tonemap_lut, params->delin_lut,
-+                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
-+                                params->rgb2rgb_passthrough);
++    *ds = _mm_mul_ps(l, _mm_set1_ps((float)lms2rgb_matrix[2][0])); // row 2: m20*l + m21*m + m22*s
++    *ds = _mm_add_ps(*ds, _mm_mul_ps(m, _mm_set1_ps((float)lms2rgb_matrix[2][1])));
++    *ds = _mm_add_ps(*ds, _mm_mul_ps(s, _mm_set1_ps((float)lms2rgb_matrix[2][2])));
++}
 +
-+            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r);
-+            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
-+            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, // Tone-maps four RGB pixels: LUT linearization, optional rgb2rgb matrix, optional desaturation, tonemap gain, then delin_lut lookup.
++                                                 int16_t *r_out, int16_t *g_out, int16_t *b_out, // elements [0..3] of each output array are written
++                                                 float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, // lookup tables indexed by clipped integer values (see below)
++                                                 const AVLumaCoefficients *coeffs, // luma weights used by the desaturation step
++                                                 const AVLumaCoefficients *ocoeffs, double desat, // ocoeffs is not referenced in this function; desat <= 0 disables desaturation
++                                                 double (*rgb2rgb)[3][3], // matrix applied to the linear values unless passthrough is set
++                                                 int rgb2rgb_passthrough) // nonzero skips the rgb2rgb matrix
++{
++    __m128i sig4;
++    __m128 mapvalx4, r_linx4, g_linx4, b_linx4;
++    __m128 offset = _mm_set1_ps(0.5f); // added before the truncating float->int conversion near the end
++    __m128i input_lut_offset = _mm_set1_epi32(2048); // bias added to the inputs before they are used as LUT indices
++    __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); // scale applied to the mapped linear values before the delin_lut lookup
++    __m128i r, g, b, rx4, gx4, bx4;
 +
-+            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
-+            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
-+            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
++    float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; // scratch arrays filled by scalar LUT reads, then loaded back into vectors
 +
-+            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
-+            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
-+            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
++    sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); // per-pixel max(r, g, b) selects the tonemap_lut entry
++    sig4 = _mm_add_epi32(sig4, input_lut_offset);
++    sig4 = av_clip_uint16_sse(sig4); // presumably clamps each lane to the uint16 range - helper not shown here
 +
-+            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
-+            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
++    r = _mm_add_epi32(r_in, input_lut_offset); // same bias + clip for each channel to form lin_lut indices
++    r = av_clip_uint16_sse(r);
++    g = _mm_add_epi32(g_in, input_lut_offset);
++    g = av_clip_uint16_sse(g);
++    b = _mm_add_epi32(b_in, input_lut_offset);
++    b = av_clip_uint16_sse(b);
 +
-+            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
-+            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
++    // The lane argument of _mm_extract_epi32 must be a compile-time constant, so the four lookups are unrolled with a macro instead of a loop
++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \
++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)];                             \
++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)];                             \
++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)];
 +
-+            y0ox16 = _mm256_packs_epi32(yoax8, yobx8);
-+            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
-+            _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++    LOAD_LUT(0)
++    LOAD_LUT(1)
++    LOAD_LUT(2)
++    LOAD_LUT(3)
 +
-+            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
-+            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
-+            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
++#undef LOAD_LUT
 +
-+            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
-+            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
-+            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
++    mapvalx4 = _mm_loadu_ps(mapval4); // bring the gathered scalars back into SIMD registers
++    r_linx4 = _mm_loadu_ps(r_lin4);
++    g_linx4 = _mm_loadu_ps(g_lin4);
++    b_linx4 = _mm_loadu_ps(b_lin4);
 +
-+            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
-+            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
-+            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
++    if (!rgb2rgb_passthrough) { // apply the rgb2rgb matrix, updating r, g, b in place in that order
++        r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0]));
++        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1])));
++        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2])));
 +
-+            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
-+            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
++        g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1]));
++        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); // NOTE(review): r_linx4 was already overwritten above, so this row reads the transformed r - confirm this matches the scalar path
++        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2])));
 +
-+            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
-+            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
++        b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2]));
++        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); // NOTE(review): likewise reads the transformed r and, on the next line, the transformed g
++        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1])));
++    }
 +
-+            y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8);
-+            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
-+            _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++    if (desat > 0) { // desaturation: blend each channel toward luma by the overbright factor
++        __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS);
++        __m128 desat4 = _mm_set1_ps((float)desat);
++        __m128 luma4 = _mm_set1_ps(0);
++        __m128 overbright4;
 +
-+            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
-+            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8));
-+            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2));
-+            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
++        luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); // luma = cr*r + cg*g + cb*b
++        luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg))));
++        luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb))));
++        overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); // max(luma - desat, eps) / max(luma, eps); eps keeps the divisor nonzero
++        r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); // c = c - c*overbright + luma*overbright for each channel
++        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4));
++        g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4));
++        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4));
++        b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4));
++        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4));
++    }
 +
-+            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
-+            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
-+            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
-+            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
++    r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); // scale every channel by the per-pixel tonemap_lut value
++    g_linx4 = _mm_mul_ps(g_linx4, mapvalx4);
++    b_linx4 = _mm_mul_ps(b_linx4, mapvalx4);
 +
-+            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
-+            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
-+            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
-+            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++    r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); // scale by 32767 and add 0.5 ahead of the truncating conversion
++    r_linx4 = _mm_add_ps(r_linx4, offset);
 +
-+            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
-+            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
-+            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
-+            uox8 = _mm256_srai_epi32(uox8, out_sh);
-+            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++    g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound);
++    g_linx4 = _mm_add_ps(g_linx4, offset);
++
++    b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound);
++    b_linx4 = _mm_add_ps(b_linx4, offset);
 +
-+            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
-+            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
-+            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
-+            vox8 = _mm256_srai_epi32(vox8, out_sh);
-+            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++    rx4 = _mm_cvttps_epi32(r_linx4); // truncating float->int32 conversion, then clip before indexing delin_lut
++    rx4 = av_clip_uint16_sse(rx4);
++    gx4 = _mm_cvttps_epi32(g_linx4);
++    gx4 = av_clip_uint16_sse(gx4);
++    bx4 = _mm_cvttps_epi32(b_linx4);
++    bx4 = av_clip_uint16_sse(bx4);
 +
-+            uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
-+            uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
-+            uvox16 = _mm256_packs_epi32(uvoax8, uvobx8);
-+            _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
-+        }
-+    }
++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \
++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)];                       \
++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)];
 +
-+    // Process remaining pixels cannot fill the full simd register with scalar version
-+    if (remainw) {
-+        int offset = width & (int)0xfffffff0;
-+        rdsty += offset;
-+        rdstuv += offset;
-+        rsrcy += offset;
-+        rsrcuv += offset;
-+        tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv,
-+                                       rsrcy, rsrcuv,
-+                                       dstlinesize, srclinesize,
-+                                       dstdepth, srcdepth,
-+                                       remainw, rheight, params);
-+    }
-+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++    SAVE_COLOR(0) // unrolled for the same constant-lane reason as LOAD_LUT; uint16_t table values are stored into int16_t outputs
++    SAVE_COLOR(1)
++    SAVE_COLOR(2)
++    SAVE_COLOR(3)
++
++#undef SAVE_COLOR
 +}
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
 +
-+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv,
-+                                                       const uint16_t *srcy, const uint16_t *srcuv,
-+                                                       const int *dstlinesize, const int *srclinesize,
-+                                                       int dstdepth, int srcdepth,
-+                                                       int width, int height,
-+                                                       const struct TonemapIntParams *params)
++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++                                             const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++                                             const int *dstlinesize, const int *srclinesize,
++                                             int dstdepth, int srcdepth,
++                                             int width, int height,
++                                             const struct TonemapIntParams *params)
 +{
-+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS
-+    uint16_t *rdsty = dsty;
-+    uint16_t *rdstuv = dstuv;
++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
++    uint8_t *rdsty = dsty;
++    uint8_t *rdstu = dstu;
++    uint8_t *rdstv = dstv;
++
 +    const uint16_t *rsrcy = srcy;
-+    const uint16_t *rsrcuv = srcuv;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++
 +    int rheight = height;
 +    // not zero when not divisible by 8
 +    // intentionally leave last pixel emtpy when input is odd
-+    int remainw = width & 14;
++    int remainw = width & 6;
 +
 +    const int in_depth = srcdepth;
-+    const int in_uv_offset = 128 << (in_depth - 8);
-+    const int in_sh = in_depth - 1;
-+    const int in_rnd = 1 << (in_sh - 1);
++    const float in_rng = (float)((1 << in_depth) - 1);
 +
 +    const int out_depth = dstdepth;
 +    const int out_uv_offset = 128 << (out_depth - 8);
 +    const int out_sh = 29 - out_depth;
 +    const int out_rnd = 1 << (out_sh - 1);
-+    const int out_sh2 = 16 - out_depth;
-+
-+    int cy  = (*params->yuv2rgb_coeffs)[0][0][0];
-+    int crv = (*params->yuv2rgb_coeffs)[0][2][0];
-+    int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
-+    int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
-+    int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
 +
 +    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
 +    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
@@ -3968,504 +7121,746 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c
 +    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
 +    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
 +
-+    int16_t r[16], g[16], b[16];
-+    int16_t r1[16], g1[16], b1[16];
-+    __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off);
-+    __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset);
-+    __m256i cyx8 = _mm256_set1_epi32(cy);
-+    __m256i rndx8 = _mm256_set1_epi32(in_rnd);
-+
-+    __m256i r0ox16, g0ox16, b0ox16;
-+    __m256i y0ox16;
-+    __m256i roax8, robx8, goax8, gobx8, boax8, bobx8;
-+    __m256i yoax8, yobx8;
-+    __m256i uvx16, uvx8a, uvx8b;
-+    __m256i y0x16, y1x16;
-+    __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b;
-+    __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b;
-+    __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b;
-+
-+    __m256i r1ox16, g1ox16, b1ox16;
-+    __m256i y1ox16;
-+    __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8;
-+    __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16;
-+    __m256i uox8, vox8, ravgx8, gavgx8, bavgx8;
-+    for (; height > 1; height -= 2,
-+                       dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2,
-+                       srcy += srclinesize[0], srcuv += srclinesize[1] / 2) {
-+        for (int xx = 0; xx < width >> 4; xx++) {
-+            int x = xx << 4;
-+
-+            y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x));
-+            y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x)));
-+            uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x));
-+
-+            if (in_depth == 10) {
-+                // shift to low10bits for 10bit input
-+                y0x16 = _mm256_srli_epi16(y0x16, 6);
-+                y1x16 = _mm256_srli_epi16(y1x16, 6);
-+                uvx16 = _mm256_srli_epi16(uvx16, 6);
-+            }
-+
-+            y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0));
-+            y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1));
-+            y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0));
-+            y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1));
-+            uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0));
-+            uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1));
-+            y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8);
-+            y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8);
-+            y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8);
-+            y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8);
-+            uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8);
-+            uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8);
-+
-+            ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0));
-+            ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0));
-+            vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1));
-+            vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1));
-+
-+            // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
-+            r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8);
-+            r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
-+            r0x8a = _mm256_add_epi32(r0x8a, rndx8);
-+            r0x8a = _mm256_srai_epi32(r0x8a, in_sh);
-+            r0x8a = av_clip_int16_avx(r0x8a);
-+
-+            r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8);
-+            r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv)));
-+            r1x8a = _mm256_add_epi32(r1x8a, rndx8);
-+            r1x8a = _mm256_srai_epi32(r1x8a, in_sh);
-+            r1x8a = av_clip_int16_avx(r1x8a);
-+
-+            // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh);
-+            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
-+            g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
-+            g0x8a = _mm256_add_epi32(g0x8a, rndx8);
-+            g0x8a = _mm256_srai_epi32(g0x8a, in_sh);
-+            g0x8a = av_clip_int16_avx(g0x8a);
-+
-+            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu)));
-+            g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv)));
-+            g1x8a = _mm256_add_epi32(g1x8a, rndx8);
-+            g1x8a = _mm256_srai_epi32(g1x8a, in_sh);
-+            g1x8a = av_clip_int16_avx(g1x8a);
++    int16_t r[8], g[8], b[8];
++    int16_t r1[8], g1[8], b1[8];
 +
-+            // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh);
-+            b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
-+            b0x8a = _mm256_add_epi32(b0x8a, rndx8);
-+            b0x8a = _mm256_srai_epi32(b0x8a, in_sh);
-+            b0x8a = av_clip_int16_avx(b0x8a);
++    __m128i zero128 = _mm_setzero_si128();
++    __m128i ux4, vx4;
++    __m128i y0x8, y1x8;
++    __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b;
++    __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b;
++    __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b;
 +
-+            b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu)));
-+            b1x8a = _mm256_add_epi32(b1x8a, rndx8);
-+            b1x8a = _mm256_srai_epi32(b1x8a, in_sh);
-+            b1x8a = av_clip_int16_avx(b1x8a);
++    __m128i r0ox8, g0ox8, b0ox8;
++    __m128i y0ox8;
++    __m128i roax4, robx4, goax4, gobx4, boax4, bobx4;
++    __m128i yoax4, yobx4;
 +
-+            r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8);
-+            r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
-+            r0x8b = _mm256_add_epi32(r0x8b, rndx8);
-+            r0x8b = _mm256_srai_epi32(r0x8b, in_sh);
-+            r0x8b = av_clip_int16_avx(r0x8b);
++    __m128i r1ox8, g1ox8, b1ox8;
++    __m128i y1ox8;
++    __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
++    __m128i y1oax4, y1obx4;
++    __m128i uox4, vox4, ravgx4, gavgx4, bavgx4;
 +
-+            r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8);
-+            r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv)));
-+            r1x8b = _mm256_add_epi32(r1x8b, rndx8);
-+            r1x8b = _mm256_srai_epi32(r1x8b, in_sh);
-+            r1x8b = av_clip_int16_avx(r1x8b);
++    __m128 ipt0, ipt1, ipt2, ipt3;
++    __m128 ia1, ib1, ia2, ib2;
++    __m128 ix4, px4, tx4;
++    __m128 lx4, mx4, sx4;
++    __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b;
++    __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf;
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2],
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) {
++        for (int xx = 0; xx < width >> 3; xx++) {
++            int x = xx << 3;
 +
-+            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
-+            g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
-+            g0x8b = _mm256_add_epi32(g0x8b, rndx8);
-+            g0x8b = _mm256_srai_epi32(g0x8b, in_sh);
-+            g0x8b = av_clip_int16_avx(g0x8b);
++            y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x));
++            y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x)));
++            ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1)));
++            vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1)));
 +
-+            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu)));
-+            g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv)));
-+            g1x8b = _mm256_add_epi32(g1x8b, rndx8);
-+            g1x8b = _mm256_srai_epi32(g1x8b, in_sh);
-+            g1x8b = av_clip_int16_avx(g1x8b);
++            y0x4a = _mm_cvtepu16_epi32(y0x8);
++            y0x4b = _mm_unpackhi_epi16(y0x8, zero128);
++            y1x4a = _mm_cvtepu16_epi32(y1x8);
++            y1x4b = _mm_unpackhi_epi16(y1x8, zero128);
++            ux4 = _mm_cvtepu16_epi32(ux4);
++            vx4 = _mm_cvtepu16_epi32(vx4);
 +
-+            b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
-+            b0x8b = _mm256_add_epi32(b0x8b, rndx8);
-+            b0x8b = _mm256_srai_epi32(b0x8b, in_sh);
-+            b0x8b = av_clip_int16_avx(b0x8b);
++            ux4a = _mm_unpacklo_epi32(ux4, ux4);
++            ux4b = _mm_unpackhi_epi32(ux4, ux4);
++            vx4a = _mm_unpacklo_epi32(vx4, vx4);
++            vx4b = _mm_unpackhi_epi32(vx4, vx4);
 +
-+            b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu)));
-+            b1x8b = _mm256_add_epi32(b1x8b, rndx8);
-+            b1x8b = _mm256_srai_epi32(b1x8b, in_sh);
-+            b1x8b = av_clip_int16_avx(b1x8b);
++            y0x4af = _mm_cvtepi32_ps(y0x4a);
++            y0x4bf = _mm_cvtepi32_ps(y0x4b);
++            y1x4af = _mm_cvtepi32_ps(y1x4a);
++            y1x4bf = _mm_cvtepi32_ps(y1x4b);
++            ux4af = _mm_cvtepi32_ps(ux4a);
++            ux4bf = _mm_cvtepi32_ps(ux4b);
++            vx4af = _mm_cvtepi32_ps(vx4a);
++            vx4bf = _mm_cvtepi32_ps(vx4b);
++
++            y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng));
++            y0x4bf = _mm_div_ps(y0x4bf, _mm_set1_ps(in_rng));
++            y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng));
++            y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng));
++            ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng));
++            ux4bf = _mm_div_ps(ux4bf, _mm_set1_ps(in_rng));
++            vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng));
++            vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng));
++
++            // Reshape y0x4a
++            ia1 = _mm_unpacklo_ps(y0x4af, ux4af);
++            ia2 = _mm_unpackhi_ps(y0x4af, ux4af);
++            ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f));
++            gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f));
++            bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f));
++
++            r0x4a = _mm_cvtps_epi32(rx4a);
++            g0x4a = _mm_cvtps_epi32(gx4a);
++            b0x4a = _mm_cvtps_epi32(bx4a);
++
++            // Reshape y1x4a
++            ia1 = _mm_unpacklo_ps(y1x4af, ux4af);
++            ia2 = _mm_unpackhi_ps(y1x4af, ux4af);
++            ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f));
++            gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f));
++            bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f));
++
++            r1x4a = _mm_cvtps_epi32(rx4a);
++            g1x4a = _mm_cvtps_epi32(gx4a);
++            b1x4a = _mm_cvtps_epi32(bx4a);
++
++            // Reshape y0x4b
++            ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf);
++            ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf);
++            ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f));
++            gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f));
++            bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f));
++
++            r0x4b = _mm_cvtps_epi32(rx4b);
++            g0x4b = _mm_cvtps_epi32(gx4b);
++            b0x4b = _mm_cvtps_epi32(bx4b);
++
++            // Reshape y1x4b
++            ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf);
++            ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf);
++            ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f));
++            gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f));
++            bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f));
++
++            r1x4b = _mm_cvtps_epi32(rx4b);
++            g1x4b = _mm_cvtps_epi32(gx4b);
++            b1x4b = _mm_cvtps_epi32(bx4b);
 +
-+            tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b,
++            tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b,
 +                                params->lin_lut, params->tonemap_lut, params->delin_lut,
 +                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
 +                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1,
++            tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1,
 +                                params->lin_lut, params->tonemap_lut, params->delin_lut,
 +                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
 +                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8],
++            tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4],
 +                                params->lin_lut, params->tonemap_lut, params->delin_lut,
 +                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
 +                                params->rgb2rgb_passthrough);
-+            tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8],
++            tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4],
 +                                params->lin_lut, params->tonemap_lut, params->delin_lut,
 +                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
 +                                params->rgb2rgb_passthrough);
 +
-+            r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r);
-+            g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g);
-+            b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b);
-+
-+            roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0));
-+            goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0));
-+            boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0));
-+
-+            robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1));
-+            gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1));
-+            bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1));
++            r0ox8 = _mm_lddqu_si128((const __m128i_u *)r);
++            g0ox8 = _mm_lddqu_si128((const __m128i_u *)g);
++            b0ox8 = _mm_lddqu_si128((const __m128i_u *)b);
 +
-+            yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy)));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby)));
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd));
-+            yoax8 = _mm256_srai_epi32(yoax8, out_sh);
-+            yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off));
++            roax4 = _mm_cvtepi16_epi32(r0ox8);
++            goax4 = _mm_cvtepi16_epi32(g0ox8);
++            boax4 = _mm_cvtepi16_epi32(b0ox8);
 +
-+            yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy)));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby)));
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd));
-+            yobx8 = _mm256_srai_epi32(yobx8, out_sh);
-+            yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off));
++            robx4 = _mm_unpackhi_epi16(r0ox8, zero128);
++            gobx4 = _mm_unpackhi_epi16(g0ox8, zero128);
++            bobx4 = _mm_unpackhi_epi16(b0ox8, zero128);
 +
-+            y0ox16 = _mm256_packus_epi32(yoax8, yobx8);
-+            y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0));
-+            y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2);
-+            _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16);
++            yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry));
++            yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy)));
++            yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby)));
++            yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd));
++            // output shift bits for 8bit outputs is 29 - 8 = 21
++            yoax4 = _mm_srai_epi32(yoax4, 21);
++            yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off));
 +
-+            r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1);
-+            g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1);
-+            b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1);
++            yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry));
++            yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy)));
++            yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby)));
++            yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd));
++            yobx4 = _mm_srai_epi32(yobx4, 21);
++            yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off));
 +
-+            r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0));
-+            g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0));
-+            b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0));
++            y0ox8 = _mm_packs_epi32(yoax4, yobx4);
++            _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128));
 +
-+            r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1));
-+            g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1));
-+            b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1));
++            r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1);
++            g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1);
++            b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1);
 +
-+            y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy)));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby)));
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd));
-+            y1oax8 = _mm256_srai_epi32(y1oax8, out_sh);
-+            y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off));
++            r1oax4 = _mm_cvtepi16_epi32(r1ox8);
++            g1oax4 = _mm_cvtepi16_epi32(g1ox8);
++            b1oax4 = _mm_cvtepi16_epi32(b1ox8);
 +
-+            y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy)));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby)));
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd));
-+            y1obx8 = _mm256_srai_epi32(y1obx8, out_sh);
-+            y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off));
++            r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128);
++            g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128);
++            b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128);
 +
-+            y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8);
-+            y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0));
-+            y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2);
-+            _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16);
++            y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy)));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby)));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd));
++            y1oax4 = _mm_srai_epi32(y1oax4, 21);
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off));
 +
-+            ravgx8 = _mm256_hadd_epi32(roax8, robx8);
-+            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8));
-+            ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2));
-+            ravgx8 = _mm256_srai_epi32(ravgx8, 2);
++            y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy)));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby)));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd));
++            y1obx4 = _mm_srai_epi32(y1obx4, 21);
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off));
 +
-+            gavgx8 = _mm256_hadd_epi32(goax8, gobx8);
-+            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8));
-+            gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2));
-+            gavgx8 = _mm256_srai_epi32(gavgx8, 2);
++            y1ox8 = _mm_packs_epi32(y1oax4, y1obx4);
++            _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128));
 +
-+            bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
-+            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
-+            bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
-+            bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
-+            bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++            ravgx4 = _mm_hadd_epi32(roax4, robx4);
++            ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4));
++            ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2));
++            ravgx4 = _mm_srai_epi32(ravgx4, 2);
 +
-+            uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
-+            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
-+            uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
-+            uox8 = _mm256_srai_epi32(uox8, out_sh);
-+            uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++            gavgx4 = _mm_hadd_epi32(goax4, gobx4);
++            gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4));
++            gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2));
++            gavgx4 = _mm_srai_epi32(gavgx4, 2);
 +
-+            vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
-+            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
-+            vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
-+            vox8 = _mm256_srai_epi32(vox8, out_sh);
-+            vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++            bavgx4 = _mm_hadd_epi32(boax4, bobx4);
++            bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4));
++            bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2));
++            bavgx4 = _mm_srai_epi32(bavgx4, 2);
 +
-+            uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
-+            uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
-+            uvox16 = _mm256_packus_epi32(uvoax8, uvobx8);
-+            uvox16 = _mm256_slli_epi16(uvox16, out_sh2);
-+            _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16);
++            uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru)));
++            uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu)));
++            uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv)));
++            uox4 = _mm_srai_epi32(uox4, 21);
++            uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset));
++            _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128));
++
++            vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv)));
++            vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv)));
++            vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv)));
++            vox4 = _mm_srai_epi32(vox4, 21);
++            vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset));
++            _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128));
 +        }
 +    }
 +
 +    // Process remaining pixels cannot fill the full simd register with scalar version
 +    if (remainw) {
-+        int offset = width & (int)0xfffffff0;
++        int offset = width & (int)0xfffffff8;
 +        rdsty += offset;
-+        rdstuv += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
 +        rsrcy += offset;
-+        rsrcuv += offset;
-+        tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv,
-+                                            rsrcy, rsrcuv,
-+                                            dstlinesize, srclinesize,
-+                                            dstdepth, srcdepth,
-+                                            remainw, rheight, params);
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_dovi_2_420p(rdsty, rdstu, rdstv,
++                                    rsrcy, rsrcu, rsrcv,
++                                    dstlinesize, srclinesize,
++                                    dstdepth, srcdepth,
++                                    remainw, rheight, params);
 +    }
-+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS
++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
 +}
-Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h
-@@ -0,0 +1,54 @@
-+/*
-+ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
 +
-+#ifndef AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
-+#define AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
-+
-+#include "libavfilter/vf_tonemapx.h"
-+
-+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
 +                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
 +                                               const int *dstlinesize, const int *srclinesize,
 +                                               int dstdepth, int srcdepth,
 +                                               int width, int height,
-+                                               const struct TonemapIntParams *params);
-+
-+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv,
-+                                                 const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
-+                                                 const int *dstlinesize, const int *srclinesize,
-+                                                 int dstdepth, int srcdepth,
-+                                                 int width, int height,
-+                                                 const struct TonemapIntParams *params);
-+
-+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv,
-+                                                  const uint16_t *srcy, const uint16_t *srcuv,
-+                                                  const int *dstlinesize, const int *srclinesize,
-+                                                  int dstdepth, int srcdepth,
-+                                                  int width, int height,
-+                                                  const struct TonemapIntParams *params);
-+
-+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv,
-+                                                       const uint16_t *srcy, const uint16_t *srcuv,
-+                                                       const int *dstlinesize, const int *srclinesize,
-+                                                       int dstdepth, int srcdepth,
-+                                                       int width, int height,
-+                                                       const struct TonemapIntParams *params);
-+
-+#endif // AVFILTER_X86_TONEMAPX_INTRIN_AVX_H
-Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
-===================================================================
---- /dev/null
-+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c
-@@ -0,0 +1,1359 @@
-+/*
-+ * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "vf_tonemapx_intrin_sse.h"
-+
++                                               const struct TonemapIntParams *params)
++{
 +#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
-+#    include <immintrin.h>
-+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++    uint16_t *rdsty = dsty;
++    uint16_t *rdstu = dstu;
++    uint16_t *rdstv = dstv;
++    const uint16_t *rsrcy = srcy;
++    const uint16_t *rsrcu = srcu;
++    const uint16_t *rsrcv = srcv;
++    int rheight = height;
++    // not zero when not divisible by 8
++    // intentionally leave last pixel empty when input is odd
++    int remainw = width & 6;
 +
-+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS
-+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction
-+// cast the register into float register and store with movss as a workaround
-+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10)
-+__attribute__((always_inline))
-+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) {
-+    _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a));
-+    return;
-+}
-+#endif
++    const int in_depth = srcdepth;
++    const float in_rng = (float)((1 << in_depth) - 1);
 +
-+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a)
-+{
-+__m128i mask = _mm_set1_epi32(0x7FFF);
-+__m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF));
++    const int out_depth = dstdepth;
++    const int out_uv_offset = 128 << (out_depth - 8);
++    const int out_sh = 29 - out_depth;
++    const int out_rnd = 1 << (out_sh - 1);
 +
-+__m128i zero = _mm_setzero_si128();
-+__m128i cmp = _mm_cmpeq_epi32(condition, zero);
++    int cry   = (*params->rgb2yuv_coeffs)[0][0][0];
++    int cgy   = (*params->rgb2yuv_coeffs)[0][1][0];
++    int cby   = (*params->rgb2yuv_coeffs)[0][2][0];
++    int cru   = (*params->rgb2yuv_coeffs)[1][0][0];
++    int ocgu  = (*params->rgb2yuv_coeffs)[1][1][0];
++    int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++    int ocgv  = (*params->rgb2yuv_coeffs)[2][1][0];
++    int cbv   = (*params->rgb2yuv_coeffs)[2][2][0];
 +
-+__m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask);
-+__m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a));
++    int16_t r[8], g[8], b[8];
++    int16_t r1[8], g1[8], b1[8];
 +
-+return result;
-+}
++    __m128i zero128 = _mm_setzero_si128();
++    __m128i ux4, vx4;
++    __m128i y0x8, y1x8;
++    __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b;
++    __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b;
++    __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b;
 +
-+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a)
-+{
-+__m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U));
-+__m128i mask = _mm_set1_epi32(~0xFFFF);
-+__m128i condition = _mm_and_si128(add_result, mask);
-+__m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128());
++    __m128i r0ox8, g0ox8, b0ox8;
++    __m128i y0ox8;
++    __m128i roax4, robx4, goax4, gobx4, boax4, bobx4;
++    __m128i yoax4, yobx4;
 +
-+__m128i shifted = _mm_srai_epi32(a, 31);
-+__m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF));
++    __m128i r1ox8, g1ox8, b1ox8;
++    __m128i y1ox8;
++    __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
++    __m128i y1oax4, y1obx4;
++    __m128i uox4, vox4, ravgx4, gavgx4, bavgx4;
 +
-+return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result));
-+}
++    __m128 ipt0, ipt1, ipt2, ipt3;
++    __m128 ia1, ib1, ia2, ib2;
++    __m128 ix4, px4, tx4;
++    __m128 lx4, mx4, sx4;
++    __m128 rx4a, gx4a, bx4a, rx4b, gx4b, bx4b;
++    __m128 y0x4af, y0x4bf, y1x4af, y1x4bf, ux4af, ux4bf, vx4af, vx4bf;
++    for (; height > 1; height -= 2,
++                       dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2,
++                       srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) {
++        for (int xx = 0; xx < width >> 3; xx++) {
++            int x = xx << 3;
 +
-+X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in,
-+                                                 int16_t *r_out, int16_t *g_out, int16_t *b_out,
-+                                                 float *lin_lut, float *tonemap_lut, uint16_t *delin_lut,
-+                                                 const AVLumaCoefficients *coeffs,
-+                                                 const AVLumaCoefficients *ocoeffs, double desat,
-+                                                 double (*rgb2rgb)[3][3],
-+                                                 int rgb2rgb_passthrough)
-+{
-+    __m128i sig4;
-+    __m128 mapvalx4, r_linx4, g_linx4, b_linx4;
-+    __m128 offset = _mm_set1_ps(0.5f);
-+    __m128i input_lut_offset = _mm_set1_epi32(2048);
-+    __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f);
-+    __m128i r, g, b, rx4, gx4, bx4;
++            y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x));
++            y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x)));
++            ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1)));
++            vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1)));
 +
-+    float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4];
++            y0x4a = _mm_cvtepu16_epi32(y0x8);
++            y0x4b = _mm_unpackhi_epi16(y0x8, zero128);
++            y1x4a = _mm_cvtepu16_epi32(y1x8);
++            y1x4b = _mm_unpackhi_epi16(y1x8, zero128);
++            ux4 = _mm_cvtepu16_epi32(ux4);
++            vx4 = _mm_cvtepu16_epi32(vx4);
 +
-+    sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in));
-+    sig4 = _mm_add_epi32(sig4, input_lut_offset);
-+    sig4 = av_clip_uint16_sse(sig4);
++            ux4a = _mm_unpacklo_epi32(ux4, ux4);
++            ux4b = _mm_unpackhi_epi32(ux4, ux4);
++            vx4a = _mm_unpacklo_epi32(vx4, vx4);
++            vx4b = _mm_unpackhi_epi32(vx4, vx4);
 +
-+    r = _mm_add_epi32(r_in, input_lut_offset);
-+    r = av_clip_uint16_sse(r);
-+    g = _mm_add_epi32(g_in, input_lut_offset);
-+    g = av_clip_uint16_sse(g);
-+    b = _mm_add_epi32(b_in, input_lut_offset);
-+    b = av_clip_uint16_sse(b);
++            y0x4af = _mm_cvtepi32_ps(y0x4a);
++            y0x4bf = _mm_cvtepi32_ps(y0x4b);
++            y1x4af = _mm_cvtepi32_ps(y1x4a);
++            y1x4bf = _mm_cvtepi32_ps(y1x4b);
++            ux4af = _mm_cvtepi32_ps(ux4a);
++            ux4bf = _mm_cvtepi32_ps(ux4b);
++            vx4af = _mm_cvtepi32_ps(vx4a);
++            vx4bf = _mm_cvtepi32_ps(vx4b);
++
++            y0x4af = _mm_div_ps(y0x4af, _mm_set1_ps(in_rng));
++            y0x4bf = _mm_div_ps(y0x4bf, _mm_set1_ps(in_rng));
++            y1x4af = _mm_div_ps(y1x4af, _mm_set1_ps(in_rng));
++            y1x4bf = _mm_div_ps(y1x4bf, _mm_set1_ps(in_rng));
++            ux4af = _mm_div_ps(ux4af, _mm_set1_ps(in_rng));
++            ux4bf = _mm_div_ps(ux4bf, _mm_set1_ps(in_rng));
++            vx4af = _mm_div_ps(vx4af, _mm_set1_ps(in_rng));
++            vx4bf = _mm_div_ps(vx4bf, _mm_set1_ps(in_rng));
++
++            // Reshape y0x4a
++            ia1 = _mm_unpacklo_ps(y0x4af, ux4af);
++            ia2 = _mm_unpackhi_ps(y0x4af, ux4af);
++            ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f));
++            gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f));
++            bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f));
++
++            r0x4a = _mm_cvtps_epi32(rx4a);
++            g0x4a = _mm_cvtps_epi32(gx4a);
++            b0x4a = _mm_cvtps_epi32(bx4a);
++
++            // Reshape y1x4a
++            ia1 = _mm_unpacklo_ps(y1x4af, ux4af);
++            ia2 = _mm_unpackhi_ps(y1x4af, ux4af);
++            ib1 = _mm_unpacklo_ps(vx4af, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4af, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4a, &gx4a, &bx4a, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4a = _mm_mul_ps(rx4a, _mm_set1_ps(28672.0f));
++            gx4a = _mm_mul_ps(gx4a, _mm_set1_ps(28672.0f));
++            bx4a = _mm_mul_ps(bx4a, _mm_set1_ps(28672.0f));
++
++            r1x4a = _mm_cvtps_epi32(rx4a);
++            g1x4a = _mm_cvtps_epi32(gx4a);
++            b1x4a = _mm_cvtps_epi32(bx4a);
++
++            // Reshape y0x4b
++            ia1 = _mm_unpacklo_ps(y0x4bf, ux4bf);
++            ia2 = _mm_unpackhi_ps(y0x4bf, ux4bf);
++            ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f));
++            gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f));
++            bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f));
++
++            r0x4b = _mm_cvtps_epi32(rx4b);
++            g0x4b = _mm_cvtps_epi32(gx4b);
++            b0x4b = _mm_cvtps_epi32(bx4b);
++
++            // Reshape y1x4b
++            ia1 = _mm_unpacklo_ps(y1x4bf, ux4bf);
++            ia2 = _mm_unpackhi_ps(y1x4bf, ux4bf);
++            ib1 = _mm_unpacklo_ps(vx4bf, _mm_setzero_ps());
++            ib2 = _mm_unpackhi_ps(vx4bf, _mm_setzero_ps());
++            ipt0 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt1 = _mm_shuffle_ps(ia1, ib1, _MM_SHUFFLE(3, 2, 3, 2));
++            ipt2 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            ipt3 = _mm_shuffle_ps(ia2, ib2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ipt0 = reshape_dovi_iptpqc2(ipt0, params);
++            ipt1 = reshape_dovi_iptpqc2(ipt1, params);
++            ipt2 = reshape_dovi_iptpqc2(ipt2, params);
++            ipt3 = reshape_dovi_iptpqc2(ipt3, params);
++
++            ipt0 = _mm_shuffle_ps(ipt0, ipt0, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt1 = _mm_shuffle_ps(ipt1, ipt1, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt2 = _mm_shuffle_ps(ipt2, ipt2, _MM_SHUFFLE(3, 1, 2, 0));
++            ipt3 = _mm_shuffle_ps(ipt3, ipt3, _MM_SHUFFLE(3, 1, 2, 0));
++
++            ia1 = _mm_unpacklo_ps(ipt0, ipt1);
++            ia2 = _mm_unpacklo_ps(ipt2, ipt3);
++            ib1 = _mm_unpackhi_ps(ipt0, ipt1);
++            ib2 = _mm_unpackhi_ps(ipt2, ipt3);
++
++            ix4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(1, 0, 1, 0));
++            px4 = _mm_shuffle_ps(ib1, ib2, _MM_SHUFFLE(1, 0, 1, 0));
++            tx4 = _mm_shuffle_ps(ia1, ia2, _MM_SHUFFLE(3, 2, 3, 2));
++
++            ycc2rgbx4(&lx4, &mx4, &sx4, ix4, px4, tx4, params->dovi->nonlinear, *params->ycc_offset);
++            lms2rgbx4(&rx4b, &gx4b, &bx4b, lx4, mx4, sx4, *params->lms2rgb_matrix);
++
++            rx4b = _mm_mul_ps(rx4b, _mm_set1_ps(28672.0f));
++            gx4b = _mm_mul_ps(gx4b, _mm_set1_ps(28672.0f));
++            bx4b = _mm_mul_ps(bx4b, _mm_set1_ps(28672.0f));
++
++            r1x4b = _mm_cvtps_epi32(rx4b);
++            g1x4b = _mm_cvtps_epi32(gx4b);
++            b1x4b = _mm_cvtps_epi32(bx4b);
 +
-+    // Cannot use loop here as the lane has to be compile-time constant
-+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \
-+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)];                             \
-+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)];                             \
-+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)];
++            tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1,
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
++            tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4],
++                                params->lin_lut, params->tonemap_lut, params->delin_lut,
++                                params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs,
++                                params->rgb2rgb_passthrough);
 +
-+    LOAD_LUT(0)
-+    LOAD_LUT(1)
-+    LOAD_LUT(2)
-+    LOAD_LUT(3)
++            r0ox8 = _mm_lddqu_si128((const __m128i_u *)r);
++            g0ox8 = _mm_lddqu_si128((const __m128i_u *)g);
++            b0ox8 = _mm_lddqu_si128((const __m128i_u *)b);
 +
-+#undef LOAD_LUT
++            roax4 = _mm_cvtepi16_epi32(r0ox8);
++            goax4 = _mm_cvtepi16_epi32(g0ox8);
++            boax4 = _mm_cvtepi16_epi32(b0ox8);
 +
-+    mapvalx4 = _mm_loadu_ps(mapval4);
-+    r_linx4 = _mm_loadu_ps(r_lin4);
-+    g_linx4 = _mm_loadu_ps(g_lin4);
-+    b_linx4 = _mm_loadu_ps(b_lin4);
++            robx4 = _mm_unpackhi_epi16(r0ox8, zero128);
++            gobx4 = _mm_unpackhi_epi16(g0ox8, zero128);
++            bobx4 = _mm_unpackhi_epi16(b0ox8, zero128);
 +
-+    if (!rgb2rgb_passthrough) {
-+        r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0]));
-+        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1])));
-+        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2])));
++            yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry));
++            yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy)));
++            yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby)));
++            yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd));
++            yoax4 = _mm_srai_epi32(yoax4, out_sh);
++            yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off));
 +
-+        g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1]));
-+        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0])));
-+        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2])));
++            yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry));
++            yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy)));
++            yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby)));
++            yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd));
++            yobx4 = _mm_srai_epi32(yobx4, out_sh);
++            yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off));
 +
-+        b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2]));
-+        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0])));
-+        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1])));
-+    }
++            y0ox8 = _mm_packus_epi32(yoax4, yobx4);
++            _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8);
 +
-+    if (desat > 0) {
-+        __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS);
-+        __m128 desat4 = _mm_set1_ps((float)desat);
-+        __m128 luma4 = _mm_set1_ps(0);
-+        __m128 overbright4;
++            r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1);
++            g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1);
++            b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1);
 +
-+        luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr))));
-+        luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg))));
-+        luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb))));
-+        overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4));
-+        r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4));
-+        r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4));
-+        g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4));
-+        g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4));
-+        b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4));
-+        b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4));
-+    }
++            r1oax4 = _mm_cvtepi16_epi32(r1ox8);
++            g1oax4 = _mm_cvtepi16_epi32(g1ox8);
++            b1oax4 = _mm_cvtepi16_epi32(b1ox8);
 +
-+    r_linx4 = _mm_mul_ps(r_linx4, mapvalx4);
-+    g_linx4 = _mm_mul_ps(g_linx4, mapvalx4);
-+    b_linx4 = _mm_mul_ps(b_linx4, mapvalx4);
++            r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128);
++            g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128);
++            b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128);
 +
-+    r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound);
-+    r_linx4 = _mm_add_ps(r_linx4, offset);
++            y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy)));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby)));
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd));
++            y1oax4 = _mm_srai_epi32(y1oax4, out_sh);
++            y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off));
 +
-+    g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound);
-+    g_linx4 = _mm_add_ps(g_linx4, offset);
++            y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy)));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby)));
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd));
++            y1obx4 = _mm_srai_epi32(y1obx4, out_sh);
++            y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off));
 +
-+    b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound);
-+    b_linx4 = _mm_add_ps(b_linx4, offset);
++            y1ox8 = _mm_packus_epi32(y1oax4, y1obx4);
++            _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8);
 +
-+    rx4 = _mm_cvttps_epi32(r_linx4);
-+    rx4 = av_clip_uint16_sse(rx4);
-+    gx4 = _mm_cvttps_epi32(g_linx4);
-+    gx4 = av_clip_uint16_sse(gx4);
-+    bx4 = _mm_cvttps_epi32(b_linx4);
-+    bx4 = av_clip_uint16_sse(bx4);
++            ravgx4 = _mm_hadd_epi32(roax4, robx4);
++            ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4));
++            ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2));
++            ravgx4 = _mm_srai_epi32(ravgx4, 2);
 +
-+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \
-+g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)];                       \
-+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)];
++            gavgx4 = _mm_hadd_epi32(goax4, gobx4);
++            gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4));
++            gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2));
++            gavgx4 = _mm_srai_epi32(gavgx4, 2);
 +
-+    SAVE_COLOR(0)
-+    SAVE_COLOR(1)
-+    SAVE_COLOR(2)
-+    SAVE_COLOR(3)
++            bavgx4 = _mm_hadd_epi32(boax4, bobx4);
++            bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4));
++            bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2));
++            bavgx4 = _mm_srai_epi32(bavgx4, 2);
 +
-+#undef SAVE_COLOR
-+}
++            uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru)));
++            uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu)));
++            uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv)));
++            uox4 = _mm_srai_epi32(uox4, out_sh);
++            uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset));
++            _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128));
++
++            vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv)));
++            vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv)));
++            vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv)));
++            vox4 = _mm_srai_epi32(vox4, out_sh);
++            vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset));
++            _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128));
++        }
++    }
++
++    // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++    if (remainw) {
++        int offset = width & (int)0xfffffff8;
++        rdsty += offset;
++        rdstu += offset >> 1;
++        rdstv += offset >> 1;
++        rsrcy += offset;
++        rsrcu += offset >> 1;
++        rsrcv += offset >> 1;
++        tonemap_frame_dovi_2_420p10(rdsty, rdstu, rdstv,
++                                      rsrcy, rsrcu, rsrcv,
++                                      dstlinesize, srclinesize,
++                                      dstdepth, srcdepth,
++                                      remainw, rheight, params);
++    }
 +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS
++}
 +
 +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
 +                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
@@ -5655,7 +9050,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h
 ===================================================================
 --- /dev/null
 +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h
-@@ -0,0 +1,54 @@
+@@ -0,0 +1,68 @@
 +/*
 + * Copyright (c) 2024 Gnattu OC <gnattuoc@me.com>
 + *
@@ -5681,6 +9076,20 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h
 +
 +#include "libavfilter/vf_tonemapx.h"
 +
++X86_64_V2 void tonemap_frame_dovi_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, // 8-bit destination planes (presumably Y, U, V per the names - confirm)
++                                             const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, // 16-bit source planes
++                                             const int *dstlinesize, const int *srclinesize, // per-plane line sizes; units not visible here, presumably bytes - confirm
++                                             int dstdepth, int srcdepth, // destination / source bit depth (per the names - confirm)
++                                             int width, int height, // size of the area to process
++                                             const struct TonemapIntParams *params); // tone-mapping parameters
++
++X86_64_V2 void tonemap_frame_dovi_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, // 16-bit destination Y, U, V planes
++                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, // 16-bit source Y, U, V planes
++                                               const int *dstlinesize, const int *srclinesize, // per-plane line sizes; the SSE body halves them to index 16-bit samples, so presumably bytes - confirm
++                                               int dstdepth, int srcdepth, // output / input bit depth
++                                               int width, int height, // area to process; rows are consumed two at a time, columns eight at a time with a scalar fallback for the rest
++                                               const struct TonemapIntParams *params); // LUTs, colour coefficients and Dolby Vision data used by the conversion
++
 +X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
 +                                               const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
 +                                               const int *dstlinesize, const int *srclinesize,