From af84c83ebe544863e6d3099449249b720e7e269e Mon Sep 17 00:00:00 2001
From: gnattu
Date: Tue, 25 Jun 2024 20:40:06 +0800
Subject: [PATCH 01/27] avfilter/tonemapx: add simd optimized tonemapx

This includes NEON for ARMv8, SSE for x86-64-v2 and AVX+FMA for x86-64-v3.

Test results with 4K HEVC 10bit HLG input, encoding with libx264 veryfast using bt2390:

Intel Core i9-12900:
tonemapx.c: 57fps
tonemapx.sse: 74fps
tonemapx.avx: 77fps

Apple M1 Max:
tonemapx.c: 43fps
tonemapx.neon: 57fps

For comparison, the original zscale+tonemap simd results:

Intel Core i9-12900:
tonemap.avx: 40fps
tonemap.sse: 40fps
tonemap.c: 32fps

Apple M1 Max:
tonemap.neon: 44fps
tonemap.c: 35fps

The original implementation is so memory-heavy that dual-channel desktop CPUs easily become memory bound, due to the intermediate RGBF32 framebuffer shared with zscale. Tonemapx lowers the bandwidth requirement, which brings a significant performance gain to bandwidth-limited platforms. Even on the bandwidth-rich M1 Max it still provides a significant performance boost due to a better cache hit rate.
---
 debian/patches/0080-add-tonemapx-filter.patch | 2673 +++++++++++++++++
 debian/patches/series | 1 +
 2 files changed, 2674 insertions(+)
 create mode 100644 debian/patches/0080-add-tonemapx-filter.patch

diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch
new file mode 100644
index 00000000000..54fa1ea73e6
--- /dev/null
+++ b/debian/patches/0080-add-tonemapx-filter.patch
@@ -0,0 +1,2673 @@
+Index: FFmpeg/configure
+===================================================================
+--- FFmpeg.orig/configure
++++ FFmpeg/configure
+@@ -3772,6 +3772,7 @@ tinterlace_filter_deps="gpl"
+ tinterlace_merge_test_deps="tinterlace_filter"
+ tinterlace_pad_test_deps="tinterlace_filter"
+ tonemap_filter_deps="const_nan"
++tonemapx_filter_deps="const_nan"
+ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping"
+ tonemap_opencl_filter_deps="opencl const_nan"
+ tonemap_videotoolbox_filter_deps="metal corevideo videotoolbox const_nan"
+Index: FFmpeg/libavfilter/allfilters.c
+===================================================================
+--- FFmpeg.orig/libavfilter/allfilters.c
++++ FFmpeg/libavfilter/allfilters.c
+@@ -484,6 +484,7 @@ extern const AVFilter ff_vf_tmedian;
+ extern const AVFilter ff_vf_tmidequalizer;
+ extern const AVFilter ff_vf_tmix;
+ extern const AVFilter ff_vf_tonemap;
++extern const AVFilter ff_vf_tonemapx;
+ extern const AVFilter ff_vf_tonemap_cuda;
+ extern const AVFilter ff_vf_tonemap_opencl;
+ extern const AVFilter ff_vf_tonemap_vaapi;
+Index: FFmpeg/libavfilter/colorspace.c
+===================================================================
+--- FFmpeg.orig/libavfilter/colorspace.c
++++ FFmpeg/libavfilter/colorspace.c
+@@ -17,6 +17,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+ 
++#include "libavutil/avassert.h"
+ #include "libavutil/frame.h"
+ #include "libavutil/mastering_display_metadata.h"
+ #include "libavutil/pixdesc.h"
+@@ -355,3 +356,51 @@ float inverse_eotf_arib_b67(float x) {
+ float inverse_eotf_bt1886(float x) {
+ return x > 0.0f ?
powf(x, 1.0f / 2.4f) : 0.0f; + } ++ ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth) ++{ ++ switch (rng) { ++ case AVCOL_RANGE_UNSPECIFIED: ++ case AVCOL_RANGE_MPEG: ++ *off = 16 << (depth - 8); ++ *y_rng = 219 << (depth - 8); ++ *uv_rng = 224 << (depth - 8); ++ break; ++ case AVCOL_RANGE_JPEG: ++ *off = 0; ++ *y_rng = *uv_rng = (256 << (depth - 8)) - 1; ++ break; ++ default: ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb) ++{ ++#define N (yuv2rgb ? m : n) ++#define M (yuv2rgb ? n : m) ++ int rng, n, m, o; ++ int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); ++ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { ++ for (m = 0; m < 3; m++) { ++ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672)); ++ for (o = 1; o < 8; o++) ++ out[N][M][o] = out[N][M][0]; ++ } ++ } ++#undef N ++#undef M ++ ++ if (yuv2rgb) { ++ av_assert2(out[0][1][0] == 0); ++ av_assert2(out[2][2][0] == 0); ++ av_assert2(out[0][0][0] == out[1][0][0]); ++ av_assert2(out[0][0][0] == out[2][0][0]); ++ } else { ++ av_assert2(out[1][2][0] == out[2][0][0]); ++ } ++} +Index: FFmpeg/libavfilter/colorspace.h +=================================================================== +--- FFmpeg.orig/libavfilter/colorspace.h ++++ FFmpeg/libavfilter/colorspace.h +@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); + float inverse_eotf_arib_b67(float x); + float inverse_eotf_bt1886(float x); + ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth); ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb); + #endif +Index: FFmpeg/libavfilter/Makefile +=================================================================== +--- FFmpeg.orig/libavfilter/Makefile ++++ FFmpeg/libavfilter/Makefile +@@ -516,6 +516,7 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) + OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o + OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o + OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += vf_tonemapx.o + OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ + opencl/tonemap.o opencl/colorspace_common.o + OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o cuda/tonemap.ptx.o \ +Index: FFmpeg/libavfilter/vf_tonemapx.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/vf_tonemapx.c +@@ -0,0 +1,2555 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * tonemap algorithms ++ */ ++ ++#include ++#include ++#include ++ ++#include "libavutil/avassert.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/internal.h" ++#include "libavutil/intreadwrite.h" ++#include "libavutil/mem_internal.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/cpu.h" ++#if ARCH_AARCH64 ++# include ++# include "libavutil/aarch64/cpu.h" ++#endif ++#if ARCH_X86 ++# include ++# include ++# include ++# include "libavutil/x86/cpu.h" ++#endif ++ ++#include "avfilter.h" ++#include "colorspace.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++//#if ARCH_AARCH64 ++//#define _mm_extract_epi32(a, b) 0 ++//#define _mm_shuffle_epi32(a, b) _mm_setzero_si128() ++//#define __builtin_ia32_vec_ext_v8si(sig4, i) 0 ++//#define __builtin_ia32_extract128i256(a, b) _mm_setzero_si128() ++//#define __builtin_ia32_pshufd256(a, b) _mm256_setzero_si256() ++//#endif ++ ++#define REFERENCE_WHITE 203.0f ++#define FLOAT_EPS 1.175494351e-38f ++ ++enum TonemapAlgorithm { ++ TONEMAP_NONE, ++ TONEMAP_LINEAR, ++ TONEMAP_GAMMA, ++ TONEMAP_CLIP, ++ TONEMAP_REINHARD, ++ TONEMAP_HABLE, ++ TONEMAP_MOBIUS, ++ TONEMAP_BT2390, ++ TONEMAP_MAX, ++}; ++ ++typedef struct TonemapIntParams { ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; ++ int16_t (*yuv2rgb_coeffs)[3][3][8]; ++ int16_t (*rgb2yuv_coeffs)[3][3][8]; ++ double (*rgb2rgb_coeffs)[3][3]; ++ int rgb2rgb_passthrough; ++ const AVLumaCoefficients *coeffs, *ocoeffs; ++ double desat; ++} TonemapIntParams; ++ ++typedef struct TonemapxContext { ++ const AVClass *class; ++ ++ enum TonemapAlgorithm tonemap; ++ enum AVColorTransferCharacteristic trc; ++ enum AVColorSpace spc; ++ enum AVColorPrimaries pri; ++ enum AVColorRange range; ++ enum AVPixelFormat format; ++ char *format_str; ++ double param; ++ double desat; ++ double peak; ++ ++ const AVLumaCoefficients *coeffs, *ocoeffs; ++ ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; ++ ++ DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; ++ ++ void (*tonemap_frame_p01x_2_nv12) (uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ void (*tonemap_frame_p01x_2_p01x) (uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++} TonemapxContext; ++ ++typedef struct ThreadData { ++ AVFrame *in, *out; ++ const AVPixFmtDescriptor *desc, *odesc; ++ double peak; ++ void (*tonemap_fuc) (void *dsty, void *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++} ThreadData; ++ ++static const enum AVPixelFormat in_pix_fmts[] = { ++ AV_PIX_FMT_P010, ++ 
AV_PIX_FMT_P016, ++ AV_PIX_FMT_NONE, ++}; ++ ++static const enum AVPixelFormat out_pix_fmts[] = { ++ AV_PIX_FMT_NV12, ++ AV_PIX_FMT_P010, ++ AV_PIX_FMT_P016, ++}; ++ ++static int out_format_is_supported(enum AVPixelFormat fmt) ++{ ++ int i; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++) ++ if (out_pix_fmts[i] == fmt) ++ return 1; ++ return 0; ++} ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ AVFilterFormats *formats = ff_make_format_list(in_pix_fmts); ++ int res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ ++ if (!strcmp(s->format_str, "same")) { ++ s->format = AV_PIX_FMT_NONE; ++ } else { ++ s->format = av_get_pix_fmt(s->format_str); ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ if (out_format_is_supported(s->format)) { ++ formats = NULL; ++ res = ff_add_format(&formats, s->format); ++ if (res < 0) ++ return res; ++ } else { ++ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", ++ av_get_pix_fmt_name(s->format)); ++ return AVERROR(ENOSYS); ++ } ++ } ++ ++ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++} ++ ++static float hable(float in) ++{ ++ float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; ++ return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; ++} ++ ++static float mobius(float in, float j, double peak) ++{ ++ float a, b; ++ ++ if (in <= j) ++ return in; ++ ++ a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); ++ b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); ++ ++ return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); ++} ++ ++static float bt2390(float s, float peak) ++{ ++ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE); ++ float scale = 1.0f / peak_pq; ++ ++ // SDR peak ++ float dst_peak = 1.0f; ++ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE) * scale; ++ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE) * scale; ++ ++ float ks = 1.5f * maxLum - 0.5f; ++ float tb = (s_pq - ks) / (1.0f - ks); ++ float tb2 = tb * tb; ++ float tb3 = tb2 * tb; ++ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + ++ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + ++ (-2.0f * tb3 + 3.0f * tb2) * maxLum; ++ float sig = (s_pq < ks) ? s_pq : pb; ++ ++ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE); ++} ++ ++static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) ++{ ++ switch(alg) { ++ default: ++ case TONEMAP_NONE: ++ // do nothing ++ break; ++ case TONEMAP_LINEAR: ++ sig = sig * param / peak; ++ break; ++ case TONEMAP_GAMMA: ++ sig = sig > 0.05f ++ ? 
pow(sig / peak, 1.0f / param) ++ : sig * pow(0.05f / peak, 1.0f / param) / 0.05f; ++ break; ++ case TONEMAP_CLIP: ++ sig = av_clipf(sig * param, 0, 1.0f); ++ break; ++ case TONEMAP_HABLE: ++ sig = hable(sig) / hable(peak); ++ break; ++ case TONEMAP_REINHARD: ++ sig = sig / (sig + param) * (peak + param) / peak; ++ break; ++ case TONEMAP_MOBIUS: ++ sig = mobius(sig, param, peak); ++ break; ++ case TONEMAP_BT2390: ++ sig = bt2390(sig, peak); ++ break; ++ } ++ ++ return sig; ++} ++ ++static float linearize(float x, enum AVColorTransferCharacteristic trc_src) ++{ ++ if (trc_src == AVCOL_TRC_SMPTE2084) ++ return eotf_st2084(x, REFERENCE_WHITE); ++ else if (trc_src == AVCOL_TRC_ARIB_STD_B67) ++ return eotf_arib_b67(x); ++ else ++ return x; ++} ++ ++static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst) ++{ ++ if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10) ++ return inverse_eotf_bt1886(x); ++ else ++ return x; ++} ++ ++static int comput_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, ++ enum AVColorTransferCharacteristic trc_dst) ++{ ++ int i; ++ ++ if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); ++ if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t)))) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 32768; i++) { ++ double v1 = (i - 2048.0f) / 28672.0f; ++ double v2 = i / 32767.0f; ++ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); ++ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); ++ } ++ ++ return 0; ++} ++ ++static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) ++{ ++ int i; ++ double peak = s->lut_peak; ++ ++ if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 32768; i++) { ++ double v = (i - 2048.0f) / 28672.0f; ++ double sig = linearize(v, trc_src); ++ float mapped = mapsig(s->tonemap, sig, peak, s->param); ++ s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? 
mapped / sig : 0.0f; ++ } ++ ++ return 0; ++} ++ ++static int compute_yuv_coeffs(TonemapxContext *s, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, ++ const AVPixFmtDescriptor *idesc, ++ const AVPixFmtDescriptor *odesc, ++ enum AVColorRange irng, ++ enum AVColorRange orng) ++{ ++ double rgb2yuv[3][3], yuv2rgb[3][3]; ++ int res; ++ int y_rng, uv_rng; ++ ++ res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng, ++ irng, idesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color range %d (%s)\n", ++ irng, av_color_range_name(irng)); ++ return res; ++ } ++ ++ ff_fill_rgb2yuv_table(coeffs, rgb2yuv); ++ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); ++ ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); ++ ++ ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, ++ y_rng, uv_rng, 1); ++ ++ res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, ++ orng, odesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color range %d (%s)\n", ++ orng, av_color_range_name(orng)); ++ return res; ++ } ++ ++ ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, ++ y_rng, uv_rng, 0); ++ ++ return 0; ++} ++ ++static int compute_rgb_coeffs(TonemapxContext *s, ++ enum AVColorPrimaries iprm, ++ enum AVColorPrimaries oprm) ++{ ++ double rgb2xyz[3][3], xyz2rgb[3][3]; ++ const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm); ++ const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm); ++ ++ if (!iprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color primaries %d (%s)\n", ++ iprm, av_color_primaries_name(iprm)); ++ return AVERROR(EINVAL); ++ } ++ if (!oprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color primaries %d (%s)\n", ++ oprm, av_color_primaries_name(oprm)); ++ return AVERROR(EINVAL); ++ } ++ ++ ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); ++ ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); ++ ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); ++ ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); ++ ++ return 0; ++} ++ ++static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ int16_t sig; ++ float mapval, r_lin, g_lin, b_lin; ++ ++ /* load values */ ++ *r_out = r_in; ++ *g_out = g_in; ++ *b_out = b_in; ++ ++ /* pick the brightest component, reducing the value range as necessary ++ * to keep the entire signal in range and preventing discoloration due to ++ * out-of-bounds clipping */ ++ sig = FFMAX3(r_in, g_in, b_in); ++ ++ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ ++ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; ++ g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)]; ++ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ ++ if (!rgb2rgb_passthrough) { ++ r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; ++ g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin; ++ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; ++ } ++ ++#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) ++ /* desaturate to prevent unnatural colors */ ++ if (desat > 0) { ++ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + 
av_q2d(coeffs->cb) * b_lin; ++ float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); ++ r_lin = MIX(r_lin, luma, overbright); ++ g_lin = MIX(g_lin, luma, overbright); ++ b_lin = MIX(b_lin, luma, overbright); ++ } ++ ++ r_lin *= mapval; ++ g_lin *= mapval; ++ b_lin *= mapval; ++ ++ /*float cmin = FFMIN(FFMIN(r_lin, g_lin), b_lin); ++ if (cmin < 0.0) { ++ float luma = ocoeffs->cr * r_lin + ocoeffs->cg * g_lin + ocoeffs->cb * b_lin; ++ float coeff = cmin / (cmin - luma); ++ r_lin = MIX(r_lin, luma, coeff); ++ g_lin = MIX(g_lin, luma, coeff); ++ b_lin = MIX(b_lin, luma, coeff);avassert ++ } ++ float cmax = FFMAX(FFMAX(r_lin, g_lin), b_lin); ++ if (cmax > 1.0) { ++ r_lin /= cmax; ++ g_lin /= cmax; ++ b_lin /= cmax; ++ }*/ ++#undef MIX ++ ++ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; ++ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; ++ *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; ++} ++ ++#if ARCH_X86 ++__attribute__((target("arch=x86-64-v2"))) ++static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128i zerox4 = _mm_setzero_si128(); ++ __m128i input_lut_offset = _mm_set1_epi32(2048); ++ __m128i upper_bound = _mm_set1_epi32(32767); ++ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); ++ __m128i r, g, b, rx4, gx4, bx4; ++ ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ int i; ++ ++ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); ++ sig4 = _mm_add_epi32(sig4, input_lut_offset); ++ sig4 = _mm_min_epi32(sig4, upper_bound); ++ ++ r = _mm_add_epi32(r_in, input_lut_offset); ++ r = _mm_min_epi32(r, upper_bound); ++ g = _mm_add_epi32(g_in, input_lut_offset); ++ g = _mm_min_epi32(g, upper_bound); ++ b = _mm_add_epi32(b_in, input_lut_offset); ++ b = _mm_min_epi32(b, upper_bound); ++ ++ for (i = 0; i < 4; i++) { ++ mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; ++ r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; ++ g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; ++ b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ } ++ ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ } ++ ++ if (desat > 0) { ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; 
++ ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ } ++ ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); ++ ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); ++ ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); ++ ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = _mm_min_epi32(rx4, upper_bound); ++ rx4 = _mm_max_epi32(rx4, zerox4); ++ ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = _mm_min_epi32(gx4, upper_bound); ++ gx4 = _mm_max_epi32(gx4, zerox4); ++ ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = _mm_min_epi32(bx4, upper_bound); ++ bx4 = _mm_max_epi32(bx4, zerox4); ++ ++ for (i = 0; i < 4; i++) { ++ r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; ++ g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; ++ b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ } ++} ++ ++__attribute__((target("arch=x86-64-v3"))) ++static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m256i sig8; ++ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; ++ __m256 offset = _mm256_set1_ps(0.5f); ++ __m256i zerox8 = _mm256_setzero_si256(); ++ __m256i input_lut_offset = _mm256_set1_epi32(2048); ++ __m256i upper_bound = _mm256_set1_epi32(32767); ++ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); ++ __m256i r, g, b, rx8, gx8, bx8; ++ ++ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; ++ int i; ++ ++ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); ++ sig8 = _mm256_add_epi32(sig8, input_lut_offset); ++ sig8 = _mm256_min_epi32(sig8, upper_bound); ++ ++ r = _mm256_add_epi32(r_in, input_lut_offset); ++ r = _mm256_min_epi32(r, upper_bound); ++ g = _mm256_add_epi32(g_in, input_lut_offset); ++ g = _mm256_min_epi32(g, upper_bound); ++ b = _mm256_add_epi32(b_in, input_lut_offset); ++ b = _mm256_min_epi32(b, upper_bound); ++ ++ for (i = 0; i < 8; i++) { ++ mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; ++ r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; ++ g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; ++ b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; ++ } ++ ++ mapvalx8 = _mm256_loadu_ps(mapval8); ++ r_linx8 = _mm256_loadu_ps(r_lin8); ++ g_linx8 = _mm256_loadu_ps(g_lin8); ++ b_linx8 = _mm256_loadu_ps(b_lin8); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx8 = _mm256_mul_ps(r_linx8, 
_mm256_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); ++ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); ++ ++ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); ++ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); ++ ++ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); ++ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); ++ } ++ ++ if (desat > 0) { ++ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); ++ __m256 desat8 = _mm256_set1_ps((float)desat); ++ __m256 luma8 = _mm256_set1_ps(0); ++ __m256 overbright8; ++ ++ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); ++ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8); ++ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); ++ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); ++ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); ++ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); ++ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); ++ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); ++ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); ++ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); ++ } ++ ++ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); ++ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); ++ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); ++ ++ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); ++ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); ++ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); ++ ++ rx8 = _mm256_cvttps_epi32(r_linx8); ++ rx8 = _mm256_min_epi32(rx8, upper_bound); ++ rx8 = _mm256_max_epi32(rx8, zerox8); ++ ++ gx8 = _mm256_cvttps_epi32(g_linx8); ++ gx8 = _mm256_min_epi32(gx8, upper_bound); ++ gx8 = _mm256_max_epi32(gx8, zerox8); ++ ++ bx8 = _mm256_cvttps_epi32(b_linx8); ++ bx8 = _mm256_min_epi32(bx8, upper_bound); ++ bx8 = _mm256_max_epi32(bx8, zerox8); ++ ++ for (i = 0; i < 8; i++) { ++ r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; ++ g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; ++ b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++ } ++} ++#endif ++ ++#if ARCH_AARCH64 ++static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ uint16x8_t sig8; ++ float32x4_t mapvalx4a; ++ float32x4_t mapvalx4b; ++ float32x4_t r_linx4a; ++ float32x4_t r_linx4b; ++ float32x4_t g_linx4a; ++ float32x4_t g_linx4b; ++ float32x4_t b_linx4a; ++ float32x4_t b_linx4b; ++ float32x4_t offset = vdupq_n_f32(0.5f); ++ int32x4_t output_upper_bound = vdupq_n_s32(32767); ++ int32x4_t zerox4 = vdupq_n_s32(0); ++ int16x8_t input_lut_offset = vdupq_n_s16(2048); ++ int16x8_t input_upper_bound = vdupq_n_s16(32767); ++ int16x8_t r, g, b; ++ int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++ ++ float 
mapval4a[4], mapval4b[4], r_lin4a[4], r_lin4b[4], g_lin4a[4], g_lin4b[4], b_lin4a[4], b_lin4b[4]; ++ ++ r = vreinterpretq_s16_u16(r_in); ++ g = vreinterpretq_s16_u16(g_in); ++ b = vreinterpretq_s16_u16(b_in); ++ ++ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); ++ sig8 = vaddq_s16(sig8, input_lut_offset); ++ sig8 = vminq_s16(sig8, input_upper_bound); ++ ++ r = vaddq_s16(r, input_lut_offset); ++ r = vminq_s16(r, input_upper_bound); ++ g = vaddq_s16(g, input_lut_offset); ++ g = vminq_s16(g, input_upper_bound); ++ b = vaddq_s16(b, input_lut_offset); ++ b = vminq_s16(b, input_upper_bound); ++ ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_u16(vget_low_u16(sig8), i)]; \ ++ mapval4b[i] = tonemap_lut[vget_lane_u16(vget_high_u16(sig8), i)]; \ ++ r_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(r), i)]; \ ++ r_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(r), i)]; \ ++ g_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(g), i)]; \ ++ g_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(g), i)]; \ ++ b_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(b), i)]; \ ++ b_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(b), i)]; ++ ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++#undef LOAD_LUT ++ ++ mapvalx4a = vld1q_f32(mapval4a); ++ mapvalx4b = vld1q_f32(mapval4b); ++ r_linx4a = vld1q_f32(r_lin4a); ++ r_linx4b = vld1q_f32(r_lin4b); ++ g_linx4a = vld1q_f32(g_lin4a); ++ g_linx4b = vld1q_f32(g_lin4b); ++ b_linx4a = vld1q_f32(b_lin4a); ++ b_linx4b = vld1q_f32(b_lin4b); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx4a = vmulq_n_f32(r_linx4a, (float)(*rgb2rgb)[0][0]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, g_linx4a, (float)(*rgb2rgb)[0][1]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, b_linx4a, (float)(*rgb2rgb)[0][2]); ++ r_linx4b = vmulq_n_f32(r_linx4b, (float)(*rgb2rgb)[0][0]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, g_linx4b, (float)(*rgb2rgb)[0][1]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, b_linx4b, (float)(*rgb2rgb)[0][2]); ++ ++ g_linx4a = vmulq_n_f32(g_linx4a, (float)(*rgb2rgb)[1][1]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, r_linx4a, (float)(*rgb2rgb)[1][0]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, b_linx4a, (float)(*rgb2rgb)[1][2]); ++ g_linx4b = vmulq_n_f32(g_linx4b, (float)(*rgb2rgb)[1][1]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, r_linx4b, (float)(*rgb2rgb)[1][0]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, b_linx4b, (float)(*rgb2rgb)[1][2]); ++ ++ b_linx4a = vmulq_n_f32(b_linx4a, (float)(*rgb2rgb)[2][2]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, r_linx4a, (float)(*rgb2rgb)[2][0]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, g_linx4a, (float)(*rgb2rgb)[2][1]); ++ b_linx4b = vmulq_n_f32(b_linx4b, (float)(*rgb2rgb)[2][2]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, r_linx4b, (float)(*rgb2rgb)[2][0]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, g_linx4b, (float)(*rgb2rgb)[2][1]); ++ } ++ ++ if (desat > 0) { ++ float32x4_t eps_x4 = vdupq_n_f32(FLOAT_EPS); ++ float32x4_t desat4 = vdupq_n_f32((float)desat); ++ float32x4_t luma4 = vdupq_n_f32(0); ++ float32x4_t overbright4; ++ // Group A ++ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4); ++ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4); ++ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4); ++ g_linx4a = vmlaq_f32(g_linx4a, luma4, 
overbright4); ++ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4); ++ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4); ++ // Group B ++ luma4 = vdupq_n_f32(0); ++ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4); ++ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4); ++ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4); ++ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4); ++ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4); ++ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4); ++ } ++ ++ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a); ++ g_linx4a = vmulq_f32(g_linx4a, mapvalx4a); ++ b_linx4a = vmulq_f32(b_linx4a, mapvalx4a); ++ ++ r_linx4b = vmulq_f32(r_linx4b, mapvalx4b); ++ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); ++ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); ++ ++ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767); ++ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767); ++ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767); ++ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767); ++ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767); ++ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767); ++ ++ rx4a = vcvtq_s32_f32(r_linx4a); ++ rx4a = vminq_s32(rx4a, output_upper_bound); ++ rx4a = vmaxq_s32(rx4a, zerox4); ++ gx4a = vcvtq_s32_f32(g_linx4a); ++ gx4a = vminq_s32(gx4a, output_upper_bound); ++ gx4a = vmaxq_s32(gx4a, zerox4); ++ bx4a = vcvtq_s32_f32(b_linx4a); ++ bx4a = vminq_s32(bx4a, output_upper_bound); ++ bx4a = vmaxq_s32(bx4a, zerox4); ++ rx4b = vcvtq_s32_f32(r_linx4b); ++ rx4b = vminq_s32(rx4b, output_upper_bound); ++ rx4b = vmaxq_s32(rx4b, zerox4); ++ gx4b = vcvtq_s32_f32(g_linx4b); ++ gx4b = vminq_s32(gx4b, output_upper_bound); ++ gx4b = vmaxq_s32(gx4b, zerox4); ++ bx4b = vcvtq_s32_f32(b_linx4b); ++ bx4b = vminq_s32(bx4b, output_upper_bound); ++ bx4b = vmaxq_s32(bx4b, zerox4); ++ ++ r_out[0] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 0)]; ++ r_out[1] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 1)]; ++ r_out[2] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 0)]; ++ r_out[3] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 1)]; ++ r_out[4] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 0)]; ++ r_out[5] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 1)]; ++ r_out[6] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 0)]; ++ r_out[7] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 1)]; ++ ++ g_out[0] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 0)]; ++ g_out[1] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 1)]; ++ g_out[2] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 0)]; ++ g_out[3] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 1)]; ++ g_out[4] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 0)]; ++ g_out[5] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 1)]; ++ g_out[6] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 0)]; ++ g_out[7] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 1)]; ++ ++ b_out[0] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 0)]; ++ b_out[1] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 1)]; ++ b_out[2] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 0)]; ++ b_out[3] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 1)]; ++ b_out[4] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 0)]; ++ b_out[5] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 1)]; ++ b_out[6] = 
delin_lut[vget_lane_s32(vget_high_s32(bx4b), 0)]; ++ b_out[7] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 1)]; ++} ++#endif ++ ++// See also libavfilter/colorspacedsp_template.c ++static void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, ++ uint8_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, 
params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} ++ ++#if ARCH_X86 ++__attribute__((target("arch=x86-64-v2"))) ++static inline __m128i av_clip_int16_sse(__m128i a) ++{ ++ __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); ++ __m128i mask = _mm_set1_epi32(~0xFFFF); ++ __m128i condition = _mm_and_si128(add_result, mask); ++ __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++ ++ __m128i shifted = _mm_srai_epi32(a, 31); ++ __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++ ++ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); ++} ++ ++__attribute__((target("arch=x86-64-v3"))) ++static inline __m256i av_clip_int16_avx(__m256i a) ++{ ++ __m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); ++ __m256i mask = _mm256_set1_epi32(~0xFFFF); ++ __m256i condition = _mm256_and_si256(add_result, mask); ++ __m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); ++ ++ __m256i shifted = _mm256_srai_epi32(a, 31); ++ __m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); ++ ++ return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); ++} ++ ++__attribute__((target("arch=x86-64-v2"))) ++static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, ++ uint8_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++// const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++// const int out_sh2 = 16 - out_depth; ++ ++ int 
cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ int ip; ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = _mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); ++ } ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); ++ ++ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); ++ ++ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); ++ ++ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ g1x4a = 
_mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); ++ ++ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); ++ ++ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); ++ ++ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); ++ ++ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); ++ ++ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); ++ ++ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); ++ ++ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); ++ ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 0; ip < 8; ip ++) { ++ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + 
g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++ ++__attribute__((target("arch=x86-64-v3"))) ++static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, ++ uint8_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 16 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++// const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++// const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ int ip; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = 
_mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, 
rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 0; ip < 16; ip ++) { ++ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++#if ARCH_AARCH64 ++static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, ++ uint8_t *dstuv, 
++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ int ip = 0; ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); ++ ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, 
crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); ++ ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); ++ ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); ++ ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); ++ ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); ++ ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 0; ip < 8; ip ++) { ++ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, ++ uint16_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = 
(*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * 
cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++} ++ ++#if ARCH_X86 ++__attribute__((target("arch=x86-64-v2"))) ++static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, ++ uint16_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ int ip; ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = _mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); ++ } ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); ++ ++ ux4a = 
_mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); ++ ++ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); ++ ++ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); ++ ++ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); ++ ++ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); ++ ++ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); ++ ++ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); ++ ++ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); ++ ++ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); ++ ++ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = 
_mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); ++ ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 0; ip < 8; ip ++) { ++ dsty[x + ip] = av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++ ++__attribute__((target("arch=x86-64-v3"))) ++static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, ++ uint16_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = 
(*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ int ip; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = 
_mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 
0; ip < 16; ip ++) { ++ dsty[x + ip] = av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++#if ARCH_AARCH64 ++static void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, ++ uint16_t *dstuv, ++ const uint16_t *srcy, ++ const uint16_t *srcuv, ++ const int *dstlinesize, ++ const int *srclinesize, ++ int dstdepth, ++ int srcdepth, ++ int width, ++ int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = 
vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ int ip = 0; ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); ++ ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); ++ ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); ++ ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); ++ ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); ++ ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); ++ ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ for (ip = 0; ip < 8; ip ++) { ++ dsty[x + ip] = av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ if (ip & 1) { // is odd ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++ } ++ } ++ ++ // Process remaining pixels cannot 
fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ TonemapxContext *s = ctx->priv; ++ ThreadData *td = arg; ++ AVFrame *in = td->in; ++ AVFrame *out = td->out; ++ const AVPixFmtDescriptor *desc = td->desc; ++ const AVPixFmtDescriptor *odesc = td->odesc; ++ const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); ++ const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; ++ const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; ++ int y, x; ++ ++ TonemapIntParams params = { ++ .lut_peak = s->lut_peak, ++ .lin_lut = s->lin_lut, ++ .tonemap_lut = s->tonemap_lut, ++ .delin_lut = s->delin_lut, ++ .in_yuv_off = s->in_yuv_off, ++ .out_yuv_off = s->out_yuv_off, ++ .yuv2rgb_coeffs = &s->yuv2rgb_coeffs, ++ .rgb2yuv_coeffs = &s->rgb2yuv_coeffs, ++ .rgb2rgb_coeffs = &s->rgb2rgb_coeffs, ++ .rgb2rgb_passthrough = in->color_primaries == out->color_primaries, ++ .coeffs = s->coeffs, ++ .ocoeffs = s->ocoeffs, ++ .desat = s->desat, ++ }; ++ ++ av_log(s, AV_LOG_DEBUG, "dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ td->tonemap_fuc(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *ctx = link->dst; ++ TonemapxContext *s = ctx->priv; ++ AVFilterLink *outlink = ctx->outputs[0]; ++ AVFrame *out; ++ const AVPixFmtDescriptor *desc; ++ const AVPixFmtDescriptor *odesc; ++ int ret; ++ double peak = s->peak; ++ const AVLumaCoefficients *coeffs; ++ ThreadData td; ++ ++ desc = av_pix_fmt_desc_get(link->format); ++ odesc = av_pix_fmt_desc_get(outlink->format); ++ if (!desc || !odesc) { ++ av_frame_free(&in); ++ return AVERROR_BUG; ++ } ++ ++ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); ++ if (!out) { ++ av_frame_free(&in); ++ return AVERROR(ENOMEM); ++ } ++ ++ if ((ret = av_frame_copy_props(out, in)) < 0) ++ goto fail; ++ ++ /* read peak from side data if not passed in */ ++ if (!peak) { ++ peak = ff_determine_signal_peak(in); ++ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); ++ } ++ ++ out->color_trc = s->trc; ++ out->colorspace = s->spc; ++ out->color_primaries = s->pri; ++ out->color_range = s->range; ++ ++ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) ++ in->color_trc = AVCOL_TRC_SMPTE2084; ++ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) ++ out->color_trc = AVCOL_TRC_BT709; ++ ++ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) ++ in->colorspace = AVCOL_SPC_BT2020_NCL; ++ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) ++ out->colorspace = AVCOL_SPC_BT709; ++ ++ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ in->color_primaries = AVCOL_PRI_BT2020; ++ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ out->color_primaries = AVCOL_PRI_BT709; ++ ++ if 
(in->color_range == AVCOL_RANGE_UNSPECIFIED) ++ in->color_range = AVCOL_RANGE_MPEG; ++ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) ++ out->color_range = AVCOL_RANGE_MPEG; ++ ++ if (!s->lin_lut || !s->delin_lut) { ++ if ((ret = comput_trc_luts(s, in->color_trc, out->color_trc)) < 0) ++ goto fail; ++ } ++ ++ if (!s->tonemap_lut || s->lut_peak != peak) { ++ s->lut_peak = peak; ++ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) ++ goto fail; ++ } ++ ++ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); ++ if (s->coeffs != coeffs) { ++ s->coeffs = coeffs; ++ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); ++ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, ++ in->color_range, out->color_range)) < 0) ++ goto fail; ++ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) ++ goto fail; ++ } ++ ++ /* do the tonemap */ ++ td.in = in; ++ td.out = out; ++ td.desc = desc; ++ td.odesc = odesc; ++ td.peak = peak; ++ td.tonemap_fuc = odesc->comp[0].depth == 8 ? s->tonemap_frame_p01x_2_nv12 : s->tonemap_frame_p01x_2_p01x; ++ ctx->internal->execute(ctx, filter_slice, &td, NULL, ++ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); ++ ++ av_frame_free(&in); ++ ++ ff_update_hdr_metadata(out, peak); ++ ++ return ff_filter_frame(outlink, out); ++fail: ++ av_frame_free(&in); ++ av_frame_free(&out); ++ return ret; ++} ++ ++static void uninit(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ ++ av_freep(&s->lin_lut); ++ av_freep(&s->delin_lut); ++ av_freep(&s->tonemap_lut); ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ int cpu_flags = av_get_cpu_flags(); ++ ++#if ARCH_AARCH64 ++ if (have_neon(cpu_flags)) { ++ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_neon; ++ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_neon; ++ } ++#elif ARCH_X86 ++ if (X86_SSE42(cpu_flags)) { ++ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_sse; ++ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_sse; ++ } ++ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { ++ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_avx; ++ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_avx; ++ } ++#endif ++ ++ if (!s->tonemap_frame_p01x_2_nv12) { ++ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12; ++ } ++ ++ if (!s->tonemap_frame_p01x_2_p01x) { ++ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010; ++ } ++ ++ switch(s->tonemap) { ++ case TONEMAP_GAMMA: ++ if (isnan(s->param)) ++ s->param = 1.8f; ++ break; ++ case TONEMAP_REINHARD: ++ if (!isnan(s->param)) ++ s->param = (1.0f - s->param) / s->param; ++ break; ++ case TONEMAP_MOBIUS: ++ if (isnan(s->param)) ++ s->param = 0.3f; ++ break; ++ } ++ ++ if (isnan(s->param)) ++ s->param = 1.0f; ++ ++ return 0; ++} ++ ++#define OFFSET(x) offsetof(TonemapxContext, x) ++#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM ++static const AVOption tonemapx_options[] = { ++ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" }, ++ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, "tonemap" }, ++ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, "tonemap" }, ++ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, "tonemap" }, ++ { 
"clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, "tonemap" }, ++ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" }, ++ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, "tonemap" }, ++ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, "tonemap" }, ++ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, "tonemap" }, ++ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, "transfer" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" }, ++ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, ++ { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, "primaries" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" }, ++ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "full", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "format", "output format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, ++ { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, ++ { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { NULL } ++}; ++ ++AVFILTER_DEFINE_CLASS(tonemapx); ++ ++static const AVFilterPad tonemapx_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++}; ++ ++static const AVFilterPad tonemapx_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++}; ++ ++AVFilter ff_vf_tonemapx = { ++ .name = "tonemapx", ++ .description = NULL_IF_CONFIG_SMALL("HDR to SDR tonemapping"), ++ .init = init, ++ .uninit = uninit, ++ .priv_size = sizeof(TonemapxContext), ++ .priv_class = &tonemapx_class, ++ FILTER_INPUTS(tonemapx_inputs), ++ FILTER_OUTPUTS(tonemapx_outputs), ++ FILTER_QUERY_FUNC(query_formats), ++ .flags = AVFILTER_FLAG_SLICE_THREADS, ++}; diff --git 
a/debian/patches/series b/debian/patches/series index 89f73782901..62c02c7603e 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -77,3 +77,4 @@ 0077-add-detection-of-dtsx.patch 0078-add-detection-of-atmos-in-eac3.patch 0079-add-detection-of-atmos-in-truehd.patch +0080-add-tonemapx-filter.patch From bbb6a126159f35ab863591d52a8e2d2496f66f74 Mon Sep 17 00:00:00 2001 From: gnattu Date: Tue, 25 Jun 2024 21:38:05 +0800 Subject: [PATCH 02/27] avfilter/tonmapx: fix gcc build --- debian/patches/0080-add-tonemapx-filter.patch | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 54fa1ea73e6..b148925e196 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -34,7 +34,7 @@ Index: FFmpeg/libavfilter/colorspace.c #include "libavutil/frame.h" #include "libavutil/mastering_display_metadata.h" #include "libavutil/pixdesc.h" -@@ -355,3 +356,51 @@ float inverse_eotf_arib_b67(float x) { +@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { float inverse_eotf_bt1886(float x) { return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; } @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,2555 @@ +@@ -0,0 +1,2547 @@ +/* + * This file is part of FFmpeg. + * @@ -168,14 +168,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#include "internal.h" +#include "video.h" + -+//#if ARCH_AARCH64 -+//#define _mm_extract_epi32(a, b) 0 -+//#define _mm_shuffle_epi32(a, b) _mm_setzero_si128() -+//#define __builtin_ia32_vec_ext_v8si(sig4, i) 0 -+//#define __builtin_ia32_extract128i256(a, b) _mm_setzero_si128() -+//#define __builtin_ia32_pshufd256(a, b) _mm256_setzero_si256() -+//#endif -+ +#define REFERENCE_WHITE 203.0f +#define FLOAT_EPS 1.175494351e-38f + @@ -804,7 +796,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + double (*rgb2rgb)[3][3], + int rgb2rgb_passthrough) +{ -+ uint16x8_t sig8; ++ int16x8_t sig8; + float32x4_t mapvalx4a; + float32x4_t mapvalx4b; + float32x4_t r_linx4a; @@ -839,14 +831,14 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b = vminq_s16(b, input_upper_bound); + + // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_u16(vget_low_u16(sig8), i)]; \ -+ mapval4b[i] = tonemap_lut[vget_lane_u16(vget_high_u16(sig8), i)]; \ -+ r_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(r), i)]; \ -+ r_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(r), i)]; \ -+ g_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(g), i)]; \ -+ g_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(g), i)]; \ -+ b_lin4a[i] = lin_lut[vget_lane_u16(vget_low_u16(b), i)]; \ -+ b_lin4b[i] = lin_lut[vget_lane_u16(vget_high_u16(b), i)]; ++#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ ++ mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ ++ r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ ++ r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ ++ g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ ++ g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ ++ b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ ++ b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; + + LOAD_LUT(0) + LOAD_LUT(1) From 8e42663a116b820af79c0faa77d935f9a6048c63 Mon Sep 17 00:00:00 
2001 From: gnattu Date: Tue, 25 Jun 2024 23:50:58 +0800 Subject: [PATCH 03/27] avfilter/tonemapx: fix old gcc build --- debian/patches/0080-add-tonemapx-filter.patch | 124 +++++++++++------- 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index b148925e196..e4b9bcd3deb 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,2547 @@ +@@ -0,0 +1,2579 @@ +/* + * This file is part of FFmpeg. + * @@ -171,6 +171,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#define REFERENCE_WHITE 203.0f +#define FLOAT_EPS 1.175494351e-38f + ++#if defined(__GNUC__) || defined(__clang__) ++# if (__GNUC__ >= 11) || (__clang_major__ >= 12) ++# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) ++# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) ++# else ++# define X86_64_V2 __attribute__((target("arch=nehalem"))) ++# define X86_64_V3 __attribute__((target("arch=haswell"))) ++# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) ++#endif // ifdef __GNUC__ ++ +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, @@ -575,8 +585,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +#if ARCH_X86 -+__attribute__((target("arch=x86-64-v2"))) -+static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, + int16_t *r_out, int16_t *g_out, int16_t *b_out, + float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, + const AVLumaCoefficients *coeffs, @@ -607,12 +616,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b = _mm_add_epi32(b_in, input_lut_offset); + b = _mm_min_epi32(b, upper_bound); + -+ for (i = 0; i < 4; i++) { -+ mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; -+ r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; -+ g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; -+ b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; -+ } ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ ++#undef LOAD_LUT + + mapvalx4 = _mm_loadu_ps(mapval4); + r_linx4 = _mm_loadu_ps(r_lin4); @@ -676,15 +691,19 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + bx4 = _mm_min_epi32(bx4, upper_bound); + bx4 = _mm_max_epi32(bx4, zerox4); + -+ for (i = 0; i < 4; i++) { -+ r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; -+ g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; -+ b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; -+ } ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ ++#undef SAVE_COLOR +} + -+__attribute__((target("arch=x86-64-v3"))) -+static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, ++X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, + int16_t *r_out, int16_t *g_out, int16_t *b_out, + float 
*lin_lut, float *tonemap_lut, uint16_t *delin_lut, + const AVLumaCoefficients *coeffs, @@ -715,12 +734,21 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b = _mm256_add_epi32(b_in, input_lut_offset); + b = _mm256_min_epi32(b, upper_bound); + -+ for (i = 0; i < 8; i++) { -+ mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; -+ r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; -+ g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; -+ b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; -+ } ++#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ ++r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ ++g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ ++b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; ++ ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ LOAD_LUT(4) ++ LOAD_LUT(5) ++ LOAD_LUT(6) ++ LOAD_LUT(7) ++ ++#undef LOAD_LUT + + mapvalx8 = _mm256_loadu_ps(mapval8); + r_linx8 = _mm256_loadu_ps(r_lin8); @@ -779,11 +807,20 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + bx8 = _mm256_min_epi32(bx8, upper_bound); + bx8 = _mm256_max_epi32(bx8, zerox8); + -+ for (i = 0; i < 8; i++) { -+ r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; -+ g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; -+ b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; -+ } ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ ++g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ ++b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++ ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ SAVE_COLOR(4) ++ SAVE_COLOR(5) ++ SAVE_COLOR(6) ++ SAVE_COLOR(7) ++ ++#undef SAVE_COLOR +} +#endif + @@ -832,18 +869,19 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ -+ mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ -+ r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ -+ r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ -+ g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ -+ g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ -+ b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ -+ b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; ++mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ ++r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ ++r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ ++g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ ++g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ ++b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ ++b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; + + LOAD_LUT(0) + LOAD_LUT(1) + LOAD_LUT(2) + LOAD_LUT(3) ++ +#undef LOAD_LUT + + mapvalx4a = vld1q_f32(mapval4a); @@ -1070,8 +1108,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +#if ARCH_X86 -+__attribute__((target("arch=x86-64-v2"))) -+static inline __m128i av_clip_int16_sse(__m128i a) ++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) +{ + __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); + __m128i mask = _mm_set1_epi32(~0xFFFF); @@ -1084,8 +1121,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); +} + -+__attribute__((target("arch=x86-64-v3"))) -+static inline __m256i av_clip_int16_avx(__m256i a) ++X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) +{ + __m256i 
add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); + __m256i mask = _mm256_set1_epi32(~0xFFFF); @@ -1098,8 +1134,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); +} + -+__attribute__((target("arch=x86-64-v2"))) -+static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, ++X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, + uint8_t *dstuv, + const uint16_t *srcy, + const uint16_t *srcuv, @@ -1319,8 +1354,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+__attribute__((target("arch=x86-64-v3"))) -+static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, ++X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, + uint8_t *dstuv, + const uint16_t *srcy, + const uint16_t *srcuv, @@ -1793,8 +1827,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +#if ARCH_X86 -+__attribute__((target("arch=x86-64-v2"))) -+static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, ++X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, + uint16_t *dstuv, + const uint16_t *srcy, + const uint16_t *srcuv, @@ -2016,8 +2049,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+__attribute__((target("arch=x86-64-v3"))) -+static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, ++X86_64_V3 static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, + uint16_t *dstuv, + const uint16_t *srcy, + const uint16_t *srcuv, From 49c5eda16a1245cf40972d322af68bd7ab20a3fc Mon Sep 17 00:00:00 2001 From: gnattu Date: Wed, 26 Jun 2024 00:45:38 +0800 Subject: [PATCH 04/27] avfilter/tonmapx: use better target for old gcc --- debian/patches/0080-add-tonemapx-filter.patch | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index e4b9bcd3deb..8afc0e8e932 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -176,10 +176,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) +# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) +# else -+# define X86_64_V2 __attribute__((target("arch=nehalem"))) -+# define X86_64_V3 __attribute__((target("arch=haswell"))) ++# define X86_64_V2 __attribute__((target("sse4.2"))) ++# define X86_64_V3 __attribute__((target("avx2,fma"))) +# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) -+#endif // ifdef __GNUC__ ++#endif // defined(__GNUC__) || defined(__clang__) + +enum TonemapAlgorithm { + TONEMAP_NONE, @@ -2262,10 +2262,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + rsrcy += offset; + rsrcuv += offset; + tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} +#endif From 940908c2281d8b0d851283958b3814cfefcbde37 Mon Sep 17 00:00:00 2001 From: gnattu Date: Thu, 27 Jun 2024 01:05:32 +0800 Subject: [PATCH 05/27] avfilter/tonmapx: disable avx for windows gcc-mingw compiles this badly on windows --- debian/patches/0080-add-tonemapx-filter.patch | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 8afc0e8e932..39981f04a51 100644 --- 
a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,2579 @@ +@@ -0,0 +1,2585 @@ +/* + * This file is part of FFmpeg. + * @@ -181,6 +181,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) +#endif // defined(__GNUC__) || defined(__clang__) + ++#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) ++#define OS_WIN ++#endif ++ +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, @@ -2594,11 +2598,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_sse; + s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_sse; + } ++#ifndef OS_WIN + if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_avx; + s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_avx; + } +#endif ++#endif + + if (!s->tonemap_frame_p01x_2_nv12) { + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12; From 82118c24d44f8659feeb5908b31eb573f037eb09 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 01:30:08 +0800 Subject: [PATCH 06/27] avfilter/tonmapx: improve memory store This improves perforamance for all platforms. This also makes AVX useable on Windows with compiler flags `-Wa,-muse-unaligned-vector-move` --- debian/patches/0080-add-tonemapx-filter.patch | 1036 +++++++++++++---- 1 file changed, 807 insertions(+), 229 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 39981f04a51..bb0cdc01160 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,2585 @@ +@@ -0,0 +1,3153 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -154,8 +154,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#if ARCH_AARCH64 +# include +# include "libavutil/aarch64/cpu.h" ++#define __builtin_ia32_extract128i256(a, b) _mm_setzero_si128() ++#define __builtin_ia32_permdi256(a, b) _mm256_setzero_si256() +#endif -+#if ARCH_X86 ++#if ARCH_X86 || ARCH_AARCH64 +# include +# include +# include @@ -181,10 +183,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) +#endif // defined(__GNUC__) || defined(__clang__) + -+#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) -+#define OS_WIN -+#endif -+ +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, @@ -588,14 +586,27 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; +} + -+#if ARCH_X86 ++#if ARCH_X86 || ARCH_AARCH64 ++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) ++{ ++ __m128i mask = _mm_set1_epi32(0x7FFF); ++ __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); ++ ++ __m128i zero = _mm_setzero_si128(); ++ __m128i cmp = _mm_cmpeq_epi32(condition, zero); ++ ++ __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); ++ __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++ ++ return result; ++} +X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) +{ + __m128i sig4; + __m128 mapvalx4, r_linx4, g_linx4, b_linx4; @@ -607,18 +618,20 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i r, g, b, rx4, gx4, bx4; + + float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ int r4[4], g4[4], b4[4], s4[4]; ++ int lr[4], lg[4], lb[4]; + int i; + + sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); + sig4 = _mm_add_epi32(sig4, input_lut_offset); -+ sig4 = _mm_min_epi32(sig4, upper_bound); ++ sig4 = av_clip_uint16_sse(sig4); + + r = _mm_add_epi32(r_in, input_lut_offset); -+ r = _mm_min_epi32(r, upper_bound); ++ r = av_clip_uint16_sse(r); + g = _mm_add_epi32(g_in, input_lut_offset); -+ g = _mm_min_epi32(g, upper_bound); ++ g = av_clip_uint16_sse(g); + b = _mm_add_epi32(b_in, input_lut_offset); -+ b = _mm_min_epi32(b, upper_bound); ++ b = av_clip_uint16_sse(b); + + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ @@ -684,16 +697,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b_linx4 = _mm_add_ps(b_linx4, offset); + + rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = _mm_min_epi32(rx4, upper_bound); -+ rx4 = _mm_max_epi32(rx4, zerox4); -+ ++ rx4 = av_clip_uint16_sse(rx4); + gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = _mm_min_epi32(gx4, upper_bound); -+ gx4 = _mm_max_epi32(gx4, zerox4); -+ ++ gx4 = av_clip_uint16_sse(gx4); + bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = _mm_min_epi32(bx4, upper_bound); -+ bx4 = _mm_max_epi32(bx4, zerox4); ++ bx4 = av_clip_uint16_sse(bx4); + +#define SAVE_COLOR(i) r_out[i] = 
delin_lut[_mm_extract_epi32(rx4, i)]; \ +g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ @@ -708,12 +716,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) +{ + __m256i sig8; + __m256 mapvalx8, r_linx8, g_linx8, b_linx8; @@ -730,13 +738,17 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); + sig8 = _mm256_add_epi32(sig8, input_lut_offset); + sig8 = _mm256_min_epi32(sig8, upper_bound); ++ sig8 = _mm256_max_epi32(sig8, zerox8); + + r = _mm256_add_epi32(r_in, input_lut_offset); + r = _mm256_min_epi32(r, upper_bound); ++ r = _mm256_max_epi32(r, zerox8); + g = _mm256_add_epi32(g_in, input_lut_offset); + g = _mm256_min_epi32(g, upper_bound); ++ g = _mm256_max_epi32(g, zerox8); + b = _mm256_add_epi32(b_in, input_lut_offset); + b = _mm256_min_epi32(b, upper_bound); ++ b = _mm256_max_epi32(b, zerox8); + +#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ +r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ @@ -863,13 +875,17 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); + sig8 = vaddq_s16(sig8, input_lut_offset); + sig8 = vminq_s16(sig8, input_upper_bound); ++ sig8 = vmaxq_s16(sig8, zerox4); + + r = vaddq_s16(r, input_lut_offset); + r = vminq_s16(r, input_upper_bound); ++ r = vmaxq_s16(r, zerox4); + g = vaddq_s16(g, input_lut_offset); + g = vminq_s16(g, input_upper_bound); ++ g = vmaxq_s16(g, zerox4); + b = vaddq_s16(b, input_lut_offset); + b = vminq_s16(b, input_upper_bound); ++ b = vmaxq_s16(b, zerox4); + + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ @@ -1014,16 +1030,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#endif + +// See also libavfilter/colorspacedsp_template.c -+static void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, -+ uint8_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, ++static void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, + const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; @@ -1111,7 +1122,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+#if ARCH_X86 ++#if ARCH_X86 || ARCH_AARCH64 +X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) +{ + __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); @@ -1138,17 +1149,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); +} + -+X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, -+ uint8_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int 
*dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; @@ -1188,22 +1194,33 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; ++ + __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); + __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); + __m128i cyx4 = _mm_set1_epi32(cy); + __m128i rndx4 = _mm_set1_epi32(in_rnd); + __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; -+ int ip; -+ __m128i uvx8, uvx4a, uvx4b; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + + y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); + y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); @@ -1235,27 +1252,25 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r0x4a = _mm_add_epi32(r0x4a, rndx4); + r0x4a = _mm_srai_epi32(r0x4a, in_sh); + r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r1x4a = _mm_add_epi32(r1x4a, rndx4); + r1x4a = _mm_srai_epi32(r1x4a, in_sh); + r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g0x4a = _mm_add_epi32(g0x4a, rndx4); + g0x4a = _mm_srai_epi32(g0x4a, in_sh); + g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g1x4a = _mm_add_epi32(g1x4a, rndx4); @@ -1263,51 +1278,45 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_mullo_epi32(y0x4a, 
cyx4); + b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b0x4a = _mm_add_epi32(b0x4a, rndx4); + b0x4a = _mm_srai_epi32(b0x4a, in_sh); + b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b1x4a = _mm_add_epi32(b1x4a, rndx4); + b1x4a = _mm_srai_epi32(b1x4a, in_sh); + b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r0x4b = _mm_add_epi32(r0x4b, rndx4); + r0x4b = _mm_srai_epi32(r0x4b, in_sh); + r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r1x4b = _mm_add_epi32(r1x4b, rndx4); + r1x4b = _mm_srai_epi32(r1x4b, in_sh); + r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g0x4b = _mm_add_epi32(g0x4b, rndx4); + g0x4b = _mm_srai_epi32(g0x4b, in_sh); + g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g1x4b = _mm_add_epi32(g1x4b, rndx4); + g1x4b = _mm_srai_epi32(g1x4b, in_sh); + g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b0x4b = _mm_add_epi32(b0x4b, rndx4); + b0x4b = _mm_srai_epi32(b0x4b, in_sh); + b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b1x4b = _mm_add_epi32(b1x4b, rndx4); + b1x4b = _mm_srai_epi32(b1x4b, in_sh); @@ -1330,16 +1339,94 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 8; ip ++) { -+ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); -+#undef AVG -+ } -+ } ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = 
_mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, 21); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, 21); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = 
_mm_unpackhi_epi32(uoax4, voax4); ++ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); + } + } + @@ -1358,17 +1445,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, -+ uint8_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; @@ -1413,17 +1495,27 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; -+ int ip; -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); @@ -1550,16 +1642,100 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 16; ip ++) { -+ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); -+#undef AVG -+ } -+ } ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ 
gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + } + } + @@ -1580,16 +1756,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#endif + +#if ARCH_AARCH64 -+static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, -+ uint8_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, ++static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, + const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; @@ -1605,13 +1776,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; ++// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; ++// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -1639,17 +1810,33 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); + uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); + uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ 
int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; -+ int ip = 0; -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); @@ -1704,16 +1891,107 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 8; ip ++) { -+ dsty[x + ip] = av_clip_uint8(params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + ip] = av_clip_uint8(params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip -1] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + ip] = av_clip_uint8(out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)); -+#undef AVG -+ } -+ } ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ 
y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); + } + } + @@ -1733,16 +2011,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} +#endif + -+static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, -+ uint16_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, ++static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, + const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; @@ -1830,18 +2103,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+#if ARCH_X86 -+X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, -+ uint16_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, -+ const struct TonemapIntParams *params) ++#if ARCH_X86 || ARCH_AARCH64 ++X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int 
srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; @@ -1887,18 +2155,27 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i cyx4 = _mm_set1_epi32(cy); + __m128i rndx4 = _mm_set1_epi32(in_rnd); + __m128i zero128 = _mm_setzero_si128(); -+ ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4, uvox8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; -+ int ip; -+ __m128i uvx8, uvx4a, uvx4b; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + + y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); + y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); @@ -2025,16 +2302,98 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 8; ip ++) { -+ dsty[x + ip] = av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } -+ } ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = 
_mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); ++ ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, out_sh); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, out_sh); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); ++ uvox8 = _mm_slli_epi16(uvox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); + } + } + @@ -2053,17 +2412,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V3 static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, -+ uint16_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int 
*srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; @@ -2108,17 +2462,26 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; -+ int ip; -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); @@ -2245,16 +2608,103 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 16; ip ++) { -+ dsty[x + ip] = av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } -+ } ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, 
_mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = 
_mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } + @@ -2275,16 +2725,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#endif + +#if ARCH_AARCH64 -+static void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, -+ uint16_t *dstuv, -+ const uint16_t *srcy, -+ const uint16_t *srcuv, -+ const int *dstlinesize, -+ const int *srclinesize, -+ int dstdepth, -+ int srcdepth, -+ int width, -+ int height, ++static void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, + const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; @@ -2334,17 +2779,34 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); + uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); + uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ uint16x8_t out_sh2x8 = vdupq_n_u16(out_sh2); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; -+ int ip = 0; -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); @@ -2399,16 +2861,124 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ for (ip = 0; ip < 8; ip ++) { -+ dsty[x + ip] 
= av_clip_uintp2((params->out_yuv_off + ((r[ip] * cry + g[ip] * cgy + b[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + ip] = av_clip_uintp2((params->out_yuv_off + ((r1[ip] * cry + g1[ip] * cgy + b1[ip] * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ if (ip & 1) { // is odd -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x + ip - 1] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cru + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgu + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + ip] = av_clip_uintp2((out_uv_offset + ((AVG(r[ip-1], r[ip], r1[ip-1], r1[ip]) * cburv + AVG(g[ip-1], g[ip], g1[ip-1], g1[ip]) * ocgv + AVG(b[ip-1], b[ip], b1[ip-1], b1[ip]) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, 
vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ ++ switch(out_depth) { ++ default: ++ case 10: ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ uox4 = vshrq_n_s32(uox4, 19); ++ vox4 = vshrq_n_s32(vox4, 19); ++ break; ++ case 16: ++ y0oax4 = vshrq_n_s32(y0oax4, 13); ++ y0obx4 = vshrq_n_s32(y0obx4, 13); ++ y1oax4 = vshrq_n_s32(y1oax4, 13); ++ y1obx4 = vshrq_n_s32(y1obx4, 13); ++ uox4 = vshrq_n_s32(uox4, 13); ++ vox4 = vshrq_n_s32(vox4, 13); ++ break; + } ++ ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); ++ vst1q_u16(&dsty[x], y0ox8); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); + } + } + @@ -2598,13 +3168,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_sse; + s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_sse; + } -+#ifndef OS_WIN + if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_avx; + s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_avx; + } +#endif -+#endif + + if (!s->tonemap_frame_p01x_2_nv12) { + s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12; @@ -2701,3 +3269,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + FILTER_QUERY_FUNC(query_formats), + .flags = AVFILTER_FLAG_SLICE_THREADS, +}; +Index: FFmpeg/builder/variants/defaults-gpl.sh +=================================================================== +--- FFmpeg.orig/builder/variants/defaults-gpl.sh ++++ FFmpeg/builder/variants/defaults-gpl.sh +@@ -6,3 +6,5 @@ GIT_BRANCH="jellyfin" + LICENSE_FILE="COPYING.GPLv3" + + [[ $TARGET == linux* ]] && FF_CONFIGURE+=" --disable-libxcb --disable-xlib" || true ++[[ $TARGET == win* ]] && FF_CFLAGS+=" -Wa,-muse-unaligned-vector-move" || true ++[[ $TARGET == win* ]] && FF_CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" || true From 5e1b964190fdb0662c08e44446acf7f0f1bd0f27 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 01:57:14 +0800 Subject: [PATCH 07/27] diable aligned vector move for windows GCC cannot correctly generate aligned AVX code --- builder/variants/defaults-gpl.sh | 2 ++ debian/patches/0080-add-tonemapx-filter.patch | 10 ---------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/builder/variants/defaults-gpl.sh b/builder/variants/defaults-gpl.sh index e5a6523f669..4591981fe2c 100755 --- a/builder/variants/defaults-gpl.sh +++ b/builder/variants/defaults-gpl.sh @@ -6,3 +6,5 @@ GIT_BRANCH="jellyfin" LICENSE_FILE="COPYING.GPLv3" [[ 
$TARGET == linux* ]] && FF_CONFIGURE+=" --disable-libxcb --disable-xlib" || true +[[ $TARGET == win* ]] && FF_CFLAGS+=" -Wa,-muse-unaligned-vector-move" || true +[[ $TARGET == win* ]] && FF_CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" || true diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index bb0cdc01160..3154217743d 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -3269,13 +3269,3 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + FILTER_QUERY_FUNC(query_formats), + .flags = AVFILTER_FLAG_SLICE_THREADS, +}; -Index: FFmpeg/builder/variants/defaults-gpl.sh -=================================================================== ---- FFmpeg.orig/builder/variants/defaults-gpl.sh -+++ FFmpeg/builder/variants/defaults-gpl.sh -@@ -6,3 +6,5 @@ GIT_BRANCH="jellyfin" - LICENSE_FILE="COPYING.GPLv3" - - [[ $TARGET == linux* ]] && FF_CONFIGURE+=" --disable-libxcb --disable-xlib" || true -+[[ $TARGET == win* ]] && FF_CFLAGS+=" -Wa,-muse-unaligned-vector-move" || true -+[[ $TARGET == win* ]] && FF_CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" || true From 22f52469cae9b13c080efc6fc6ed445c248f713b Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 02:11:51 +0800 Subject: [PATCH 08/27] avfilter/tonemapx: remove debug options --- debian/patches/0080-add-tonemapx-filter.patch | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 3154217743d..21c2f171581 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,3153 @@ +@@ -0,0 +1,3151 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -154,10 +154,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#if ARCH_AARCH64 +# include +# include "libavutil/aarch64/cpu.h" -+#define __builtin_ia32_extract128i256(a, b) _mm_setzero_si128() -+#define __builtin_ia32_permdi256(a, b) _mm256_setzero_si256() +#endif -+#if ARCH_X86 || ARCH_AARCH64 ++#if ARCH_X86 +# include +# include +# include @@ -586,7 +584,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; +} + -+#if ARCH_X86 || ARCH_AARCH64 ++#if ARCH_X86 +X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) +{ + __m128i mask = _mm_set1_epi32(0x7FFF); @@ -1122,7 +1120,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+#if ARCH_X86 || ARCH_AARCH64 ++#if ARCH_X86 +X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) +{ + __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); @@ -2103,7 +2101,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+#if ARCH_X86 || ARCH_AARCH64 ++#if ARCH_X86 +X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, From 844cc28797c8443b0e039bb0ec8df360206b410d Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 02:38:34 +0800 Subject: [PATCH 09/27] avfilter/tonemapx: use correct type for arm neon --- debian/patches/0080-add-tonemapx-filter.patch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 21c2f171581..bcf220ce28c 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -2784,12 +2784,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uint16x8_t r1x8, g1x8, b1x8; + + int16x8_t r0ox8, g0ox8, b0ox8; -+ int16x8_t y0ox8; ++ uint16x8_t y0ox8; + int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; + int32x4_t y0oax4, y0obx4; + + int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; ++ uint16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; + int32x4_t uvoax4, uvobx4; @@ -2797,7 +2797,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ uint16x8_t out_sh2x8 = vdupq_n_u16(out_sh2); ++ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, From 1693fc37b3f3e5220e1ae2b4defae6461e311bc8 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 03:04:39 +0800 Subject: [PATCH 10/27] avfilter/tonemapx: reinterpret zero --- debian/patches/0080-add-tonemapx-filter.patch | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index bcf220ce28c..1e9f4b20853 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -873,17 +873,17 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); + sig8 = vaddq_s16(sig8, input_lut_offset); + sig8 = vminq_s16(sig8, input_upper_bound); -+ sig8 = vmaxq_s16(sig8, zerox4); ++ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); + + r = vaddq_s16(r, input_lut_offset); + r = vminq_s16(r, input_upper_bound); -+ r = 
vmaxq_s16(r, zerox4); ++ r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); + g = vaddq_s16(g, input_lut_offset); + g = vminq_s16(g, input_upper_bound); -+ g = vmaxq_s16(g, zerox4); ++ g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); + b = vaddq_s16(b, input_lut_offset); + b = vminq_s16(b, input_upper_bound); -+ b = vmaxq_s16(b, zerox4); ++ b = vmaxq_s16(b, vreinterpretq_s16_s32(zerox4)); + + // Cannot use loop here as the lane has to be compile-time constant +#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ From 4e9d2f49e1534bafa8decc29df5b44855706dd37 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sat, 29 Jun 2024 04:57:18 +0800 Subject: [PATCH 11/27] avfilter/tonemapx: fix typo --- debian/patches/0080-add-tonemapx-filter.patch | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 1e9f4b20853..7d11ce8c659 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -251,12 +251,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + AVFrame *in, *out; + const AVPixFmtDescriptor *desc, *odesc; + double peak; -+ void (*tonemap_fuc) (void *dsty, void *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ void (*tonemap_func) (void *dsty, void *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); +} ThreadData; + +static const enum AVPixelFormat in_pix_fmts[] = { @@ -3027,14 +3027,14 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + av_log(s, AV_LOG_DEBUG, "dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ td->tonemap_fuc(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ td->tonemap_func(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + + return 0; +} @@ -3127,7 +3127,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + td.desc = desc; + td.odesc = odesc; + td.peak = peak; -+ td.tonemap_fuc = odesc->comp[0].depth == 8 ? s->tonemap_frame_p01x_2_nv12 : s->tonemap_frame_p01x_2_p01x; ++ td.tonemap_func = odesc->comp[0].depth == 8 ? 
s->tonemap_frame_p01x_2_nv12 : s->tonemap_frame_p01x_2_p01x; + ctx->internal->execute(ctx, filter_slice, &td, NULL, + FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); + From 158016b0560c78646d9ce43c89b141b903a9a6c5 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 02:49:43 +0800 Subject: [PATCH 12/27] avfilter/tonemapx: add yuv420p support --- debian/patches/0080-add-tonemapx-filter.patch | 3056 ++++++++++++++--- 1 file changed, 2526 insertions(+), 530 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 7d11ce8c659..ccb4b2bfb63 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,3151 @@ +@@ -0,0 +1,5147 @@ +/* + * This file is part of FFmpeg. + * @@ -233,39 +233,53 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; + DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; + -+ void (*tonemap_frame_p01x_2_nv12) (uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ void (*tonemap_frame_p01x_2_p01x) (uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); ++ ++ void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++ void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++ void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++ void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); +} TonemapxContext; + +typedef struct ThreadData { + AVFrame *in, *out; + const AVPixFmtDescriptor *desc, *odesc; + double peak; -+ void (*tonemap_func) (void *dsty, void *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); +} ThreadData; + +static const enum AVPixelFormat in_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P10, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, + AV_PIX_FMT_NONE, +}; + +static const enum AVPixelFormat 
out_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_YUV420P10, + AV_PIX_FMT_NV12, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, @@ -281,37 +295,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return 0; +} + -+static int query_formats(AVFilterContext *ctx) -+{ -+ TonemapxContext *s = ctx->priv; -+ AVFilterFormats *formats = ff_make_format_list(in_pix_fmts); -+ int res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ -+ if (!strcmp(s->format_str, "same")) { -+ s->format = AV_PIX_FMT_NONE; -+ } else { -+ s->format = av_get_pix_fmt(s->format_str); -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ if (out_format_is_supported(s->format)) { -+ formats = NULL; -+ res = ff_add_format(&formats, s->format); -+ if (res < 0) -+ return res; -+ } else { -+ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", -+ av_get_pix_fmt_name(s->format)); -+ return AVERROR(ENOSYS); -+ } -+ } -+ -+ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); -+} -+ +static float hable(float in) +{ + float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; @@ -1120,6 +1103,96 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + ++static void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * 
v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} ++ +#if ARCH_X86 +X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) +{ @@ -1147,17 +1220,21 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); +} + -+X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 static void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ + int rheight = height; + // not zero when not 
divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -1167,13 +1244,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -1198,7 +1273,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i cyx4 = _mm_set1_epi32(cy); + __m128i rndx4 = _mm_set1_epi32(in_rnd); + __m128i zero128 = _mm_setzero_si128(); -+ __m128i uvx8, uvx4a, uvx4b; ++ __m128i ux4, vx4; + __m128i y0x8, y1x8; + __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; @@ -1212,42 +1287,36 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i r1ox8, g1ox8, b1ox8; + __m128i y1ox8; + __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); + y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = _mm_srli_epi16(y0x8, 6); -+ y1x8 = _mm_srli_epi16(y1x8, 6); -+ uvx8 = _mm_srli_epi16(uvx8, 6); -+ } + y0x4a = _mm_cvtepu16_epi32(y0x8); + y0x4b = _mm_unpackhi_epi16(y0x8, zero128); + y1x4a = _mm_cvtepu16_epi32(y1x8); + y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ uvx4a = _mm_cvtepu16_epi32(uvx8); -+ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); + y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); + y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); + y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); + y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); -+ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); ++ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); ++ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); + -+ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); @@ -1410,21 +1479,19 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); + bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ 
uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uoax4 = _mm_srai_epi32(uoax4, 21); -+ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); -+ -+ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ voax4 = _mm_srai_epi32(voax4, 21); -+ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); -+ -+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); -+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); -+ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); + } + } + @@ -1432,28 +1499,32 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if (remainw) { + int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} + -+X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 static void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; + // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd @@ -1463,13 +1534,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const 
int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -1493,7 +1562,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i cyx8 = _mm256_set1_epi32(cy); + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m256i uvx16, uvx8a, uvx8b; ++ __m256i ux8, vx8; ++ __m256i uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; @@ -1510,39 +1580,32 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 4; xx++) { + int x = xx << 4; + + y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); + y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); -+ -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); -+ } ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + + y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); + y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); + y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); + y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ + y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); + y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); + y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); + y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); + r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); @@ -1723,17 +1786,20 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); + uox8 = _mm256_srai_epi32(uox8, out_sh); + uox8 = _mm256_add_epi32(uox8, 
_mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); + + vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); + vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); + vox8 = _mm256_srai_epi32(vox8, out_sh); + vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + @@ -1741,25 +1807,25 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if (remainw) { + int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} -+#endif + -+#if ARCH_AARCH64 -+static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; @@ -1799,234 +1865,1573 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ int16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ 
__m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ + if (in_depth == 10) { + // shift to low10bits for 10bit input + // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = _mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); + } -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ 
// r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); ++ ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); ++ ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); ++ ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); ++ ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); ++ ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); ++ ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); ++ ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); ++ ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); ++ ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); ++ ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); 
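++            // Each tonemap_int32x4_sse call maps one 128-bit lane of four pixels:
++            // the pair of calls above covers pixels 0..3 of the top and bottom rows,
++            // and the pair below covers pixels 4..7, so the int16 results land in
++            // r/g/b (row 0) and r1/g1/b1 (row 1) for the RGB-to-YUV pass that follows.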
++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ 
bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, 21); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, 21); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++ ++X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 16 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++// const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++// const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; 
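++    // Register naming: the 'a'/'b' suffix marks the low/high 128-bit half of a
++    // 16-pixel row once the 16-bit samples are widened to eight int32 lanes per
++    // 256-bit register, while the '0'/'1' digit distinguishes the even and odd
++    // source rows processed together for 4:2:0 chroma averaging.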
++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ 
b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = 
_mm256_srai_epi32(gavgx8, 2);
++
++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8);
++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8));
++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0));
++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2));
++ bavgx8 = _mm256_srai_epi32(bavgx8, 2);
++
++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru)));
++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu)));
++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv)));
++ uox8 = _mm256_srai_epi32(uox8, out_sh);
++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset));
++
++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv)));
++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv)));
++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv)));
++ vox8 = _mm256_srai_epi32(vox8, out_sh);
++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset));
++
++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8);
++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8);
++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8);
++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0))));
++ }
++ }
++
++ // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++ if (remainw) {
++ int offset = width & (int)0xfffffff0;
++ rdsty += offset;
++ rdstuv += offset;
++ rsrcy += offset;
++ rsrcuv += offset;
++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv,
++ rsrcy, rsrcuv,
++ dstlinesize, srclinesize,
++ dstdepth, srcdepth,
++ remainw, rheight, params);
++ }
++}
++#endif
++
++#if ARCH_AARCH64
++static void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv,
++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv,
++ const int *dstlinesize, const int *srclinesize,
++ int dstdepth, int srcdepth,
++ int width, int height,
++ const struct TonemapIntParams *params)
++{
++ uint8_t *rdsty = dsty;
++ uint8_t *rdstu = dstu;
++ uint8_t *rdstv = dstv;
++ const uint16_t *rsrcy = srcy;
++ const uint16_t *rsrcu = srcu;
++ const uint16_t *rsrcv = srcv;
++ int rheight = height;
++ // non-zero when the width is not divisible by 8
++ // intentionally leave the last pixel empty when the width is odd
++ int remainw = width & 6;
++
++ const int in_depth = srcdepth;
++ const int in_uv_offset = 128 << (in_depth - 8);
++ const int in_sh = in_depth - 1;
++ const int in_rnd = 1 << (in_sh - 1);
++
++ const int out_depth = dstdepth;
++ const int out_uv_offset = 128 << (out_depth - 8);
++ const int out_sh = 29 - out_depth;
++ const int out_rnd = 1 << (out_sh - 1);
++
++ int cy = (*params->yuv2rgb_coeffs)[0][0][0];
++ int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++ int cry = (*params->rgb2yuv_coeffs)[0][0][0];
++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0];
++ int cby = (*params->rgb2yuv_coeffs)[0][2][0];
++ int cru = (*params->rgb2yuv_coeffs)[1][0][0];
++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0];
++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0];
++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0];
++
++ int16_t r[8], g[8], b[8];
++ int16_t r1[8], g1[8], b1[8];
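++ // Note: the fixed-point YUV->RGB coefficients below are pre-shifted by in_sh and
++ // clipped to int16 so the per-pixel math in the main loop stays in 16-bit NEON lanes.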
++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ uint16x4_t ux4, vx4; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); ++ ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); ++ ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); ++ ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); ++ ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); ++ ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); ++ ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = 
vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_lane_u32(&dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 
21);
++ vox4 = vaddq_s32(vox4, out_uv_offsetx4);
++ vst1_lane_u32(&dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0);
++ }
++ }
++
++ // Process the remaining pixels that cannot fill a full SIMD register with the scalar version
++ if (remainw) {
++ int offset = width & (int)0xfffffff8;
++ rdsty += offset;
++ rdstu += offset >> 1;
++ rdstv += offset >> 1;
++ rsrcy += offset;
++ rsrcu += offset >> 1;
++ rsrcv += offset >> 1;
++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv,
++ rsrcy, rsrcu, rsrcv,
++ dstlinesize, srclinesize,
++ dstdepth, srcdepth,
++ remainw, rheight, params);
++ }
++}
++
++static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv,
++ const uint16_t *srcy, const uint16_t *srcuv,
++ const int *dstlinesize, const int *srclinesize,
++ int dstdepth, int srcdepth,
++ int width, int height,
++ const struct TonemapIntParams *params)
++{
++ uint8_t *rdsty = dsty;
++ uint8_t *rdstuv = dstuv;
++ const uint16_t *rsrcy = srcy;
++ const uint16_t *rsrcuv = srcuv;
++ int rheight = height;
++ // non-zero when the width is not divisible by 8
++ // intentionally leave the last pixel empty when the width is odd
++ int remainw = width & 6;
++
++ const int in_depth = srcdepth;
++ const int in_uv_offset = 128 << (in_depth - 8);
++ const int in_sh = in_depth - 1;
++ const int in_rnd = 1 << (in_sh - 1);
++// const int in_sh2 = 16 - in_depth;
++
++ const int out_depth = dstdepth;
++ const int out_uv_offset = 128 << (out_depth - 8);
++ const int out_sh = 29 - out_depth;
++ const int out_rnd = 1 << (out_sh - 1);
++// const int out_sh2 = 16 - out_depth;
++
++ int cy = (*params->yuv2rgb_coeffs)[0][0][0];
++ int crv = (*params->yuv2rgb_coeffs)[0][2][0];
++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0];
++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0];
++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0];
++
++ int cry = (*params->rgb2yuv_coeffs)[0][0][0];
++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0];
++ int cby = (*params->rgb2yuv_coeffs)[0][2][0];
++ int cru = (*params->rgb2yuv_coeffs)[1][0][0];
++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0];
++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0];
++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0];
++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0];
++
++ int16_t r[8], g[8], b[8];
++ int16_t r1[8], g1[8], b1[8];
++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh);
++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh);
++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh);
++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh);
++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh);
++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh);
++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted);
++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off));
++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset));
++ uint16x8_t uvx8;
++ uint16x4_t ux2a, vx2a, ux2b, vx2b;
++ uint16x8_t y0x8, y1x8, ux8, vx8;
++ uint16x8_t r0x8, g0x8, b0x8;
++ uint16x8_t r1x8, g1x8, b1x8;
++
++ int16x8_t r0ox8, g0ox8, b0ox8;
++ int16x8_t y0ox8;
++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4;
++ int32x4_t y0oax4, y0obx4;
++
++ int16x8_t r1ox8, g1ox8, b1ox8;
++ int16x8_t y1ox8;
++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4;
++ int32x4_t y1oax4, y1obx4;
++ int32x4_t uvoax4, uvobx4;
++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2;
++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4;
++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off);
++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd);
++
int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); ++ ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); ++ ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); ++ ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); ++ ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); ++ ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); ++ ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); ++ ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = 
vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++static void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ 
int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = 
av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); ++#undef AVG ++ } ++ } ++} ++ ++static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u 
+ in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++} ++ ++#if ARCH_X86 ++X86_64_V2 static void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = 
(*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4, uvox8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); ++ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); ++ ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); 
+ -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = 
vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 21); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 21); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); -+ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 
= vshrq_n_s32(y1oax4, 21); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 21); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); -+ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 21); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = 
vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 21); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, out_sh); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); ++ ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, out_sh); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstuv += offset; -+ rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} -+#endif + -+static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 static void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ + const int in_depth = srcdepth; + const int 
in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -2043,65 +3448,267 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[4], g[4], b[4]; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + 
cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, 
_mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = 
_mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ 
dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } +} + -+#if ARCH_X86 +X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, @@ -2589,140 +4196,381 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + b1x8b = _mm256_srai_epi32(b1x8b, in_sh); + b1x8b = av_clip_int16_avx(b1x8b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, 
params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ 
y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++#endif ++ ++#if ARCH_AARCH64 ++static void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t 
*srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x4_t ux4, vx4; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; ++ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); 
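++ // Rough scalar equivalent of the NEON block below (a sketch, not the literal math):
++ //   r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh);
++ // Here cy_shifted, crv_shifted, etc. and rnd_shifted are pre-shifted by in_sh above,
++ // so the products are accumulated directly in uint16_t lanes with no final shift.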
++ ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); ++ ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); ++ ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); ++ ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); ++ ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); ++ ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = 
vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 
= _mm256_srai_epi32(gavgx8, 2); ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 19); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += 
offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} -+#endif + -+#if ARCH_AARCH64 +static void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, @@ -2996,45 +4844,99 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} +#endif + -+static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ ++ThreadData *td = arg; \ ++AVFrame *in = td->in; \ ++AVFrame *out = td->out; \ ++const AVPixFmtDescriptor *desc = td->desc; \ ++const AVPixFmtDescriptor *odesc = td->odesc; \ ++const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ ++const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ ++const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ ++TonemapIntParams params = { \ ++.lut_peak = s->lut_peak, \ ++.lin_lut = s->lin_lut, \ ++.tonemap_lut = s->tonemap_lut, \ ++.delin_lut = s->delin_lut, \ ++.in_yuv_off = s->in_yuv_off, \ ++.out_yuv_off = s->out_yuv_off, \ ++.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ ++.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ ++.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ ++.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ ++.coeffs = s->coeffs, \ ++.ocoeffs = s->ocoeffs, \ ++.desat = s->desat, \ ++}; ++ ++static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +{ -+ TonemapxContext *s = ctx->priv; -+ ThreadData *td = arg; -+ AVFrame *in = td->in; -+ AVFrame *out = td->out; -+ const AVPixFmtDescriptor *desc = td->desc; -+ const AVPixFmtDescriptor *odesc = td->odesc; -+ const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); -+ const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; -+ const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; -+ int y, x; -+ -+ TonemapIntParams params = { -+ .lut_peak = s->lut_peak, -+ .lin_lut = s->lin_lut, -+ .tonemap_lut = s->tonemap_lut, -+ .delin_lut = s->delin_lut, -+ .in_yuv_off = s->in_yuv_off, -+ .out_yuv_off = s->out_yuv_off, -+ .yuv2rgb_coeffs = &s->yuv2rgb_coeffs, -+ .rgb2yuv_coeffs = &s->rgb2yuv_coeffs, -+ .rgb2rgb_coeffs = &s->rgb2rgb_coeffs, -+ .rgb2rgb_passthrough = in->color_primaries == out->color_primaries, -+ .coeffs = s->coeffs, -+ .ocoeffs = s->ocoeffs, -+ .desat = s->desat, -+ }; -+ -+ av_log(s, AV_LOG_DEBUG, "dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); -+ -+ td->tonemap_func(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + 
out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_planar10(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); ++ ++ return 0; ++} ++ ++static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ ++ s->tonemap_func_biplanar10(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + + return 0; +} @@ -3059,6 +4961,24 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return AVERROR_BUG; + } + ++ switch (odesc->comp[2].plane) { ++ case 1: // biplanar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_biplanar8; ++ } else { ++ s->filter_slice = filter_slice_biplanar10; ++ } ++ break; ++ default: ++ case 2: // planar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_planar8; ++ } else { ++ s->filter_slice = filter_slice_planar10; ++ } ++ break; ++ } ++ + out = ff_get_video_buffer(outlink, outlink->w, outlink->h); + if (!out) { + av_frame_free(&in); @@ -3127,8 +5047,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + td.desc = desc; + td.odesc = odesc; + td.peak = peak; -+ 
td.tonemap_func = odesc->comp[0].depth == 8 ? s->tonemap_frame_p01x_2_nv12 : s->tonemap_frame_p01x_2_p01x; -+ ctx->internal->execute(ctx, filter_slice, &td, NULL, ++ ctx->internal->execute(ctx, s->filter_slice, &td, NULL, + FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); + + av_frame_free(&in); @@ -3151,33 +5070,110 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + av_freep(&s->tonemap_lut); +} + ++static int query_formats(AVFilterContext *ctx) ++{ ++ enum AVPixelFormat valid_in_pix_fmts[4]; ++ AVFilterFormats *formats; ++ const AVPixFmtDescriptor *desc; ++ TonemapxContext *s = ctx->priv; ++ ++ if (!strcmp(s->format_str, "same")) { ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ s->format = AV_PIX_FMT_NONE; ++ } else { ++ int i, j = 0; ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ s->format = av_get_pix_fmt(s->format_str); ++ // Check again in case of the string is invalid ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ desc = av_pix_fmt_desc_get(s->format); ++ // Filter out the input formats for requested output formats ++ // The input and output must have the same planar format, either planar or bi-planar packed ++ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { ++ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); ++ if (tdesc->comp[2].plane == desc->comp[2].plane) { ++ valid_in_pix_fmts[j] = in_pix_fmts[i]; ++ j++; ++ } ++ } ++ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; ++ formats = ff_make_format_list(valid_in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (out_format_is_supported(s->format)) { ++ formats = NULL; ++ res = ff_add_format(&formats, s->format); ++ if (res < 0) ++ return res; ++ } else { ++ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", ++ av_get_pix_fmt_name(s->format)); ++ return AVERROR(ENOSYS); ++ } ++ } ++ ++ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++} ++ +static av_cold int init(AVFilterContext *ctx) +{ + TonemapxContext *s = ctx->priv; + int cpu_flags = av_get_cpu_flags(); ++ av_log(ctx, AV_LOG_DEBUG, "Requested output format: %s\n", ++ s->format_str); + +#if ARCH_AARCH64 + if (have_neon(cpu_flags)) { -+ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_neon; -+ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_neon; ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; + } +#elif ARCH_X86 + if (X86_SSE42(cpu_flags)) { -+ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_sse; -+ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; ++ 
s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; + } + if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { -+ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12_avx; -+ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010_avx; ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; + } +#endif + -+ if (!s->tonemap_frame_p01x_2_nv12) { -+ s->tonemap_frame_p01x_2_nv12 = tonemap_frame_p016_p010_2_nv12; ++ if (!s->tonemap_func_biplanar8) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++ } ++ ++ if (!s->tonemap_func_biplanar10) { ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; ++ } ++ ++ if (!s->tonemap_func_planar8) { ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; + } + -+ if (!s->tonemap_frame_p01x_2_p01x) { -+ s->tonemap_frame_p01x_2_p01x = tonemap_frame_p016_p010_2_p016_p010; ++ if (!s->tonemap_func_planar10) { ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; + } + + switch(s->tonemap) { From c9229cab87b15131f967fa3dba45ac94a2b2719b Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 10:01:41 +0800 Subject: [PATCH 13/27] avfilter/tonemapx: add workaround for gcc <= 10 --- debian/patches/0080-add-tonemapx-filter.patch | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index ccb4b2bfb63..ebefe12823c 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -568,6 +568,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +#if ARCH_X86 ++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction ++// cast the register into float register and store with movss as a workaround ++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) ++__attribute__((always_inline)) ++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { ++ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); ++ return; ++} ++#endif ++ +X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) +{ + __m128i mask = _mm_set1_epi32(0x7FFF); @@ -592,16 +602,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i sig4; + __m128 mapvalx4, r_linx4, g_linx4, b_linx4; + __m128 offset = _mm_set1_ps(0.5f); -+ __m128i zerox4 = _mm_setzero_si128(); + __m128i input_lut_offset = _mm_set1_epi32(2048); + __m128i upper_bound = _mm_set1_epi32(32767); + __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); + __m128i r, g, b, rx4, gx4, bx4; + + float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; -+ int r4[4], g4[4], b4[4], s4[4]; -+ int lr[4], lg[4], lb[4]; -+ int i; + + sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); + sig4 = _mm_add_epi32(sig4, input_lut_offset); @@ -714,7 +720,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i r, g, b, rx8, gx8, bx8; + + float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; -+ int i; + + sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); + sig8 = _mm256_add_epi32(sig8, input_lut_offset); @@ -1028,7 +1033,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; + + int cy = 
(*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -1563,7 +1567,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i rndx8 = _mm256_set1_epi32(in_rnd); + + __m256i ux8, vx8; -+ __m256i uvx8a, uvx8b; + __m256i y0x16, y1x16; + __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; + __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; @@ -1577,7 +1580,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m256i r1ox16, g1ox16, b1ox16; + __m256i y1ox16; + __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i y1oax8, y1obx8; + __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], @@ -3128,7 +3131,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); @@ -3158,7 +3160,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i cyx4 = _mm_set1_epi32(cy); + __m128i rndx4 = _mm_set1_epi32(in_rnd); + __m128i zero128 = _mm_setzero_si128(); -+ __m128i uvx8, ux4, vx4; ++ __m128i ux4, vx4; + __m128i y0x8, y1x8; + __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; + __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; @@ -3172,8 +3174,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i r1ox8, g1ox8, b1ox8; + __m128i y1ox8; + __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4, uvox8; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { @@ -3729,7 +3731,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); @@ -4037,7 +4038,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); From 9872840d7809ee520934e24631ce6a37a2b5eaaa Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 15:30:13 +0800 Subject: [PATCH 14/27] avfilter/tonemapx: split platform-specific code --- debian/patches/0080-add-tonemapx-filter.patch | 6806 +++++++++-------- 1 file changed, 3571 insertions(+), 3235 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index ebefe12823c..8a7e6674c4e 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,5147 @@ +@@ -0,0 +1,1203 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -140,46 +140,29 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + */ + +#include -+#include +#include + +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" -+#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" +#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" +#include "libavutil/cpu.h" +#if ARCH_AARCH64 -+# include +# include "libavutil/aarch64/cpu.h" ++# include "aarch64/vf_tonemapx_intrin_neon.h" +#endif +#if ARCH_X86 -+# include -+# include -+# include +# include "libavutil/x86/cpu.h" ++# include "x86/vf_tonemapx_intrin_sse.h" ++# include "x86/vf_tonemapx_intrin_avx.h" +#endif + +#include "avfilter.h" -+#include "colorspace.h" +#include "formats.h" +#include "internal.h" +#include "video.h" -+ -+#define REFERENCE_WHITE 203.0f -+#define FLOAT_EPS 1.175494351e-38f -+ -+#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 11) || (__clang_major__ >= 12) -+# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) -+# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) -+# else -+# define X86_64_V2 __attribute__((target("sse4.2"))) -+# define X86_64_V3 __attribute__((target("avx2,fma"))) -+# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) -+#endif // defined(__GNUC__) || defined(__clang__) ++#include "vf_tonemapx.h" + +enum TonemapAlgorithm { + TONEMAP_NONE, @@ -193,20 +176,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + TONEMAP_MAX, +}; + -+typedef struct TonemapIntParams { -+ double lut_peak; -+ float *lin_lut; -+ float *tonemap_lut; -+ uint16_t *delin_lut; -+ int in_yuv_off, out_yuv_off; -+ int16_t (*yuv2rgb_coeffs)[3][3][8]; -+ int16_t (*rgb2yuv_coeffs)[3][3][8]; -+ double (*rgb2rgb_coeffs)[3][3]; -+ int rgb2rgb_passthrough; -+ const AVLumaCoefficients *coeffs, *ocoeffs; -+ double desat; -+} TonemapIntParams; -+ +typedef struct TonemapxContext { + const AVClass *class; + @@ -567,509 +536,333 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; +} + -+#if ARCH_X86 -+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction -+// cast the register into float register and store with movss as a workaround -+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) -+__attribute__((always_inline)) -+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { -+ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); -+ return; -+} -+#endif -+ -+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) ++// See also libavfilter/colorspacedsp_template.c ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ __m128i mask = _mm_set1_epi32(0x7FFF); -+ __m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); -+ -+ __m128i zero = _mm_setzero_si128(); -+ __m128i cmp = _mm_cmpeq_epi32(condition, zero); -+ -+ __m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); -+ __m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; + -+ return result; -+} -+X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, 
__m128i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ __m128i sig4; -+ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; -+ __m128 offset = _mm_set1_ps(0.5f); -+ __m128i input_lut_offset = _mm_set1_epi32(2048); -+ __m128i upper_bound = _mm_set1_epi32(32767); -+ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); -+ __m128i r, g, b, rx4, gx4, bx4; ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); -+ sig4 = _mm_add_epi32(sig4, input_lut_offset); -+ sig4 = av_clip_uint16_sse(sig4); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ r = _mm_add_epi32(r_in, input_lut_offset); -+ r = av_clip_uint16_sse(r); -+ g = _mm_add_epi32(g_in, input_lut_offset); -+ g = av_clip_uint16_sse(g); -+ b = _mm_add_epi32(b_in, input_lut_offset); -+ b = av_clip_uint16_sse(b); ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ -+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ -+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ -+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+#undef LOAD_LUT ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = 
av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ mapvalx4 = _mm_loadu_ps(mapval4); -+ r_linx4 = _mm_loadu_ps(r_lin4); -+ g_linx4 = _mm_loadu_ps(g_lin4); -+ b_linx4 = _mm_loadu_ps(b_lin4); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ if (!rgb2rgb_passthrough) { -+ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; + -+ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); + -+ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } + } ++} + -+ if (desat > 0) { -+ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); -+ __m128 desat4 = _mm_set1_ps((float)desat); -+ __m128 luma4 = _mm_set1_ps(0); -+ __m128 overbright4; ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 
1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); -+ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); -+ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); -+ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); -+ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); -+ } ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); -+ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); -+ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); -+ r_linx4 = _mm_add_ps(r_linx4, offset); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); -+ g_linx4 = _mm_add_ps(g_linx4, offset); ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; + -+ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); -+ b_linx4 = _mm_add_ps(b_linx4, offset); ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = av_clip_uint16_sse(rx4); -+ gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = av_clip_uint16_sse(gx4); -+ bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = av_clip_uint16_sse(bx4); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ -+g_out[i] = 
delin_lut[_mm_extract_epi32(gx4, i)]; \ -+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+#undef SAVE_COLOR ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } +} + -+X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ __m256i sig8; -+ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; -+ __m256 offset = _mm256_set1_ps(0.5f); -+ __m256i zerox8 = _mm256_setzero_si256(); -+ __m256i input_lut_offset = _mm256_set1_epi32(2048); -+ __m256i upper_bound = _mm256_set1_epi32(32767); -+ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); -+ __m256i r, g, b, rx8, gx8, bx8; -+ -+ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << 
(in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); -+ sig8 = _mm256_add_epi32(sig8, input_lut_offset); -+ sig8 = _mm256_min_epi32(sig8, upper_bound); -+ sig8 = _mm256_max_epi32(sig8, zerox8); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ r = _mm256_add_epi32(r_in, input_lut_offset); -+ r = _mm256_min_epi32(r, upper_bound); -+ r = _mm256_max_epi32(r, zerox8); -+ g = _mm256_add_epi32(g_in, input_lut_offset); -+ g = _mm256_min_epi32(g, upper_bound); -+ g = _mm256_max_epi32(g, zerox8); -+ b = _mm256_add_epi32(b_in, input_lut_offset); -+ b = _mm256_min_epi32(b, upper_bound); -+ b = _mm256_max_epi32(b, zerox8); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ -+r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ -+g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ -+b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) -+ LOAD_LUT(4) -+ LOAD_LUT(5) -+ LOAD_LUT(6) -+ LOAD_LUT(7) ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; + -+#undef LOAD_LUT ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ mapvalx8 = _mm256_loadu_ps(mapval8); -+ r_linx8 = _mm256_loadu_ps(r_lin8); -+ g_linx8 = _mm256_loadu_ps(g_lin8); -+ b_linx8 = _mm256_loadu_ps(b_lin8); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+ if (!rgb2rgb_passthrough) { -+ r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); -+ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = 
av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); -+ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); -+ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); -+ } ++ int r00 = r[0], g00 = g[0], b00 = b[0]; ++ int r01 = r[1], g01 = g[1], b01 = b[1]; ++ int r10 = r[2], g10 = g[2], b10 = b[2]; ++ int r11 = r[3], g11 = g[3], b11 = b[3]; + -+ if (desat > 0) { -+ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); -+ __m256 desat8 = _mm256_set1_ps((float)desat); -+ __m256 luma8 = _mm256_set1_ps(0); -+ __m256 overbright8; ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); + -+ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); -+ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8); -+ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); -+ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); -+ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); -+ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); -+ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); -+ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); -+ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); -+ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + 
AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); ++#undef AVG ++ } + } ++} + -+ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); -+ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); -+ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); -+ -+ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); -+ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); -+ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; + -+ rx8 = _mm256_cvttps_epi32(r_linx8); -+ rx8 = _mm256_min_epi32(rx8, upper_bound); -+ rx8 = _mm256_max_epi32(rx8, zerox8); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + -+ gx8 = _mm256_cvttps_epi32(g_linx8); -+ gx8 = _mm256_min_epi32(gx8, upper_bound); -+ gx8 = _mm256_max_epi32(gx8, zerox8); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ bx8 = _mm256_cvttps_epi32(b_linx8); -+ bx8 = _mm256_min_epi32(bx8, upper_bound); -+ bx8 = _mm256_max_epi32(bx8, zerox8); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ -+g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ -+b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; + -+ SAVE_COLOR(0) -+ SAVE_COLOR(1) -+ SAVE_COLOR(2) -+ SAVE_COLOR(3) -+ SAVE_COLOR(4) -+ SAVE_COLOR(5) -+ SAVE_COLOR(6) -+ SAVE_COLOR(7) ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+#undef SAVE_COLOR -+} -+#endif -+ -+#if ARCH_AARCH64 -+static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float 
*tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ int16x8_t sig8; -+ float32x4_t mapvalx4a; -+ float32x4_t mapvalx4b; -+ float32x4_t r_linx4a; -+ float32x4_t r_linx4b; -+ float32x4_t g_linx4a; -+ float32x4_t g_linx4b; -+ float32x4_t b_linx4a; -+ float32x4_t b_linx4b; -+ float32x4_t offset = vdupq_n_f32(0.5f); -+ int32x4_t output_upper_bound = vdupq_n_s32(32767); -+ int32x4_t zerox4 = vdupq_n_s32(0); -+ int16x8_t input_lut_offset = vdupq_n_s16(2048); -+ int16x8_t input_upper_bound = vdupq_n_s16(32767); -+ int16x8_t r, g, b; -+ int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; -+ -+ float mapval4a[4], mapval4b[4], r_lin4a[4], r_lin4b[4], g_lin4a[4], g_lin4b[4], b_lin4a[4], b_lin4b[4]; -+ -+ r = vreinterpretq_s16_u16(r_in); -+ g = vreinterpretq_s16_u16(g_in); -+ b = vreinterpretq_s16_u16(b_in); -+ -+ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); -+ sig8 = vaddq_s16(sig8, input_lut_offset); -+ sig8 = vminq_s16(sig8, input_upper_bound); -+ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); -+ -+ r = vaddq_s16(r, input_lut_offset); -+ r = vminq_s16(r, input_upper_bound); -+ r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); -+ g = vaddq_s16(g, input_lut_offset); -+ g = vminq_s16(g, input_upper_bound); -+ g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); -+ b = vaddq_s16(b, input_lut_offset); -+ b = vminq_s16(b, input_upper_bound); -+ b = vmaxq_s16(b, vreinterpretq_s16_s32(zerox4)); -+ -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ -+mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ -+r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ -+r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ -+g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ -+g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ -+b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ -+b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; -+ -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) -+ -+#undef LOAD_LUT -+ -+ mapvalx4a = vld1q_f32(mapval4a); -+ mapvalx4b = vld1q_f32(mapval4b); -+ r_linx4a = vld1q_f32(r_lin4a); -+ r_linx4b = vld1q_f32(r_lin4b); -+ g_linx4a = vld1q_f32(g_lin4a); -+ g_linx4b = vld1q_f32(g_lin4b); -+ b_linx4a = vld1q_f32(b_lin4a); -+ b_linx4b = vld1q_f32(b_lin4b); -+ -+ if (!rgb2rgb_passthrough) { -+ r_linx4a = vmulq_n_f32(r_linx4a, (float)(*rgb2rgb)[0][0]); -+ r_linx4a = vfmaq_n_f32(r_linx4a, g_linx4a, (float)(*rgb2rgb)[0][1]); -+ r_linx4a = vfmaq_n_f32(r_linx4a, b_linx4a, (float)(*rgb2rgb)[0][2]); -+ r_linx4b = vmulq_n_f32(r_linx4b, (float)(*rgb2rgb)[0][0]); -+ r_linx4b = vfmaq_n_f32(r_linx4b, g_linx4b, (float)(*rgb2rgb)[0][1]); -+ r_linx4b = vfmaq_n_f32(r_linx4b, b_linx4b, (float)(*rgb2rgb)[0][2]); -+ -+ g_linx4a = vmulq_n_f32(g_linx4a, (float)(*rgb2rgb)[1][1]); -+ g_linx4a = vfmaq_n_f32(g_linx4a, r_linx4a, (float)(*rgb2rgb)[1][0]); -+ g_linx4a = vfmaq_n_f32(g_linx4a, b_linx4a, (float)(*rgb2rgb)[1][2]); -+ g_linx4b = vmulq_n_f32(g_linx4b, (float)(*rgb2rgb)[1][1]); -+ g_linx4b = vfmaq_n_f32(g_linx4b, r_linx4b, (float)(*rgb2rgb)[1][0]); -+ g_linx4b = vfmaq_n_f32(g_linx4b, b_linx4b, (float)(*rgb2rgb)[1][2]); -+ -+ b_linx4a = vmulq_n_f32(b_linx4a, (float)(*rgb2rgb)[2][2]); -+ b_linx4a = vfmaq_n_f32(b_linx4a, r_linx4a, (float)(*rgb2rgb)[2][0]); -+ b_linx4a = vfmaq_n_f32(b_linx4a, g_linx4a, 
(float)(*rgb2rgb)[2][1]); -+ b_linx4b = vmulq_n_f32(b_linx4b, (float)(*rgb2rgb)[2][2]); -+ b_linx4b = vfmaq_n_f32(b_linx4b, r_linx4b, (float)(*rgb2rgb)[2][0]); -+ b_linx4b = vfmaq_n_f32(b_linx4b, g_linx4b, (float)(*rgb2rgb)[2][1]); -+ } -+ -+ if (desat > 0) { -+ float32x4_t eps_x4 = vdupq_n_f32(FLOAT_EPS); -+ float32x4_t desat4 = vdupq_n_f32((float)desat); -+ float32x4_t luma4 = vdupq_n_f32(0); -+ float32x4_t overbright4; -+ // Group A -+ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr)); -+ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg)); -+ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb)); -+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); -+ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4); -+ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4); -+ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4); -+ g_linx4a = vmlaq_f32(g_linx4a, luma4, overbright4); -+ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4); -+ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4); -+ // Group B -+ luma4 = vdupq_n_f32(0); -+ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr)); -+ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg)); -+ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb)); -+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); -+ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4); -+ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4); -+ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4); -+ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4); -+ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4); -+ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4); -+ } -+ -+ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a); -+ g_linx4a = vmulq_f32(g_linx4a, mapvalx4a); -+ b_linx4a = vmulq_f32(b_linx4a, mapvalx4a); -+ -+ r_linx4b = vmulq_f32(r_linx4b, mapvalx4b); -+ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); -+ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); -+ -+ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767); -+ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767); -+ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767); -+ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767); -+ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767); -+ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767); -+ -+ rx4a = vcvtq_s32_f32(r_linx4a); -+ rx4a = vminq_s32(rx4a, output_upper_bound); -+ rx4a = vmaxq_s32(rx4a, zerox4); -+ gx4a = vcvtq_s32_f32(g_linx4a); -+ gx4a = vminq_s32(gx4a, output_upper_bound); -+ gx4a = vmaxq_s32(gx4a, zerox4); -+ bx4a = vcvtq_s32_f32(b_linx4a); -+ bx4a = vminq_s32(bx4a, output_upper_bound); -+ bx4a = vmaxq_s32(bx4a, zerox4); -+ rx4b = vcvtq_s32_f32(r_linx4b); -+ rx4b = vminq_s32(rx4b, output_upper_bound); -+ rx4b = vmaxq_s32(rx4b, zerox4); -+ gx4b = vcvtq_s32_f32(g_linx4b); -+ gx4b = vminq_s32(gx4b, output_upper_bound); -+ gx4b = vmaxq_s32(gx4b, zerox4); -+ bx4b = vcvtq_s32_f32(b_linx4b); -+ bx4b = vminq_s32(bx4b, output_upper_bound); -+ bx4b = vmaxq_s32(bx4b, zerox4); -+ -+ r_out[0] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 0)]; -+ r_out[1] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 1)]; -+ r_out[2] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 0)]; -+ r_out[3] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 1)]; -+ r_out[4] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 0)]; -+ r_out[5] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 1)]; -+ r_out[6] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 0)]; -+ r_out[7] = 
delin_lut[vget_lane_s32(vget_high_s32(rx4b), 1)]; -+ -+ g_out[0] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 0)]; -+ g_out[1] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 1)]; -+ g_out[2] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 0)]; -+ g_out[3] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 1)]; -+ g_out[4] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 0)]; -+ g_out[5] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 1)]; -+ g_out[6] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 0)]; -+ g_out[7] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 1)]; -+ -+ b_out[0] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 0)]; -+ b_out[1] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 1)]; -+ b_out[2] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 0)]; -+ b_out[3] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 1)]; -+ b_out[4] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 0)]; -+ b_out[5] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 1)]; -+ b_out[6] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 0)]; -+ b_out[7] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 1)]; -+} -+#endif -+ -+// See also libavfilter/colorspacedsp_template.c -+static void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; -+ -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; -+ -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); -+ -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * 
u + cgv * v + in_rnd) >> in_sh); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + + b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); + b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); @@ -1094,445 +887,678 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int r10 = r[2], g10 = g[2], b10 = b[2]; + int r11 = r[3], g11 = g[3], b11 = b[3]; + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); + +#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); +#undef AVG + } + } +} + -+static void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = 
(*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; -+ -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; -+ -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ ++ThreadData *td = arg; \ ++AVFrame *in = td->in; \ ++AVFrame *out = td->out; \ ++const AVPixFmtDescriptor *desc = td->desc; \ ++const AVPixFmtDescriptor *odesc = td->odesc; \ ++const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ ++const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ ++const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ ++TonemapIntParams params = { \ ++.lut_peak = s->lut_peak, \ ++.lin_lut = s->lin_lut, \ ++.tonemap_lut = s->tonemap_lut, \ ++.delin_lut = s->delin_lut, \ ++.in_yuv_off = s->in_yuv_off, \ ++.out_yuv_off = s->out_yuv_off, \ ++.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ ++.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ ++.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ ++.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ ++.coeffs = s->coeffs, \ ++.ocoeffs = s->ocoeffs, \ ++.desat = s->desat, \ ++}; + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, 
params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ return 0; ++} + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); -+#undef AVG -+ } -+ } ++ return 0; +} + -+#if ARCH_X86 -+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) ++static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +{ -+ __m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); -+ __m128i mask = _mm_set1_epi32(~0xFFFF); -+ __m128i condition = _mm_and_si128(add_result, mask); -+ __m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ __m128i shifted = _mm_srai_epi32(a, 31); -+ __m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++ s->tonemap_func_planar10(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * 
AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); ++ return 0; +} + -+X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) ++static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +{ -+ __m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); -+ __m256i mask = _mm256_set1_epi32(~0xFFFF); -+ __m256i condition = _mm256_and_si256(add_result, mask); -+ __m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ __m256i shifted = _mm256_srai_epi32(a, 31); -+ __m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); ++ s->tonemap_func_biplanar10(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); ++ return 0; +} + -+X86_64_V2 static void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++static int filter_frame(AVFilterLink *link, AVFrame *in) +{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; ++ AVFilterContext *ctx = link->dst; ++ TonemapxContext *s = ctx->priv; ++ AVFilterLink *outlink = ctx->outputs[0]; ++ AVFrame *out; ++ const AVPixFmtDescriptor *desc; ++ const AVPixFmtDescriptor *odesc; ++ int ret; ++ double peak = s->peak; ++ const AVLumaCoefficients *coeffs; ++ ThreadData td; + -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ desc = av_pix_fmt_desc_get(link->format); ++ odesc = av_pix_fmt_desc_get(outlink->format); ++ if (!desc || !odesc) { ++ av_frame_free(&in); ++ return AVERROR_BUG; ++ } + -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ switch (odesc->comp[2].plane) { ++ case 1: // biplanar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_biplanar8; ++ } else { ++ s->filter_slice = filter_slice_biplanar10; ++ } ++ break; ++ default: ++ case 2: // planar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_planar8; ++ } else { ++ s->filter_slice = filter_slice_planar10; ++ } ++ break; ++ } + -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); ++ if (!out) { ++ av_frame_free(&in); ++ return AVERROR(ENOMEM); ++ } + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 
<< (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++ if ((ret = av_frame_copy_props(out, in)) < 0) ++ goto fail; + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ /* read peak from side data if not passed in */ ++ if (!peak) { ++ peak = ff_determine_signal_peak(in); ++ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); ++ } + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ out->color_trc = s->trc; ++ out->colorspace = s->spc; ++ out->color_primaries = s->pri; ++ out->color_range = s->range; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; ++ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) ++ in->color_trc = AVCOL_TRC_SMPTE2084; ++ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) ++ out->color_trc = AVCOL_TRC_BT709; + -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i ux4, vx4; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; -+ -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) ++ in->colorspace = AVCOL_SPC_BT2020_NCL; ++ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) ++ out->colorspace = AVCOL_SPC_BT709; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4; -+ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ in->color_primaries = AVCOL_PRI_BT2020; ++ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ out->color_primaries = AVCOL_PRI_BT709; + -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); -+ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) ++ in->color_range = AVCOL_RANGE_MPEG; ++ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) ++ out->color_range = AVCOL_RANGE_MPEG; + -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ ux4 = _mm_cvtepu16_epi32(ux4); -+ vx4 = _mm_cvtepu16_epi32(vx4); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = 
_mm_sub_epi32(y1x4b, in_yuv_offx4); -+ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); -+ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); ++ if (!s->lin_lut || !s->delin_lut) { ++ if ((ret = comput_trc_luts(s, in->color_trc, out->color_trc)) < 0) ++ goto fail; ++ } + -+ ux4a = _mm_unpacklo_epi32(ux4, ux4); -+ ux4b = _mm_unpackhi_epi32(ux4, ux4); -+ vx4a = _mm_unpacklo_epi32(vx4, vx4); -+ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ if (!s->tonemap_lut || s->lut_peak != peak) { ++ s->lut_peak = peak; ++ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) ++ goto fail; ++ } + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); ++ if (s->coeffs != coeffs) { ++ s->coeffs = coeffs; ++ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); ++ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, ++ in->color_range, out->color_range)) < 0) ++ goto fail; ++ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) ++ goto fail; ++ } + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ /* do the tonemap */ ++ td.in = in; ++ td.out = out; ++ td.desc = desc; ++ td.odesc = odesc; ++ td.peak = peak; ++ ctx->internal->execute(ctx, s->filter_slice, &td, NULL, ++ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ av_frame_free(&in); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ ff_update_hdr_metadata(out, peak); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = av_clip_int16_sse(b0x4a); ++ return ff_filter_frame(outlink, out); ++fail: ++ av_frame_free(&in); ++ av_frame_free(&out); ++ return ret; ++} + -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = _mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++static void uninit(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; + -+ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ av_freep(&s->lin_lut); ++ av_freep(&s->delin_lut); ++ av_freep(&s->tonemap_lut); ++} + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); 
-+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++static int query_formats(AVFilterContext *ctx) ++{ ++ enum AVPixelFormat valid_in_pix_fmts[4]; ++ AVFilterFormats *formats; ++ const AVPixFmtDescriptor *desc; ++ TonemapxContext *s = ctx->priv; + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ if (!strcmp(s->format_str, "same")) { ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ s->format = AV_PIX_FMT_NONE; ++ } else { ++ int i, j = 0; ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ s->format = av_get_pix_fmt(s->format_str); ++ // Check again in case of the string is invalid ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ desc = av_pix_fmt_desc_get(s->format); ++ // Filter out the input formats for requested output formats ++ // The input and output must have the same planar format, either planar or bi-planar packed ++ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { ++ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); ++ if (tdesc->comp[2].plane == desc->comp[2].plane) { ++ valid_in_pix_fmts[j] = in_pix_fmts[i]; ++ j++; ++ } ++ } ++ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; ++ formats = ff_make_format_list(valid_in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (out_format_is_supported(s->format)) { ++ formats = NULL; ++ res = ff_add_format(&formats, s->format); ++ if (res < 0) ++ return res; ++ } else { ++ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", ++ av_get_pix_fmt_name(s->format)); ++ return AVERROR(ENOSYS); ++ } ++ } + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++} + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++static av_cold int init(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ int cpu_flags = av_get_cpu_flags(); ++ av_log(ctx, AV_LOG_DEBUG, "Requested output format: %s\n", ++ s->format_str); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++#if ARCH_AARCH64 ++ if (have_neon(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; ++ 
s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; ++ } ++#elif ARCH_X86 ++ if (X86_SSE42(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; ++ } ++ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; ++ } ++#endif + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ if (!s->tonemap_func_biplanar8) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++ } + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ if (!s->tonemap_func_biplanar10) { ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; ++ } + -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ if (!s->tonemap_func_planar8) { ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; ++ } + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ if (!s->tonemap_func_planar10) { ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; ++ } + -+ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, 21); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ switch(s->tonemap) { ++ case TONEMAP_GAMMA: ++ if (isnan(s->param)) ++ s->param = 1.8f; ++ break; ++ case TONEMAP_REINHARD: ++ if (!isnan(s->param)) ++ s->param = (1.0f - s->param) / s->param; ++ break; ++ case TONEMAP_MOBIUS: ++ if (isnan(s->param)) ++ s->param = 0.3f; ++ break; ++ } + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); -+ yobx4 = _mm_add_epi32(yobx4, 
_mm_set1_epi32(params->out_yuv_off)); ++ if (isnan(s->param)) ++ s->param = 1.0f; + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ return 0; ++} + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++#define OFFSET(x) offsetof(TonemapxContext, x) ++#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM ++static const AVOption tonemapx_options[] = { ++ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" }, ++ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, "tonemap" }, ++ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, "tonemap" }, ++ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, "tonemap" }, ++ { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, "tonemap" }, ++ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" }, ++ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, "tonemap" }, ++ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, "tonemap" }, ++ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, "tonemap" }, ++ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, "transfer" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" }, ++ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, ++ { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, "primaries" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" }, ++ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "full", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "format", "output format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, ++ { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, 
FLAGS }, ++ { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { NULL } ++}; + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++AVFILTER_DEFINE_CLASS(tonemapx); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++static const AVFilterPad tonemapx_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++}; + -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++static const AVFilterPad tonemapx_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++}; + -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++AVFilter ff_vf_tonemapx = { ++ .name = "tonemapx", ++ .description = NULL_IF_CONFIG_SMALL("HDR to SDR tonemapping"), ++ .init = init, ++ .uninit = uninit, ++ .priv_size = sizeof(TonemapxContext), ++ .priv_class = &tonemapx_class, ++ FILTER_INPUTS(tonemapx_inputs), ++ FILTER_OUTPUTS(tonemapx_outputs), ++ FILTER_QUERY_FUNC(query_formats), ++ .flags = AVFILTER_FLAG_SLICE_THREADS, ++}; +Index: FFmpeg/libavfilter/aarch64/Makefile +=================================================================== +--- FFmpeg.orig/libavfilter/aarch64/Makefile ++++ FFmpeg/libavfilter/aarch64/Makefile +@@ -1,3 +1,4 @@ + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += aarch64/vf_tonemapx_intrin_neon.o + + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +@@ -0,0 +1,1216 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++#include "vf_tonemapx_intrin_neon.h" + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ int16x8_t sig8; ++ float32x4_t mapvalx4a; ++ float32x4_t mapvalx4b; ++ float32x4_t r_linx4a; ++ float32x4_t r_linx4b; ++ float32x4_t g_linx4a; ++ float32x4_t g_linx4b; ++ float32x4_t b_linx4a; ++ float32x4_t b_linx4b; ++ float32x4_t offset = vdupq_n_f32(0.5f); ++ int32x4_t output_upper_bound = vdupq_n_s32(32767); ++ int32x4_t zerox4 = vdupq_n_s32(0); ++ int16x8_t input_lut_offset = vdupq_n_s16(2048); ++ int16x8_t input_upper_bound = vdupq_n_s16(32767); ++ int16x8_t r, g, b; ++ int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ float mapval4a[4], mapval4b[4], r_lin4a[4], r_lin4b[4], g_lin4a[4], g_lin4b[4], b_lin4a[4], b_lin4b[4]; + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ r = vreinterpretq_s16_u16(r_in); ++ g = vreinterpretq_s16_u16(g_in); ++ b = vreinterpretq_s16_u16(b_in); + -+ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, 21); -+ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); ++ sig8 = vaddq_s16(sig8, input_lut_offset); ++ sig8 = vminq_s16(sig8, input_upper_bound); ++ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); + -+ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, 21); -+ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); -+ } -+ } ++ r = vaddq_s16(r, input_lut_offset); ++ r = vminq_s16(r, input_upper_bound); ++ r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); ++ g = vaddq_s16(g, input_lut_offset); ++ g = vminq_s16(g, input_upper_bound); ++ g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); ++ b = vaddq_s16(b, input_lut_offset); ++ b = vminq_s16(b, input_upper_bound); ++ b = 
vmaxq_s16(b, vreinterpretq_s16_s32(zerox4)); + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; -+ rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); -+ } -+} ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ ++mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ ++r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ ++r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ ++g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ ++g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ ++b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ ++b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; + -+X86_64_V3 static void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ ++#undef LOAD_LUT ++ ++ mapvalx4a = vld1q_f32(mapval4a); ++ mapvalx4b = vld1q_f32(mapval4b); ++ r_linx4a = vld1q_f32(r_lin4a); ++ r_linx4b = vld1q_f32(r_lin4b); ++ g_linx4a = vld1q_f32(g_lin4a); ++ g_linx4b = vld1q_f32(g_lin4b); ++ b_linx4a = vld1q_f32(b_lin4a); ++ b_linx4b = vld1q_f32(b_lin4b); ++ ++ if (!rgb2rgb_passthrough) { ++ r_linx4a = vmulq_n_f32(r_linx4a, (float)(*rgb2rgb)[0][0]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, g_linx4a, (float)(*rgb2rgb)[0][1]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, b_linx4a, (float)(*rgb2rgb)[0][2]); ++ r_linx4b = vmulq_n_f32(r_linx4b, (float)(*rgb2rgb)[0][0]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, g_linx4b, (float)(*rgb2rgb)[0][1]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, b_linx4b, (float)(*rgb2rgb)[0][2]); ++ ++ g_linx4a = vmulq_n_f32(g_linx4a, (float)(*rgb2rgb)[1][1]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, r_linx4a, (float)(*rgb2rgb)[1][0]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, b_linx4a, (float)(*rgb2rgb)[1][2]); ++ g_linx4b = vmulq_n_f32(g_linx4b, (float)(*rgb2rgb)[1][1]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, r_linx4b, (float)(*rgb2rgb)[1][0]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, b_linx4b, (float)(*rgb2rgb)[1][2]); ++ ++ b_linx4a = vmulq_n_f32(b_linx4a, (float)(*rgb2rgb)[2][2]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, r_linx4a, (float)(*rgb2rgb)[2][0]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, g_linx4a, (float)(*rgb2rgb)[2][1]); ++ b_linx4b = vmulq_n_f32(b_linx4b, (float)(*rgb2rgb)[2][2]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, r_linx4b, (float)(*rgb2rgb)[2][0]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, g_linx4b, (float)(*rgb2rgb)[2][1]); ++ } ++ ++ if (desat > 0) { ++ float32x4_t eps_x4 = vdupq_n_f32(FLOAT_EPS); ++ float32x4_t desat4 = vdupq_n_f32((float)desat); ++ float32x4_t luma4 = vdupq_n_f32(0); ++ float32x4_t overbright4; ++ // Group A ++ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4a, 
(float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4); ++ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4); ++ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4); ++ g_linx4a = vmlaq_f32(g_linx4a, luma4, overbright4); ++ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4); ++ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4); ++ // Group B ++ luma4 = vdupq_n_f32(0); ++ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4); ++ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4); ++ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4); ++ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4); ++ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4); ++ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4); ++ } ++ ++ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a); ++ g_linx4a = vmulq_f32(g_linx4a, mapvalx4a); ++ b_linx4a = vmulq_f32(b_linx4a, mapvalx4a); ++ ++ r_linx4b = vmulq_f32(r_linx4b, mapvalx4b); ++ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); ++ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); ++ ++ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767); ++ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767); ++ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767); ++ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767); ++ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767); ++ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767); ++ ++ rx4a = vcvtq_s32_f32(r_linx4a); ++ rx4a = vminq_s32(rx4a, output_upper_bound); ++ rx4a = vmaxq_s32(rx4a, zerox4); ++ gx4a = vcvtq_s32_f32(g_linx4a); ++ gx4a = vminq_s32(gx4a, output_upper_bound); ++ gx4a = vmaxq_s32(gx4a, zerox4); ++ bx4a = vcvtq_s32_f32(b_linx4a); ++ bx4a = vminq_s32(bx4a, output_upper_bound); ++ bx4a = vmaxq_s32(bx4a, zerox4); ++ rx4b = vcvtq_s32_f32(r_linx4b); ++ rx4b = vminq_s32(rx4b, output_upper_bound); ++ rx4b = vmaxq_s32(rx4b, zerox4); ++ gx4b = vcvtq_s32_f32(g_linx4b); ++ gx4b = vminq_s32(gx4b, output_upper_bound); ++ gx4b = vmaxq_s32(gx4b, zerox4); ++ bx4b = vcvtq_s32_f32(b_linx4b); ++ bx4b = vminq_s32(bx4b, output_upper_bound); ++ bx4b = vmaxq_s32(bx4b, zerox4); ++ ++ r_out[0] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 0)]; ++ r_out[1] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 1)]; ++ r_out[2] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 0)]; ++ r_out[3] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 1)]; ++ r_out[4] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 0)]; ++ r_out[5] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 1)]; ++ r_out[6] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 0)]; ++ r_out[7] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 1)]; ++ ++ g_out[0] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 0)]; ++ g_out[1] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 1)]; ++ g_out[2] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 0)]; ++ g_out[3] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 1)]; ++ g_out[4] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 0)]; ++ g_out[5] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 1)]; ++ g_out[6] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 0)]; ++ g_out[7] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 1)]; ++ ++ 
b_out[0] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 0)]; ++ b_out[1] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 1)]; ++ b_out[2] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 0)]; ++ b_out[3] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 1)]; ++ b_out[4] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 0)]; ++ b_out[5] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 1)]; ++ b_out[6] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 0)]; ++ b_out[7] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 1)]; ++} ++ ++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -1559,256 +1585,190 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); -+ -+ __m256i ux8, vx8; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ uint16x4_t ux4, vx4; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = 
vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g1x8 = 
vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); -+ -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); -+ -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); -+ -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); -+ -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); -+ -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); -+ -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); -+ -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); -+ -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, 
b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = 
_mm256_lddqu_si256((const __m256i_u *)b1); ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = 
_mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_lane_u32(&dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_lane_u32(&dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + 
rdstu += offset >> 1; + rdstv += offset >> 1; @@ -1823,12 +1783,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V2 static void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; @@ -1868,239 +1828,197 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + + int16_t r[8], g[8], b[8]; + int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i uvx8, uvx4a, uvx4b; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; -+ -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); -+ ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); + if (in_depth == 10) { + // shift to low10bits for 10bit 
input + // shift bit has to be compile-time constant -+ y0x8 = _mm_srli_epi16(y0x8, 6); -+ y1x8 = _mm_srli_epi16(y1x8, 6); -+ uvx8 = _mm_srli_epi16(uvx8, 6); ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); + } -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ uvx4a = _mm_cvtepu16_epi32(uvx8); -+ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); -+ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); -+ -+ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = av_clip_int16_sse(b0x4a); ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = 
_mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); -+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, 
params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, 21); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); -+ -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = 
_mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); -+ -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); -+ -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); + -+ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uoax4 = _mm_srai_epi32(uoax4, 21); -+ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); 
+ -+ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ voax4 = _mm_srai_epi32(voax4, 21); -+ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); + -+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); -+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); -+ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); ++ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); + } + } + @@ -2119,33 +2037,33 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V3 static void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -2162,287 +2080,215 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); -+ -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = 
vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x4_t ux4, vx4; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); -+ } ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = 
_mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); -+ -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = 
_mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); -+ -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); -+ -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); -+ -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); -+ -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); -+ -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); -+ -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); -+ -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = 
_mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], 
_mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, 
_mm256_set1_epi32(out_uv_offset)); ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 19); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} -+#endif + -+#if ARCH_AARCH64 -+static void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; ++ const uint16_t *rsrcuv = srcuv; + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd @@ -2452,11 +2298,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -2484,43 +2332,56 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); + uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); + uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; + uint16x8_t y0x8, y1x8, ux8, vx8; + uint16x8_t r0x8, g0x8, b0x8; + uint16x8_t r1x8, g1x8, b1x8; -+ uint16x4_t ux4, vx4; + + int16x8_t r0ox8, 
g0ox8, b0ox8; -+ int16x8_t y0ox8; ++ uint16x8_t y0ox8; + int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; + int32x4_t y0oax4, y0obx4; + + int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; ++ uint16x8_t y1ox8; + int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; + int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; + int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; + int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); + int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); + int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); + int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + + y0x8 = vld1q_u16(srcy + x); + y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ ux4 = vld1_u16(srcu + (x >> 1)); -+ vx4 = vld1_u16(srcv + (x >> 1)); -+ ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } + y0x8 = vsubq_u16(y0x8, in_yuv_offx8); + y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); -+ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + + r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); + r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); @@ -2569,18 +2430,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 21); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + + y0obx4 = vmulq_n_s32(r0obx4, cry); + y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); + y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); + y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 21); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); -+ -+ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); -+ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + + r1ox8 = vld1q_s16(r1); + g1ox8 = vld1q_s16(g1); @@ -2598,18 +2452,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); + y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); + y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 21); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + + y1obx4 = vmulq_n_s32(r1obx4, cry); + y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); + y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); + y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = 
vshrq_n_s32(y1obx4, 21); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); -+ -+ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); -+ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); + + ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); + ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); @@ -2641,16 +2488,50 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); + uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); + uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 21); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_lane_u32(&dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); + + vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 21); ++ ++ switch(out_depth) { ++ default: ++ case 10: ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ uox4 = vshrq_n_s32(uox4, 19); ++ vox4 = vshrq_n_s32(vox4, 19); ++ break; ++ case 16: ++ y0oax4 = vshrq_n_s32(y0oax4, 13); ++ y0obx4 = vshrq_n_s32(y0obx4, 13); ++ y1oax4 = vshrq_n_s32(y1oax4, 13); ++ y1obx4 = vshrq_n_s32(y1obx4, 13); ++ uox4 = vshrq_n_s32(uox4, 13); ++ vox4 = vshrq_n_s32(vox4, 13); ++ break; ++ } ++ ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_lane_u32(&dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); ++ vst1q_u16(&dsty[x], y0ox8); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); + } + } + @@ -2658,281 +2539,285 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + if (remainw) { + int offset = width & (int)0xfffffff8; + rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; ++ rdstuv += offset; + rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} +Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+static void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++#ifndef FFMPEG_VF_TONEMAPX_INTRIN_NEON_H ++#define FFMPEG_VF_TONEMAPX_INTRIN_NEON_H + -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; ++#include + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; ++#include "libavfilter/vf_tonemapx.h" + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, 
g1x8, b1x8; ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ int16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++#endif //FFMPEG_VF_TONEMAPX_INTRIN_NEON_H +Index: FFmpeg/libavfilter/x86/Makefile +=================================================================== +--- FFmpeg.orig/libavfilter/x86/Makefile ++++ FFmpeg/libavfilter/x86/Makefile +@@ -39,6 +39,8 @@ OBJS-$(CONFIG_VOLUME_FILTER) + OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o + OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o + OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_avx.o + + X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o + +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +@@ -0,0 +1,1362 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++#include "vf_tonemapx_intrin_sse.h" + -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction ++// cast the register into float register and store with movss as a workaround ++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) ++__attribute__((always_inline)) ++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { ++ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); ++ return; ++} ++#endif + -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) ++{ ++__m128i mask = _mm_set1_epi32(0x7FFF); ++__m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++__m128i zero = _mm_setzero_si128(); ++__m128i cmp = _mm_cmpeq_epi32(condition, zero); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++__m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); ++__m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++return result; ++} + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) ++{ ++__m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); ++__m128i mask = _mm_set1_epi32(~0xFFFF); ++__m128i condition = _mm_and_si128(add_result, mask); ++__m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++__m128i shifted = _mm_srai_epi32(a, 31); ++__m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); ++} + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ 
tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128i input_lut_offset = _mm_set1_epi32(2048); ++ __m128i upper_bound = _mm_set1_epi32(32767); ++ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); ++ __m128i r, g, b, rx4, gx4, bx4; + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); ++ sig4 = _mm_add_epi32(sig4, input_lut_offset); ++ sig4 = av_clip_uint16_sse(sig4); + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ r = _mm_add_epi32(r_in, input_lut_offset); ++ r = av_clip_uint16_sse(r); ++ g = _mm_add_epi32(g_in, input_lut_offset); ++ g = av_clip_uint16_sse(g); ++ b = _mm_add_epi32(b_in, input_lut_offset); ++ b = av_clip_uint16_sse(b); + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 21); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 21); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) + -+ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); -+ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++#undef LOAD_LUT + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ if (!rgb2rgb_passthrough) { ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = 
_mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 21); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ } + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 21); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ if (desat > 0) { ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; + -+ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); -+ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ } + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, 
vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 21); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 21); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = av_clip_uint16_sse(rx4); ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = av_clip_uint16_sse(gx4); ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = av_clip_uint16_sse(bx4); + -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; + -+ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); -+ } -+ } ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstuv += offset; -+ rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); -+ } ++#undef SAVE_COLOR +} -+#endif + -+static void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; @@ -2958,213 +2843,19 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u 
= (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); -+ -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); -+ -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; -+ -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); -+ -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); -+ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); -+#undef AVG -+ } -+ } -+} -+ -+static void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << 
(out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; -+ -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; -+ -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); -+ -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); -+ -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; -+ -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + 
x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } -+ } -+} -+ -+#if ARCH_X86 -+X86_64_V2 static void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; -+ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; -+ -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; -+ -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i ux4, vx4; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + + __m128i r0ox8, g0ox8, b0ox8; + __m128i y0ox8; @@ -3177,8 +2868,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i y1oax4, y1obx4; + __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for 
(; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; + @@ -3206,27 +2897,25 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r0x4a = _mm_add_epi32(r0x4a, rndx4); + r0x4a = _mm_srai_epi32(r0x4a, in_sh); + r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r1x4a = _mm_add_epi32(r1x4a, rndx4); + r1x4a = _mm_srai_epi32(r1x4a, in_sh); + r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g0x4a = _mm_add_epi32(g0x4a, rndx4); + g0x4a = _mm_srai_epi32(g0x4a, in_sh); + g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g1x4a = _mm_add_epi32(g1x4a, rndx4); @@ -3234,51 +2923,45 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b0x4a = _mm_add_epi32(b0x4a, rndx4); + b0x4a = _mm_srai_epi32(b0x4a, in_sh); + b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b1x4a = _mm_add_epi32(b1x4a, rndx4); + b1x4a = _mm_srai_epi32(b1x4a, in_sh); + b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r0x4b = _mm_add_epi32(r0x4b, rndx4); + r0x4b = _mm_srai_epi32(r0x4b, in_sh); + r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r1x4b = _mm_add_epi32(r1x4b, rndx4); + r1x4b = _mm_srai_epi32(r1x4b, in_sh); + r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g0x4b = _mm_add_epi32(g0x4b, rndx4); + g0x4b = _mm_srai_epi32(g0x4b, in_sh); + g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g1x4b = _mm_add_epi32(g1x4b, rndx4); + g1x4b = 
_mm_srai_epi32(g1x4b, in_sh); + g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b0x4b = _mm_add_epi32(b0x4b, rndx4); + b0x4b = _mm_srai_epi32(b0x4b, in_sh); + b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b1x4b = _mm_add_epi32(b1x4b, rndx4); + b1x4b = _mm_srai_epi32(b1x4b, in_sh); @@ -3317,18 +3000,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_srai_epi32(yobx4, 21); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); + g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); @@ -3346,18 +3029,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); + ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); @@ -3377,16 +3060,16 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); + uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, out_sh); ++ uox4 = _mm_srai_epi32(uox4, 21); + uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); + + vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), 
_mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); + vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, out_sh); ++ vox4 = _mm_srai_epi32(vox4, 21); + vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); + } + } + @@ -3399,20 +3082,20 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + rsrcy += offset; + rsrcu += offset >> 1; + rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} + -+X86_64_V3 static void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; + uint16_t *rdstu = dstu; @@ -3423,7 +3106,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -3450,253 +3133,247 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; -+ __m256i ux8, vx8; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ 
__m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; -+ -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); ++ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ 
r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = 
_mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const 
__m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); -+ -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ 
gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, out_sh); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); ++ ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, out_sh); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstu += offset >> 1; + rdstv += offset >> 1; @@ -3711,15 +3388,15 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V2 static void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcuv = srcuv; + int rheight = height; @@ -3731,12 +3408,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + const int in_uv_offset = 128 << (in_depth - 8); + 
const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); ++// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; ++// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -3776,9 +3454,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + __m128i y1ox8; + __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; + __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4, uvox8; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { + for (int xx = 0; xx < width >> 3; xx++) { + int x = xx << 3; @@ -3813,27 +3491,25 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r0x4a = _mm_add_epi32(r0x4a, rndx4); + r0x4a = _mm_srai_epi32(r0x4a, in_sh); + r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r1x4a = _mm_add_epi32(r1x4a, rndx4); + r1x4a = _mm_srai_epi32(r1x4a, in_sh); + r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g0x4a = _mm_add_epi32(g0x4a, rndx4); + g0x4a = _mm_srai_epi32(g0x4a, in_sh); + g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g1x4a = _mm_add_epi32(g1x4a, rndx4); @@ -3841,51 +3517,45 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b0x4a = _mm_add_epi32(b0x4a, rndx4); + b0x4a = _mm_srai_epi32(b0x4a, in_sh); + b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b1x4a = _mm_add_epi32(b1x4a, rndx4); + b1x4a = _mm_srai_epi32(b1x4a, in_sh); + b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r0x4b = _mm_add_epi32(r0x4b, rndx4); + r0x4b = _mm_srai_epi32(r0x4b, in_sh); + r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r1x4b = _mm_add_epi32(r1x4b, rndx4); + r1x4b = _mm_srai_epi32(r1x4b, 
in_sh); + r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g0x4b = _mm_add_epi32(g0x4b, rndx4); + g0x4b = _mm_srai_epi32(g0x4b, in_sh); + g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g1x4b = _mm_add_epi32(g1x4b, rndx4); + g1x4b = _mm_srai_epi32(g1x4b, in_sh); + g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b0x4b = _mm_add_epi32(b0x4b, rndx4); + b0x4b = _mm_srai_epi32(b0x4b, in_sh); + b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b1x4b = _mm_add_epi32(b1x4b, rndx4); + b1x4b = _mm_srai_epi32(b1x4b, in_sh); @@ -3924,19 +3594,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + + yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); + yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_srai_epi32(yobx4, 21); + yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + + r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); + g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); @@ -3954,19 +3623,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); + y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + + y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); + y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + + ravgx4 = _mm_hadd_epi32(roax4, robx4); + ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); 
@@ -3986,20 +3654,18 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); + uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); + uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uoax4 = _mm_srai_epi32(uoax4, out_sh); ++ uoax4 = _mm_srai_epi32(uoax4, 21); + uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); + + voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); + voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); + voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ voax4 = _mm_srai_epi32(voax4, out_sh); ++ voax4 = _mm_srai_epi32(voax4, 21); + voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); + + uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); + uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); -+ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); -+ uvox8 = _mm_slli_epi16(uvox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); ++ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); + } + } + @@ -4010,7 +3676,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + rdstuv += offset; + rsrcy += offset; + rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, + rsrcy, rsrcuv, + dstlinesize, srclinesize, + dstdepth, srcdepth, @@ -4018,12 +3684,12 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + } +} + -+X86_64_V3 static void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; @@ -4032,7 +3698,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -4060,262 +3726,259 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i 
rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4, uvox8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); + + if (in_depth == 10) { + // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ // shift bit has to be compile-time constant ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = _mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); + } ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); -+ -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, 
_MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b 
= _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ 
tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); -+ -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = 
_mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); ++ 
_mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, out_sh); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, out_sh); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); ++ uvox8 = _mm_slli_epi16(uvox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstuv += offset; + rsrcy += offset; @@ -4327,277 +3990,350 @@ Index: 
FFmpeg/libavfilter/vf_tonemapx.c + remainw, rheight, params); + } +} -+#endif +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h +@@ -0,0 +1,58 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+#if ARCH_AARCH64 -+static void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++#ifndef FFMPEG_VF_TONEMAPX_INTRIN_SSE_H ++#define FFMPEG_VF_TONEMAPX_INTRIN_SSE_H ++ ++#include ++#include ++#include ++ ++#include "libavfilter/vf_tonemapx.h" ++ ++X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; -+ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ const struct TonemapIntParams *params); ++ ++X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif //FFMPEG_VF_TONEMAPX_INTRIN_SSE_H +Index: FFmpeg/libavfilter/vf_tonemapx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/vf_tonemapx.h +@@ -0,0 +1,86 @@ ++/* ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++#ifndef FFMPEG_VF_TONEMAPX_H ++#define FFMPEG_VF_TONEMAPX_H + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++#include "colorspace.h" + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++#ifdef REFERENCE_WHITE ++#undef REFERENCE_WHITE ++#endif ++#define REFERENCE_WHITE 203.0f + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x4_t ux4, vx4; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; ++#ifdef FLOAT_EPS ++#undef FLOAT_EPS ++#endif ++#define FLOAT_EPS 1.175494351e-38f + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ uint16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++#if defined(__GNUC__) || defined(__clang__) ++# if (__GNUC__ >= 11) || (__clang_major__ >= 12) ++# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) ++# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) ++# else ++# define X86_64_V2 __attribute__((target("sse4.2"))) ++# define X86_64_V3 __attribute__((target("avx2,fma"))) ++# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) ++#endif // defined(__GNUC__) || defined(__clang__) + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ uint16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = 
vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++typedef struct TonemapIntParams { ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; ++ int16_t (*yuv2rgb_coeffs)[3][3][8]; ++ int16_t (*rgb2yuv_coeffs)[3][3][8]; ++ double (*rgb2rgb_coeffs)[3][3]; ++ int rgb2rgb_passthrough; ++ const AVLumaCoefficients *coeffs, *ocoeffs; ++ double desat; ++} TonemapIntParams; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ ux4 = vld1_u16(srcu + (x >> 1)); -+ vx4 = vld1_u16(srcv + (x >> 1)); -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); -+ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++#endif //FFMPEG_VF_TONEMAPX_H +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +@@ -0,0 +1,1387 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++#include "vf_tonemapx_intrin_avx.h" + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) ++{ ++__m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); ++__m256i mask = _mm256_set1_epi32(~0xFFFF); ++__m256i condition = _mm256_and_si256(add_result, mask); ++__m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++__m256i shifted = _mm256_srai_epi32(a, 31); ++__m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); ++} + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ __m256i sig8; ++ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; ++ __m256 offset = _mm256_set1_ps(0.5f); ++ __m256i zerox8 = _mm256_setzero_si256(); ++ __m256i input_lut_offset = _mm256_set1_epi32(2048); ++ __m256i upper_bound = _mm256_set1_epi32(32767); ++ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); ++ __m256i r, g, b, rx8, gx8, bx8; + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); ++ sig8 = _mm256_add_epi32(sig8, input_lut_offset); ++ sig8 = _mm256_min_epi32(sig8, upper_bound); ++ sig8 = _mm256_max_epi32(sig8, zerox8); + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ r = _mm256_add_epi32(r_in, input_lut_offset); ++ r = _mm256_min_epi32(r, upper_bound); ++ r = _mm256_max_epi32(r, zerox8); ++ g = _mm256_add_epi32(g_in, input_lut_offset); ++ g = _mm256_min_epi32(g, upper_bound); ++ g = _mm256_max_epi32(g, zerox8); ++ b = _mm256_add_epi32(b_in, input_lut_offset); ++ b = 
_mm256_min_epi32(b, upper_bound); ++ b = _mm256_max_epi32(b, zerox8); + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ ++r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ ++g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ ++b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; + -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ vst1q_u16(&dsty[x], y0ox8); ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) ++ LOAD_LUT(4) ++ LOAD_LUT(5) ++ LOAD_LUT(6) ++ LOAD_LUT(7) + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++#undef LOAD_LUT + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ mapvalx8 = _mm256_loadu_ps(mapval8); ++ r_linx8 = _mm256_loadu_ps(r_lin8); ++ g_linx8 = _mm256_loadu_ps(g_lin8); ++ b_linx8 = _mm256_loadu_ps(b_lin8); + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ if (!rgb2rgb_passthrough) { ++ r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); ++ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); ++ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); ++ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); ++ } + -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ if (desat > 0) { ++ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); ++ __m256 desat8 = _mm256_set1_ps((float)desat); ++ __m256 luma8 = _mm256_set1_ps(0); ++ __m256 overbright8; + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); ++ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), 
luma8); ++ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); ++ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); ++ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); ++ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); ++ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); ++ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); ++ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); ++ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); ++ } + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); ++ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); ++ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); ++ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); ++ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 19); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); ++ rx8 = _mm256_cvttps_epi32(r_linx8); ++ rx8 = _mm256_min_epi32(rx8, upper_bound); ++ rx8 = _mm256_max_epi32(rx8, zerox8); + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 19); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); -+ } -+ } ++ gx8 = _mm256_cvttps_epi32(g_linx8); ++ gx8 = _mm256_min_epi32(gx8, upper_bound); ++ gx8 = _mm256_max_epi32(gx8, zerox8); + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; -+ rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); -+ } ++ bx8 = _mm256_cvttps_epi32(b_linx8); ++ bx8 = _mm256_min_epi32(bx8, upper_bound); ++ bx8 = _mm256_max_epi32(bx8, zerox8); ++ ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ ++g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ ++b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++ ++ SAVE_COLOR(0) ++ SAVE_COLOR(1) ++ SAVE_COLOR(2) ++ SAVE_COLOR(3) ++ SAVE_COLOR(4) ++ SAVE_COLOR(5) ++ SAVE_COLOR(6) ++ 
SAVE_COLOR(7) ++ ++#undef SAVE_COLOR +} + -+static void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; + const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int remainw = width & 14; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -4614,652 +4350,1252 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ uint16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ uint16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t 
out_rndx4 = vdupq_n_s32(out_rnd); -+ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a 
= _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ 
g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ 
gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ switch(out_depth) { -+ default: -+ case 10: -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ uox4 = vshrq_n_s32(uox4, 19); 
-+ vox4 = vshrq_n_s32(vox4, 19); -+ break; -+ case 16: -+ y0oax4 = vshrq_n_s32(y0oax4, 13); -+ y0obx4 = vshrq_n_s32(y0obx4, 13); -+ y1oax4 = vshrq_n_s32(y1oax4, 13); -+ y1obx4 = vshrq_n_s32(y1obx4, 13); -+ uox4 = vshrq_n_s32(uox4, 13); -+ vox4 = vshrq_n_s32(vox4, 13); -+ break; -+ } ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); -+ vst1q_u16(&dsty[x], y0ox8); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, 
_mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff8; ++ int offset = width & (int)0xfffffff0; + rdsty += offset; -+ rdstuv += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; + rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } +} -+#endif -+ -+#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ -+ThreadData *td = arg; \ -+AVFrame *in = td->in; \ -+AVFrame *out = td->out; \ -+const AVPixFmtDescriptor *desc = td->desc; \ -+const AVPixFmtDescriptor *odesc = td->odesc; \ -+const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ -+const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ -+const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ -+TonemapIntParams params = { \ -+.lut_peak = s->lut_peak, \ -+.lin_lut = s->lin_lut, \ -+.tonemap_lut = s->tonemap_lut, \ -+.delin_lut = s->delin_lut, \ -+.in_yuv_off = s->in_yuv_off, \ -+.out_yuv_off = s->out_yuv_off, \ -+.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ -+.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ -+.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ -+.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ -+.coeffs = s->coeffs, \ -+.ocoeffs = s->ocoeffs, \ -+.desat = s->desat, \ -+}; + -+static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave 
last pixel emtpy when input is odd ++ int remainw = width & 14; + -+ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ return 0; -+} ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ return 0; -+} ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ s->tonemap_func_planar10(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ 
(void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ return 0; -+} ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ s->tonemap_func_biplanar10(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); + -+ return 0; -+} ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *ctx = link->dst; -+ TonemapxContext *s = ctx->priv; -+ AVFilterLink *outlink = ctx->outputs[0]; -+ AVFrame *out; -+ const AVPixFmtDescriptor *desc; -+ const AVPixFmtDescriptor *odesc; -+ int ret; -+ double peak = s->peak; -+ const AVLumaCoefficients *coeffs; -+ ThreadData td; ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ desc = av_pix_fmt_desc_get(link->format); -+ odesc = av_pix_fmt_desc_get(outlink->format); -+ if (!desc || !odesc) { -+ av_frame_free(&in); -+ return AVERROR_BUG; -+ } ++ r1x8a 
= _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + -+ switch (odesc->comp[2].plane) { -+ case 1: // biplanar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_biplanar8; -+ } else { -+ s->filter_slice = filter_slice_biplanar10; -+ } -+ break; -+ default: -+ case 2: // planar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_planar8; -+ } else { -+ s->filter_slice = filter_slice_planar10; -+ } -+ break; -+ } ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); -+ if (!out) { -+ av_frame_free(&in); -+ return AVERROR(ENOMEM); -+ } ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + -+ if ((ret = av_frame_copy_props(out, in)) < 0) -+ goto fail; ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ /* read peak from side data if not passed in */ -+ if (!peak) { -+ peak = ff_determine_signal_peak(in); -+ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); -+ } ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ out->color_trc = s->trc; -+ out->colorspace = s->spc; -+ out->color_primaries = s->pri; -+ out->color_range = s->range; ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) -+ in->color_trc = AVCOL_TRC_SMPTE2084; -+ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) -+ out->color_trc = AVCOL_TRC_BT709; ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) -+ in->colorspace = AVCOL_SPC_BT2020_NCL; -+ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) -+ out->colorspace = AVCOL_SPC_BT709; ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ 
g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ in->color_primaries = AVCOL_PRI_BT2020; -+ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ out->color_primaries = AVCOL_PRI_BT709; ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) -+ in->color_range = AVCOL_RANGE_MPEG; -+ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) -+ out->color_range = AVCOL_RANGE_MPEG; ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ if (!s->lin_lut || !s->delin_lut) { -+ if ((ret = comput_trc_luts(s, in->color_trc, out->color_trc)) < 0) -+ goto fail; -+ } ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ if (!s->tonemap_lut || s->lut_peak != peak) { -+ s->lut_peak = peak; -+ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) -+ goto fail; -+ } ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); -+ if (s->coeffs != coeffs) { -+ s->coeffs = coeffs; -+ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); -+ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, -+ in->color_range, out->color_range)) < 0) -+ goto fail; -+ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) -+ goto fail; -+ } ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ /* do the tonemap */ -+ td.in = in; -+ td.out = out; -+ td.desc = desc; -+ td.odesc = odesc; -+ td.peak = peak; -+ ctx->internal->execute(ctx, s->filter_slice, &td, NULL, -+ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ av_frame_free(&in); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ ff_update_hdr_metadata(out, peak); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ return ff_filter_frame(outlink, out); -+fail: -+ av_frame_free(&in); -+ av_frame_free(&out); -+ return ret; -+} ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+static void uninit(AVFilterContext *ctx) -+{ -+ TonemapxContext *s = ctx->priv; ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ av_freep(&s->lin_lut); -+ av_freep(&s->delin_lut); -+ av_freep(&s->tonemap_lut); -+} ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+static int query_formats(AVFilterContext *ctx) -+{ -+ enum AVPixelFormat valid_in_pix_fmts[4]; -+ AVFilterFormats *formats; -+ const AVPixFmtDescriptor *desc; -+ TonemapxContext *s = ctx->priv; ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ if (!strcmp(s->format_str, "same")) { -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ s->format = AV_PIX_FMT_NONE; -+ } else { -+ int i, j = 0; -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ s->format = av_get_pix_fmt(s->format_str); -+ // Check again in case of the string is invalid -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ desc = av_pix_fmt_desc_get(s->format); -+ // Filter out the input formats for requested output formats -+ // The input and output must have the same planar format, either planar or bi-planar packed -+ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { -+ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); -+ if (tdesc->comp[2].plane == desc->comp[2].plane) { -+ valid_in_pix_fmts[j] = in_pix_fmts[i]; -+ j++; -+ } -+ } -+ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; -+ formats = 
ff_make_format_list(valid_in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ if (out_format_is_supported(s->format)) { -+ formats = NULL; -+ res = ff_add_format(&formats, s->format); -+ if (res < 0) -+ return res; -+ } else { -+ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", -+ av_get_pix_fmt_name(s->format)); -+ return AVERROR(ENOSYS); -+ } -+ } ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); -+} ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+static av_cold int init(AVFilterContext *ctx) -+{ -+ TonemapxContext *s = ctx->priv; -+ int cpu_flags = av_get_cpu_flags(); -+ av_log(ctx, AV_LOG_DEBUG, "Requested output format: %s\n", -+ s->format_str); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+#if ARCH_AARCH64 -+ if (have_neon(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; -+ } -+#elif ARCH_X86 -+ if (X86_SSE42(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; -+ } -+ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; -+ } -+#endif ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + -+ if (!s->tonemap_func_biplanar8) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; -+ } ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ if (!s->tonemap_func_biplanar10) { -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; -+ } ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = 
_mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ if (!s->tonemap_func_planar8) { -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; -+ } ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+ if (!s->tonemap_func_planar10) { -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ } + } + -+ switch(s->tonemap) { -+ case TONEMAP_GAMMA: -+ if (isnan(s->param)) -+ s->param = 1.8f; -+ break; -+ case TONEMAP_REINHARD: -+ if (!isnan(s->param)) -+ s->param = (1.0f - s->param) / s->param; -+ break; -+ case TONEMAP_MOBIUS: -+ if (isnan(s->param)) -+ s->param = 0.3f; -+ break; ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } ++} + -+ if (isnan(s->param)) -+ s->param = 1.0f; ++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 16 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; + -+ return 0; -+} ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++// const int in_sh2 = 16 - in_depth; + -+#define OFFSET(x) offsetof(TonemapxContext, 
x) -+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM -+static const AVOption tonemapx_options[] = { -+ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" }, -+ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, "tonemap" }, -+ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, "tonemap" }, -+ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, "tonemap" }, -+ { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, "tonemap" }, -+ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" }, -+ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, "tonemap" }, -+ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, "tonemap" }, -+ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, "tonemap" }, -+ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, -+ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, "transfer" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" }, -+ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, -+ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, -+ { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, -+ { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, "primaries" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" }, -+ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, -+ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, -+ { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, -+ { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, -+ { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, -+ { "full", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, -+ { "format", "output format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, -+ { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, -+ { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, -+ { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, -+ { NULL } -+}; ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++// 
const int out_sh2 = 16 - out_depth; + -+AVFILTER_DEFINE_CLASS(tonemapx); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+static const AVFilterPad tonemapx_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+}; ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+static const AVFilterPad tonemapx_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ }, -+}; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+AVFilter ff_vf_tonemapx = { -+ .name = "tonemapx", -+ .description = NULL_IF_CONFIG_SMALL("HDR to SDR tonemapping"), -+ .init = init, -+ .uninit = uninit, -+ .priv_size = sizeof(TonemapxContext), -+ .priv_class = &tonemapx_class, -+ FILTER_INPUTS(tonemapx_inputs), -+ FILTER_OUTPUTS(tonemapx_outputs), -+ FILTER_QUERY_FUNC(query_formats), -+ .flags = AVFILTER_FLAG_SLICE_THREADS, -+}; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a 
= _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = 
_mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ 
dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} ++ ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 14; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; ++ ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; ++ ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ } ++ ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ 
y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); ++ ++ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); ++ ++ // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); ++ ++ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); ++ ++ // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); ++ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); ++ ++ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); ++ ++ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); ++ ++ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); ++ ++ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); ++ ++ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = 
_mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); ++ ++ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); ++ ++ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff0; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ 
rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++} +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +=================================================================== +--- /dev/null ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +@@ -0,0 +1,58 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef FFMPEG_VF_TONEMAPX_INTRIN_AVX_H ++#define FFMPEG_VF_TONEMAPX_INTRIN_AVX_H ++ ++#include ++#include ++#include ++ ++#include "libavfilter/vf_tonemapx.h" ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++#endif //FFMPEG_VF_TONEMAPX_INTRIN_AVX_H From 009db693148cff85e7b9b79d7fa508f40233f47f Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 15:32:39 +0800 Subject: [PATCH 15/27] ci: workaround gcc for windows on docker builder --- docker-build-win64.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-build-win64.sh b/docker-build-win64.sh index 2819658872b..eb63a6f39c9 100755 --- a/docker-build-win64.sh +++ b/docker-build-win64.sh @@ -588,6 +588,9 @@ ffversion="$(dpkg-parsechangelog --show-field Version)" if [[ -f "patches/series" ]]; then quilt push -a fi +# Workaround for GCC bug causing misaligned AVX instructions +CFLAGS+=" -Wa,-muse-unaligned-vector-move" +CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" ./configure \ --prefix=${FF_PREFIX} \ ${FF_TARGET_FLAGS} \ From c6c64b3b1141c9e313bcb8a9ba166bc346f11ffa Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 21:42:24 +0800 Subject: [PATCH 16/27] 
tonemapx/avfilter: don't redefine white and float eps --- debian/patches/0080-add-tonemapx-filter.patch | 62 ++++++++----------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 8a7e6674c4e..1c040dfb29b 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -285,13 +285,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + +static float bt2390(float s, float peak) +{ -+ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE); ++ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); + float scale = 1.0f / peak_pq; + + // SDR peak + float dst_peak = 1.0f; -+ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE) * scale; -+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE) * scale; ++ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; ++ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; + + float ks = 1.5f * maxLum - 0.5f; + float tb = (s_pq - ks) / (1.0f - ks); @@ -302,7 +302,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + (-2.0f * tb3 + 3.0f * tb2) * maxLum; + float sig = (s_pq < ks) ? s_pq : pb; + -+ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE); ++ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); +} + +static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) @@ -343,7 +343,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +static float linearize(float x, enum AVColorTransferCharacteristic trc_src) +{ + if (trc_src == AVCOL_TRC_SMPTE2084) -+ return eotf_st2084(x, REFERENCE_WHITE); ++ return eotf_st2084(x, REFERENCE_WHITE_ALT); + else if (trc_src == AVCOL_TRC_ARIB_STD_B67) + return eotf_arib_b67(x); + else @@ -538,11 +538,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + +// See also libavfilter/colorspacedsp_template.c +void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -629,11 +629,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -719,11 +719,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++ const uint16_t *srcy, const uint16_t *srcu, const 
uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -809,11 +809,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +} + +void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -4057,7 +4057,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,86 @@ +@@ -0,0 +1,76 @@ +/* + * This file is part of FFmpeg. + * @@ -4081,16 +4081,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + +#include "colorspace.h" + -+#ifdef REFERENCE_WHITE -+#undef REFERENCE_WHITE -+#endif -+#define REFERENCE_WHITE 203.0f -+ -+#ifdef FLOAT_EPS -+#undef FLOAT_EPS -+#endif -+#define FLOAT_EPS 1.175494351e-38f -+ +#if defined(__GNUC__) || defined(__clang__) +# if (__GNUC__ >= 11) || (__clang_major__ >= 12) +# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) From 6fb941647131c7dd35569ba5d1e4c3f642e17820 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 21:45:57 +0800 Subject: [PATCH 17/27] avfilter/tonemapx: use AVFILTER instead of FFMPEG_VF as header prefix --- debian/patches/0080-add-tonemapx-filter.patch | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 1c040dfb29b..2655051a5a6 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -2574,8 +2574,8 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef FFMPEG_VF_TONEMAPX_INTRIN_NEON_H -+#define FFMPEG_VF_TONEMAPX_INTRIN_NEON_H ++#ifndef AVFILTER_TONEMAPX_INTRIN_NEON_H ++#define AVFILTER_TONEMAPX_INTRIN_NEON_H + +#include + @@ -2609,7 +2609,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + int width, int height, + const struct TonemapIntParams *params); + -+#endif //FFMPEG_VF_TONEMAPX_INTRIN_NEON_H ++#endif //AVFILTER_TONEMAPX_INTRIN_NEON_H Index: FFmpeg/libavfilter/x86/Makefile =================================================================== --- FFmpeg.orig/libavfilter/x86/Makefile @@ -4015,8 +4015,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef FFMPEG_VF_TONEMAPX_INTRIN_SSE_H -+#define FFMPEG_VF_TONEMAPX_INTRIN_SSE_H ++#ifndef AVFILTER_TONEMAPX_INTRIN_SSE_H ++#define AVFILTER_TONEMAPX_INTRIN_SSE_H + +#include +#include @@ -4052,7 +4052,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + int width, int height, + const struct TonemapIntParams *params); + -+#endif //FFMPEG_VF_TONEMAPX_INTRIN_SSE_H ++#endif //AVFILTER_TONEMAPX_INTRIN_SSE_H Index: FFmpeg/libavfilter/vf_tonemapx.h 
=================================================================== --- /dev/null @@ -4076,8 +4076,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef FFMPEG_VF_TONEMAPX_H -+#define FFMPEG_VF_TONEMAPX_H ++#ifndef AVFILTER_TONEMAPX_H ++#define AVFILTER_TONEMAPX_H + +#include "colorspace.h" + @@ -4133,7 +4133,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + int width, int height, + const struct TonemapIntParams *params); + -+#endif //FFMPEG_VF_TONEMAPX_H ++#endif //AVFILTER_TONEMAPX_H Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null @@ -5551,8 +5551,8 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef FFMPEG_VF_TONEMAPX_INTRIN_AVX_H -+#define FFMPEG_VF_TONEMAPX_INTRIN_AVX_H ++#ifndef AVFILTER_TONEMAPX_INTRIN_AVX_H ++#define AVFILTER_TONEMAPX_INTRIN_AVX_H + +#include +#include @@ -5588,4 +5588,4 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h + int width, int height, + const struct TonemapIntParams *params); + -+#endif //FFMPEG_VF_TONEMAPX_INTRIN_AVX_H ++#endif //AVFILTER_TONEMAPX_INTRIN_AVX_H From 8694c6a6cc45aff3ab7af6894a77480431dddc18 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 22:35:08 +0800 Subject: [PATCH 18/27] avfilter/tonemapx: code cleanup --- debian/patches/0080-add-tonemapx-filter.patch | 118 +++++++++--------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 2655051a5a6..83ba6256705 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1203 @@ +@@ -0,0 +1,1211 @@ +/* + * This file is part of FFmpeg. 
+ * @@ -148,7 +148,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#include "libavutil/mem_internal.h" +#include "libavutil/opt.h" +#include "libavutil/cpu.h" -+#if ARCH_AARCH64 ++#if ARCH_AARCH64 +# include "libavutil/aarch64/cpu.h" +# include "aarch64/vf_tonemapx_intrin_neon.h" +#endif @@ -358,8 +358,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + return x; +} + -+static int comput_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, -+ enum AVColorTransferCharacteristic trc_dst) ++static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, ++ enum AVColorTransferCharacteristic trc_dst) +{ + int i; + @@ -514,21 +514,6 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + r_lin *= mapval; + g_lin *= mapval; + b_lin *= mapval; -+ -+ /*float cmin = FFMIN(FFMIN(r_lin, g_lin), b_lin); -+ if (cmin < 0.0) { -+ float luma = ocoeffs->cr * r_lin + ocoeffs->cg * g_lin + ocoeffs->cb * b_lin; -+ float coeff = cmin / (cmin - luma); -+ r_lin = MIX(r_lin, luma, coeff); -+ g_lin = MIX(g_lin, luma, coeff); -+ b_lin = MIX(b_lin, luma, coeff);avassert -+ } -+ float cmax = FFMAX(FFMAX(r_lin, g_lin), b_lin); -+ if (cmax > 1.0) { -+ r_lin /= cmax; -+ g_lin /= cmax; -+ b_lin /= cmax; -+ }*/ +#undef MIX + + *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; @@ -570,6 +555,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], @@ -610,10 +600,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + + dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); + dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); @@ -660,6 +650,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], @@ -700,10 +695,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + + dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); + dsty[x + 1] = 
av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); @@ -750,6 +745,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, @@ -790,10 +790,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + + dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); + dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); @@ -842,6 +842,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; ++ + int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, @@ -882,10 +887,10 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ int r00 = r[0], g00 = g[0], b00 = b[0]; -+ int r01 = r[1], g01 = g[1], b01 = b[1]; -+ int r10 = r[2], g10 = g[2], b10 = b[2]; -+ int r11 = r[3], g11 = g[3], b11 = b[3]; ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + + dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); + dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); @@ -966,9 +971,11 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + LOAD_TONEMAP_PARAMS + av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ s->tonemap_func_planar10(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (uint16_t *) (out->data[2] + ++ out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), + (void*)(in->data[0] + in->linesize[0] * slice_start), + (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), + (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), @@ -985,8 +992,9 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + LOAD_TONEMAP_PARAMS + av_log(s, 
AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ s->tonemap_func_biplanar10(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), + (void*)(in->data[0] + in->linesize[0] * slice_start), + (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), + out->linesize, in->linesize, @@ -1076,7 +1084,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + out->color_range = AVCOL_RANGE_MPEG; + + if (!s->lin_lut || !s->delin_lut) { -+ if ((ret = comput_trc_luts(s, in->color_trc, out->color_trc)) < 0) ++ if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) + goto fail; + } + @@ -1309,7 +1317,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + +AVFilter ff_vf_tonemapx = { + .name = "tonemapx", -+ .description = NULL_IF_CONFIG_SMALL("HDR to SDR tonemapping"), ++ .description = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"), + .init = init, + .uninit = uninit, + .priv_size = sizeof(TonemapxContext), @@ -1332,7 +1340,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,1216 @@ +@@ -0,0 +1,1215 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -1681,6 +1689,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ // output shift bits for 8bit outputs is 29 -8 = 21 + y0oax4 = vshrq_n_s32(y0oax4, 21); + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + @@ -1755,14 +1764,14 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); + uox4 = vshrq_n_s32(uox4, 21); + uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_lane_u32(&dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); ++ vst1_lane_u32((uint32_t *) &dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); + + vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); + vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); + vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); + vox4 = vshrq_n_s32(vox4, 21); + vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_lane_u32(&dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); ++ vst1_lane_u32((uint32_t *) &dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); + } + } + @@ -1803,13 +1812,11 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -1934,6 +1941,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, 
b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ // output shift bits for 8bit outputs is 29 -8 = 21 + y0oax4 = vshrq_n_s32(y0oax4, 21); + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + @@ -2298,7 +2306,6 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); @@ -2618,8 +2625,8 @@ Index: FFmpeg/libavfilter/x86/Makefile OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o -+OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o -+OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_avx.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o \ ++ x86/vf_tonemapx_intrin_avx.o X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o @@ -2627,7 +2634,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,1362 @@ +@@ -0,0 +1,1361 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -2699,7 +2706,6 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + __m128 mapvalx4, r_linx4, g_linx4, b_linx4; + __m128 offset = _mm_set1_ps(0.5f); + __m128i input_lut_offset = _mm_set1_epi32(2048); -+ __m128i upper_bound = _mm_set1_epi32(32767); + __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); + __m128i r, g, b, rx4, gx4, bx4; + @@ -3000,6 +3006,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 -8 = 21 + yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + @@ -3408,13 +3415,11 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -3594,6 +3599,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 -8 = 21 + yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + @@ -4138,7 +4144,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,1387 @@ +@@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -4928,13 +4934,11 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = 
in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+// const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); -+// const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; From 96654ff00ec1ecd6a1bdb4b31688440d5cd39977 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 30 Jun 2024 22:50:11 +0800 Subject: [PATCH 19/27] avfilter/tonemapx: merge sse and avx multiplication --- debian/patches/0080-add-tonemapx-filter.patch | 108 +++++------------- 1 file changed, 30 insertions(+), 78 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 83ba6256705..478a295cdbe 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -1689,7 +1689,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ // output shift bits for 8bit outputs is 29 -8 = 21 ++ // output shift bits for 8bit outputs is 29 - 8 = 21 + y0oax4 = vshrq_n_s32(y0oax4, 21); + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + @@ -1941,7 +1941,7 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); + y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); + y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ // output shift bits for 8bit outputs is 29 -8 = 21 ++ // output shift bits for 8bit outputs is 29 - 8 = 21 + y0oax4 = vshrq_n_s32(y0oax4, 21); + y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + @@ -2634,7 +2634,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,1361 @@ +@@ -0,0 +1,1345 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -3006,7 +3006,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 -8 = 21 ++ // output shift bits for 8bit outputs is 29 - 8 = 21 + yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + @@ -3194,27 +3194,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r0x4a = _mm_add_epi32(r0x4a, rndx4); + r0x4a = _mm_srai_epi32(r0x4a, in_sh); + r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r1x4a = _mm_add_epi32(r1x4a, rndx4); + r1x4a = _mm_srai_epi32(r1x4a, in_sh); + r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g0x4a = 
_mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g0x4a = _mm_add_epi32(g0x4a, rndx4); + g0x4a = _mm_srai_epi32(g0x4a, in_sh); + g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g1x4a = _mm_add_epi32(g1x4a, rndx4); @@ -3222,51 +3220,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b0x4a = _mm_add_epi32(b0x4a, rndx4); + b0x4a = _mm_srai_epi32(b0x4a, in_sh); + b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b1x4a = _mm_add_epi32(b1x4a, rndx4); + b1x4a = _mm_srai_epi32(b1x4a, in_sh); + b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r0x4b = _mm_add_epi32(r0x4b, rndx4); + r0x4b = _mm_srai_epi32(r0x4b, in_sh); + r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r1x4b = _mm_add_epi32(r1x4b, rndx4); + r1x4b = _mm_srai_epi32(r1x4b, in_sh); + r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g0x4b = _mm_add_epi32(g0x4b, rndx4); + g0x4b = _mm_srai_epi32(g0x4b, in_sh); + g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g1x4b = _mm_add_epi32(g1x4b, rndx4); + g1x4b = _mm_srai_epi32(g1x4b, in_sh); + g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b0x4b = _mm_add_epi32(b0x4b, rndx4); + b0x4b = _mm_srai_epi32(b0x4b, in_sh); + b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b1x4b = _mm_add_epi32(b1x4b, rndx4); + b1x4b = _mm_srai_epi32(b1x4b, in_sh); @@ -3599,7 +3591,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); + yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 -8 = 21 ++ // output shift bits for 8bit outputs is 29 - 8 = 21 + yoax4 = _mm_srai_epi32(yoax4, 21); + yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + @@ -3792,27 +3784,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, 
_mm_set1_epi32(crv))); + r0x4a = _mm_add_epi32(r0x4a, rndx4); + r0x4a = _mm_srai_epi32(r0x4a, in_sh); + r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); + r1x4a = _mm_add_epi32(r1x4a, rndx4); + r1x4a = _mm_srai_epi32(r1x4a, in_sh); + r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_mullo_epi32(y0x4a, cyx4); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g0x4a = _mm_add_epi32(g0x4a, rndx4); + g0x4a = _mm_srai_epi32(g0x4a, in_sh); + g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x4a = _mm_mullo_epi32(y1x4a, cyx4); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); + g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); + g1x4a = _mm_add_epi32(g1x4a, rndx4); @@ -3820,51 +3810,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_mullo_epi32(y0x4a, cyx4); + b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b0x4a = _mm_add_epi32(b0x4a, rndx4); + b0x4a = _mm_srai_epi32(b0x4a, in_sh); + b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x4a = _mm_mullo_epi32(y1x4a, cyx4); + b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); + b1x4a = _mm_add_epi32(b1x4a, rndx4); + b1x4a = _mm_srai_epi32(b1x4a, in_sh); + b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r0x4b = _mm_add_epi32(r0x4b, rndx4); + r0x4b = _mm_srai_epi32(r0x4b, in_sh); + r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); + r1x4b = _mm_add_epi32(r1x4b, rndx4); + r1x4b = _mm_srai_epi32(r1x4b, in_sh); + r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x4b = _mm_mullo_epi32(y0x4b, cyx4); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g0x4b = _mm_add_epi32(g0x4b, rndx4); + g0x4b = _mm_srai_epi32(g0x4b, in_sh); + g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x4b = _mm_mullo_epi32(y1x4b, cyx4); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); + g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); + g1x4b = _mm_add_epi32(g1x4b, rndx4); + g1x4b = _mm_srai_epi32(g1x4b, in_sh); + g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x4b = _mm_mullo_epi32(y0x4b, cyx4); + b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b0x4b = _mm_add_epi32(b0x4b, rndx4); + b0x4b = _mm_srai_epi32(b0x4b, in_sh); + b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x4b = _mm_mullo_epi32(y1x4b, cyx4); + b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); + b1x4b = _mm_add_epi32(b1x4b, rndx4); + b1x4b = _mm_srai_epi32(b1x4b, in_sh); @@ -4144,7 +4128,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,1385 @@ 
+@@ -0,0 +1,1353 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -4398,27 +4382,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r0x8a = _mm256_add_epi32(r0x8a, rndx8); + r0x8a = _mm256_srai_epi32(r0x8a, in_sh); + r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r1x8a = _mm256_add_epi32(r1x8a, rndx8); + r1x8a = _mm256_srai_epi32(r1x8a, in_sh); + r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g0x8a = _mm256_add_epi32(g0x8a, rndx8); + g0x8a = _mm256_srai_epi32(g0x8a, in_sh); + g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g1x8a = _mm256_add_epi32(g1x8a, rndx8); @@ -4426,51 +4408,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b0x8a = _mm256_add_epi32(b0x8a, rndx8); + b0x8a = _mm256_srai_epi32(b0x8a, in_sh); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b1x8a = _mm256_add_epi32(b1x8a, rndx8); + b1x8a = _mm256_srai_epi32(b1x8a, in_sh); + b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r0x8b = _mm256_add_epi32(r0x8b, rndx8); + r0x8b = _mm256_srai_epi32(r0x8b, in_sh); + r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r1x8b = _mm256_add_epi32(r1x8b, rndx8); + r1x8b = _mm256_srai_epi32(r1x8b, in_sh); + r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g0x8b = _mm256_add_epi32(g0x8b, rndx8); + g0x8b = _mm256_srai_epi32(g0x8b, in_sh); + g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g1x8b = _mm256_add_epi32(g1x8b, rndx8); + g1x8b = _mm256_srai_epi32(g1x8b, in_sh); + g1x8b = 
av_clip_int16_avx(g1x8b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b0x8b = _mm256_add_epi32(b0x8b, rndx8); + b0x8b = _mm256_srai_epi32(b0x8b, in_sh); + b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b1x8b = _mm256_add_epi32(b1x8b, rndx8); + b1x8b = _mm256_srai_epi32(b1x8b, in_sh); @@ -4704,27 +4680,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r0x8a = _mm256_add_epi32(r0x8a, rndx8); + r0x8a = _mm256_srai_epi32(r0x8a, in_sh); + r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r1x8a = _mm256_add_epi32(r1x8a, rndx8); + r1x8a = _mm256_srai_epi32(r1x8a, in_sh); + r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g0x8a = _mm256_add_epi32(g0x8a, rndx8); + g0x8a = _mm256_srai_epi32(g0x8a, in_sh); + g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g1x8a = _mm256_add_epi32(g1x8a, rndx8); @@ -4732,51 +4706,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b0x8a = _mm256_add_epi32(b0x8a, rndx8); + b0x8a = _mm256_srai_epi32(b0x8a, in_sh); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b1x8a = _mm256_add_epi32(b1x8a, rndx8); + b1x8a = _mm256_srai_epi32(b1x8a, in_sh); + b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r0x8b = _mm256_add_epi32(r0x8b, rndx8); + r0x8b = _mm256_srai_epi32(r0x8b, in_sh); + r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r1x8b = _mm256_add_epi32(r1x8b, rndx8); + r1x8b = _mm256_srai_epi32(r1x8b, in_sh); + r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g0x8b = 
_mm256_add_epi32(g0x8b, rndx8); + g0x8b = _mm256_srai_epi32(g0x8b, in_sh); + g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g1x8b = _mm256_add_epi32(g1x8b, rndx8); + g1x8b = _mm256_srai_epi32(g1x8b, in_sh); + g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b0x8b = _mm256_add_epi32(b0x8b, rndx8); + b0x8b = _mm256_srai_epi32(b0x8b, in_sh); + b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b1x8b = _mm256_add_epi32(b1x8b, rndx8); + b1x8b = _mm256_srai_epi32(b1x8b, in_sh); @@ -5014,27 +4982,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r0x8a = _mm256_add_epi32(r0x8a, rndx8); + r0x8a = _mm256_srai_epi32(r0x8a, in_sh); + r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r1x8a = _mm256_add_epi32(r1x8a, rndx8); + r1x8a = _mm256_srai_epi32(r1x8a, in_sh); + r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g0x8a = _mm256_add_epi32(g0x8a, rndx8); + g0x8a = _mm256_srai_epi32(g0x8a, in_sh); + g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g1x8a = _mm256_add_epi32(g1x8a, rndx8); @@ -5042,51 +5008,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b0x8a = _mm256_add_epi32(b0x8a, rndx8); + b0x8a = _mm256_srai_epi32(b0x8a, in_sh); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b1x8a = _mm256_add_epi32(b1x8a, rndx8); + b1x8a = _mm256_srai_epi32(b1x8a, in_sh); + b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r0x8b = _mm256_add_epi32(r0x8b, rndx8); + r0x8b = _mm256_srai_epi32(r0x8b, in_sh); + r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + r1x8b = _mm256_add_epi32(r1x8b, 
_mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r1x8b = _mm256_add_epi32(r1x8b, rndx8); + r1x8b = _mm256_srai_epi32(r1x8b, in_sh); + r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g0x8b = _mm256_add_epi32(g0x8b, rndx8); + g0x8b = _mm256_srai_epi32(g0x8b, in_sh); + g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g1x8b = _mm256_add_epi32(g1x8b, rndx8); + g1x8b = _mm256_srai_epi32(g1x8b, in_sh); + g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b0x8b = _mm256_add_epi32(b0x8b, rndx8); + b0x8b = _mm256_srai_epi32(b0x8b, in_sh); + b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b1x8b = _mm256_add_epi32(b1x8b, rndx8); + b1x8b = _mm256_srai_epi32(b1x8b, in_sh); @@ -5321,27 +5281,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r0x8a = _mm256_add_epi32(r0x8a, rndx8); + r0x8a = _mm256_srai_epi32(r0x8a, in_sh); + r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); + r1x8a = _mm256_add_epi32(r1x8a, rndx8); + r1x8a = _mm256_srai_epi32(r1x8a, in_sh); + r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g0x8a = _mm256_add_epi32(g0x8a, rndx8); + g0x8a = _mm256_srai_epi32(g0x8a, in_sh); + g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); + g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); + g1x8a = _mm256_add_epi32(g1x8a, rndx8); @@ -5349,51 +5307,45 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); + b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b0x8a = _mm256_add_epi32(b0x8a, rndx8); + b0x8a = _mm256_srai_epi32(b0x8a, in_sh); + b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); + b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); + b1x8a = _mm256_add_epi32(b1x8a, rndx8); + b1x8a = _mm256_srai_epi32(b1x8a, in_sh); + b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = g0x8b = 
b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r0x8b = _mm256_add_epi32(r0x8b, rndx8); + r0x8b = _mm256_srai_epi32(r0x8b, in_sh); + r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); + r1x8b = _mm256_add_epi32(r1x8b, rndx8); + r1x8b = _mm256_srai_epi32(r1x8b, in_sh); + r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g0x8b = _mm256_add_epi32(g0x8b, rndx8); + g0x8b = _mm256_srai_epi32(g0x8b, in_sh); + g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); + g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); + g1x8b = _mm256_add_epi32(g1x8b, rndx8); + g1x8b = _mm256_srai_epi32(g1x8b, in_sh); + g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); + b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b0x8b = _mm256_add_epi32(b0x8b, rndx8); + b0x8b = _mm256_srai_epi32(b0x8b, in_sh); + b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); + b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); + b1x8b = _mm256_add_epi32(b1x8b, rndx8); + b1x8b = _mm256_srai_epi32(b1x8b, in_sh); From 6200f53d72918aa7276ce0190db9d1a982933719 Mon Sep 17 00:00:00 2001 From: gnattu Date: Mon, 1 Jul 2024 00:40:18 +0800 Subject: [PATCH 20/27] avfilter/tonemapx: add compile time check for simd optimizations --- debian/patches/0080-add-tonemapx-filter.patch | 85 ++++++++++++++++--- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 478a295cdbe..a3ad715f693 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -115,7 +115,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1211 @@ +@@ -0,0 +1,1263 @@ +/* + * This file is part of FFmpeg. 
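/*
 * A minimal scalar sketch (not part of the patch) of the identity behind the
 * "merge sse and avx multiplication" change above: y * cy appears in all
 * three channel equations, so the SIMD paths now compute that product once
 * (r = g = b = y * cy) and only accumulate the channel-specific chroma terms
 * on top of it.  The "example_" name is hypothetical.
 */
static inline void example_yuv2rgb_pixel(int y, int u, int v,
                                         int cy, int crv, int cgu,
                                         int cgv, int cbu,
                                         int in_rnd, int in_sh,
                                         int *r, int *g, int *b)
{
    int luma = y * cy;                        /* shared term, one multiply */
    *r = (luma + crv * v           + in_rnd) >> in_sh;
    *g = (luma + cgu * u + cgv * v + in_rnd) >> in_sh;
    *b = (luma + cbu * u           + in_rnd) >> in_sh;
    /* the real code additionally clips each value to the int16 range */
}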
+ * @@ -148,15 +148,35 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +#include "libavutil/mem_internal.h" +#include "libavutil/opt.h" +#include "libavutil/cpu.h" -+#if ARCH_AARCH64 -+# include "libavutil/aarch64/cpu.h" -+# include "aarch64/vf_tonemapx_intrin_neon.h" -+#endif -+#if ARCH_X86 -+# include "libavutil/x86/cpu.h" -+# include "x86/vf_tonemapx_intrin_sse.h" -+# include "x86/vf_tonemapx_intrin_avx.h" -+#endif ++ ++#if defined(__GNUC__) || defined(__clang__) ++# if (__GNUC__ >= 10) || (__clang_major__ >= 11) ++# define ENABLE_TONEMAPX_INTRINSICS ++# endif // (__GNUC__ >= 10) || (__clang_major__ >= 11) ++#endif // defined(__GNUC__) || defined(__clang__) ++ ++#ifdef ENABLE_TONEMAPX_INTRINSICS ++# if ARCH_AARCH64 ++# if (HAVE_NEON) && (HAVE_INTRINSICS_NEON) ++# include "libavutil/aarch64/cpu.h" ++# include "aarch64/vf_tonemapx_intrin_neon.h" ++# else ++# undef ENABLE_INTRINSICS ++# endif ++# endif // ARCH_AARCH64 ++# if ARCH_X86 ++# include "libavutil/x86/cpu.h" ++# if (HAVE_SSE42) ++# include "x86/vf_tonemapx_intrin_sse.h" ++# define ENABLE_TONEMAPX_SSE_INTRINSICS ++# endif ++# if (HAVE_AVX2) && (HAVE_FMA3) ++# include "x86/vf_tonemapx_intrin_avx.h" ++# define ENABLE_TONEMAPX_AVX_INTRINSICS ++# endif ++# endif // ARCH_X86 ++#endif // ENABLE_TONEMAPX_INTRINSICS ++ + +#include "avfilter.h" +#include "formats.h" @@ -1199,30 +1219,46 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +{ + TonemapxContext *s = ctx->priv; + int cpu_flags = av_get_cpu_flags(); -+ av_log(ctx, AV_LOG_DEBUG, "Requested output format: %s\n", ++ enum SIMDVariant active_simd = SIMD_NONE; ++ av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", + s->format_str); + ++#ifdef ENABLE_TONEMAPX_INTRINSICS +#if ARCH_AARCH64 + if (have_neon(cpu_flags)) { + s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; + s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; ++ active_simd = SIMD_NEON; + } +#elif ARCH_X86 ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + if (X86_SSE42(cpu_flags)) { + s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; + s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; ++ active_simd = SIMD_SSE; + } ++#else ++ av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { + s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; + s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; + s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; ++ active_simd = SIMD_AVX; + } -+#endif ++#else ++ av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++#endif // ARCH_X86/ARCH_AARCH64 ++#else ++ av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_INTRINSICS + + if (!s->tonemap_func_biplanar8) { + s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; @@ -1240,6 +1276,22 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; + } + ++ switch(active_simd) { ++ case SIMD_NEON: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: 
NEON\n"); ++ break; ++ case SIMD_SSE: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n"); ++ break; ++ case SIMD_AVX: ++ av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n"); ++ break; ++ default: ++ case SIMD_NONE: ++ av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n"); ++ break; ++ } ++ + switch(s->tonemap) { + case TONEMAP_GAMMA: + if (isnan(s->param)) @@ -4047,7 +4099,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,76 @@ +@@ -0,0 +1,83 @@ +/* + * This file is part of FFmpeg. + * @@ -4095,6 +4147,13 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + double desat; +} TonemapIntParams; + ++enum SIMDVariant { ++ SIMD_NONE = -1, ++ SIMD_NEON, ++ SIMD_SSE, ++ SIMD_AVX ++}; ++ +void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, From 85dc4099599035a615d6b81d6b8c1419f393d131 Mon Sep 17 00:00:00 2001 From: gnattu Date: Mon, 1 Jul 2024 00:43:33 +0800 Subject: [PATCH 21/27] avfilter/tonemapx: correctly disable neon at compile time --- debian/patches/0080-add-tonemapx-filter.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index a3ad715f693..6cb7a912a9c 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -161,7 +161,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c +# include "libavutil/aarch64/cpu.h" +# include "aarch64/vf_tonemapx_intrin_neon.h" +# else -+# undef ENABLE_INTRINSICS ++# undef ENABLE_TONEMAPX_INTRINSICS +# endif +# endif // ARCH_AARCH64 +# if ARCH_X86 From 5b15f002ba1836f43f329910bc1d869118dce8f2 Mon Sep 17 00:00:00 2001 From: gnattu Date: Mon, 1 Jul 2024 14:24:33 +0800 Subject: [PATCH 22/27] avfilter/tonemapx: fix indent --- debian/patches/0080-add-tonemapx-filter.patch | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 6cb7a912a9c..7181d9b1446 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -4155,32 +4155,32 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h +}; + +void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const 
uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + +#endif //AVFILTER_TONEMAPX_H Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c From 713e7077c12ac7a8ea86381ab3411d86df00693f Mon Sep 17 00:00:00 2001 From: gnattu Date: Mon, 1 Jul 2024 15:09:12 +0800 Subject: [PATCH 23/27] ci: move windows gcc workaround to configure script --- builder/variants/defaults-gpl.sh | 2 -- debian/patches/0080-add-tonemapx-filter.patch | 17 +++++++++++++++++ docker-build-win64.sh | 3 --- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/builder/variants/defaults-gpl.sh b/builder/variants/defaults-gpl.sh index 4591981fe2c..e5a6523f669 100755 --- a/builder/variants/defaults-gpl.sh +++ b/builder/variants/defaults-gpl.sh @@ -6,5 +6,3 @@ GIT_BRANCH="jellyfin" LICENSE_FILE="COPYING.GPLv3" [[ $TARGET == linux* ]] && FF_CONFIGURE+=" --disable-libxcb --disable-xlib" || true -[[ $TARGET == win* ]] && FF_CFLAGS+=" -Wa,-muse-unaligned-vector-move" || true -[[ $TARGET == win* ]] && FF_CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" || true diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 7181d9b1446..68e808506cc 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -10,6 +10,23 @@ Index: FFmpeg/configure tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" tonemap_opencl_filter_deps="opencl const_nan" tonemap_videotoolbox_filter_deps="metal corevideo videotoolbox const_nan" +@@ -7295,6 +7296,16 @@ elif enabled gcc; then + check_cflags -mpreferred-stack-boundary=4 + ;; + esac ++ elif enabled x86_64; then ++ case $target_os in ++ mingw64*|win64|cygwin*) ++ # GCC on Windows cannot guarantee a 32-byte aligned stack ++ # Such alignment is required by certain AVX instructions ++ # Force GCC to use the unaligned equivalents instead ++ check_cflags -Wa,-muse-unaligned-vector-move ++ check_cxxflags -Wa,-muse-unaligned-vector-move ++ ;; ++ esac + fi + elif enabled llvm_gcc; then + check_cflags -mllvm -stack-alignment=16 Index: FFmpeg/libavfilter/allfilters.c =================================================================== --- FFmpeg.orig/libavfilter/allfilters.c diff --git a/docker-build-win64.sh b/docker-build-win64.sh index eb63a6f39c9..2819658872b 100755 --- a/docker-build-win64.sh +++ b/docker-build-win64.sh @@ -588,9 +588,6 @@ ffversion="$(dpkg-parsechangelog --show-field Version)" if [[ -f "patches/series" ]]; then quilt push -a fi -# Workaround for GCC bug causing misaligned AVX instructions -CFLAGS+=" -Wa,-muse-unaligned-vector-move" -CXXFLAGS+=" -Wa,-muse-unaligned-vector-move" ./configure \ --prefix=${FF_PREFIX} \ 
${FF_TARGET_FLAGS} \ From 5c9a0907b37b6344e2a500c0ab2ef742ebee124b Mon Sep 17 00:00:00 2001 From: gnattu Date: Mon, 1 Jul 2024 16:50:47 +0800 Subject: [PATCH 24/27] avfilter/tonemapx: fix more indent --- debian/patches/0080-add-tonemapx-filter.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 68e808506cc..b8f5786020e 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -16,7 +16,7 @@ Index: FFmpeg/configure esac + elif enabled x86_64; then + case $target_os in -+ mingw64*|win64|cygwin*) ++ mingw*|win*|cygwin*) + # GCC on Windows cannot guarantee a 32-byte aligned stack + # Such alignment is required by certain AVX instructions + # Force GCC to use the unaligned equivalents instead @@ -5258,11 +5258,11 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +} + +X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; From 943263936c5585b62034f19449e7d7cbf753ccea Mon Sep 17 00:00:00 2001 From: gnattu Date: Wed, 3 Jul 2024 12:03:54 +0800 Subject: [PATCH 25/27] avfilter/tonemapx: compiler config improvements Co-authored-by: Nyanmisaka --- debian/patches/0080-add-tonemapx-filter.patch | 7956 +++++++++-------- 1 file changed, 4028 insertions(+), 3928 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index b8f5786020e..ec7516f4eb2 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -1,8 +1,29 @@ -Index: FFmpeg/configure +Index: jellyfin-ffmpeg/configure =================================================================== ---- FFmpeg.orig/configure -+++ FFmpeg/configure -@@ -3772,6 +3772,7 @@ tinterlace_filter_deps="gpl" +--- jellyfin-ffmpeg.orig/configure ++++ jellyfin-ffmpeg/configure +@@ -2211,6 +2211,9 @@ HEADERS_LIST=" + + INTRINSICS_LIST=" + intrinsics_neon ++ intrinsics_sse42 ++ intrinsics_avx2 ++ intrinsics_fma3 + " + + MATH_FUNCS=" +@@ -2676,6 +2679,10 @@ avx2_deps="avx" + avx512_deps="avx2" + avx512icl_deps="avx512" + ++intrinsics_sse42_deps="sse42" ++intrinsics_fma3_deps="fma3" ++intrinsics_avx2_deps="avx2" ++ + mmx_external_deps="x86asm" + mmx_inline_deps="inline_asm x86" + mmx_suggest="mmx_external mmx_inline" +@@ -3772,6 +3779,7 @@ tinterlace_filter_deps="gpl" tinterlace_merge_test_deps="tinterlace_filter" tinterlace_pad_test_deps="tinterlace_filter" tonemap_filter_deps="const_nan" @@ -10,7 +31,27 @@ Index: FFmpeg/configure tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" tonemap_opencl_filter_deps="opencl const_nan" tonemap_videotoolbox_filter_deps="metal corevideo videotoolbox const_nan" -@@ -7295,6 +7296,16 @@ elif enabled gcc; then +@@ -6230,6 +6238,19 @@ fi + + check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)" + ++disable intrinsics_sse42 && test_cc -msse4.2 < ++int main(void) { __m128i t = _mm_cmpgt_epi64(_mm_setzero_si128(), 
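/*
 * A minimal sketch (not part of the patch) of the distinction behind the
 * -Wa,-muse-unaligned-vector-move workaround above for MinGW GCC: aligned
 * 256-bit moves (vmovaps) fault when the address is not 32-byte aligned,
 * while the unaligned forms (vmovups) accept any address.  Since GCC on
 * Windows cannot guarantee a 32-byte aligned stack, ymm spills emitted with
 * the aligned form can crash; the assembler flag rewrites them to the
 * unaligned form.  Compile with AVX enabled; the "example_" name is
 * hypothetical.
 */
#include <immintrin.h>

__attribute__((target("avx")))
static __m256 example_load8(const float *p)
{
    /* safe for any alignment of p */
    return _mm256_loadu_ps(p);
    /* _mm256_load_ps(p) would require p to be 32-byte aligned and is the
       kind of access the workaround avoids in compiler-generated spill
       and fill code */
}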
_mm_setzero_si128()); return 0; } ++EOF ++disable intrinsics_fma3 && test_cc -mfma < ++int main(void) { __m256 t = _mm256_fmadd_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps()); return 0; } ++EOF ++disable intrinsics_avx2 && test_cc -mavx2 < ++int main(void) { __m256i t = _mm256_abs_epi32(_mm256_setzero_si256()); return 0; } ++EOF ++ + check_ldflags -Wl,--as-needed + check_ldflags -Wl,-z,noexecstack + +@@ -7295,6 +7316,16 @@ elif enabled gcc; then check_cflags -mpreferred-stack-boundary=4 ;; esac @@ -27,99 +68,10 @@ Index: FFmpeg/configure fi elif enabled llvm_gcc; then check_cflags -mllvm -stack-alignment=16 -Index: FFmpeg/libavfilter/allfilters.c -=================================================================== ---- FFmpeg.orig/libavfilter/allfilters.c -+++ FFmpeg/libavfilter/allfilters.c -@@ -484,6 +484,7 @@ extern const AVFilter ff_vf_tmedian; - extern const AVFilter ff_vf_tmidequalizer; - extern const AVFilter ff_vf_tmix; - extern const AVFilter ff_vf_tonemap; -+extern const AVFilter ff_vf_tonemapx; - extern const AVFilter ff_vf_tonemap_cuda; - extern const AVFilter ff_vf_tonemap_opencl; - extern const AVFilter ff_vf_tonemap_vaapi; -Index: FFmpeg/libavfilter/colorspace.c -=================================================================== ---- FFmpeg.orig/libavfilter/colorspace.c -+++ FFmpeg/libavfilter/colorspace.c -@@ -17,6 +17,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "libavutil/avassert.h" - #include "libavutil/frame.h" - #include "libavutil/mastering_display_metadata.h" - #include "libavutil/pixdesc.h" -@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { - float inverse_eotf_bt1886(float x) { - return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; - } -+ -+int ff_get_range_off(int *off, int *y_rng, int *uv_rng, -+ enum AVColorRange rng, int depth) -+{ -+ switch (rng) { -+ case AVCOL_RANGE_UNSPECIFIED: -+ case AVCOL_RANGE_MPEG: -+ *off = 16 << (depth - 8); -+ *y_rng = 219 << (depth - 8); -+ *uv_rng = 224 << (depth - 8); -+ break; -+ case AVCOL_RANGE_JPEG: -+ *off = 0; -+ *y_rng = *uv_rng = (256 << (depth - 8)) - 1; -+ break; -+ default: -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb) -+{ -+#define N (yuv2rgb ? m : n) -+#define M (yuv2rgb ? n : m) -+ int rng, n, m, o; -+ int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); -+ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { -+ for (m = 0; m < 3; m++) { -+ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? 
rng : 28672)); -+ for (o = 1; o < 8; o++) -+ out[N][M][o] = out[N][M][0]; -+ } -+ } -+#undef N -+#undef M -+ -+ if (yuv2rgb) { -+ av_assert2(out[0][1][0] == 0); -+ av_assert2(out[2][2][0] == 0); -+ av_assert2(out[0][0][0] == out[1][0][0]); -+ av_assert2(out[0][0][0] == out[2][0][0]); -+ } else { -+ av_assert2(out[1][2][0] == out[2][0][0]); -+ } -+} -Index: FFmpeg/libavfilter/colorspace.h -=================================================================== ---- FFmpeg.orig/libavfilter/colorspace.h -+++ FFmpeg/libavfilter/colorspace.h -@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); - float inverse_eotf_arib_b67(float x); - float inverse_eotf_bt1886(float x); - -+int ff_get_range_off(int *off, int *y_rng, int *uv_rng, -+ enum AVColorRange rng, int depth); -+void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], -+ int depth, int y_rng, int uv_rng, int yuv2rgb); - #endif -Index: FFmpeg/libavfilter/Makefile +Index: jellyfin-ffmpeg/libavfilter/Makefile =================================================================== ---- FFmpeg.orig/libavfilter/Makefile -+++ FFmpeg/libavfilter/Makefile +--- jellyfin-ffmpeg.orig/libavfilter/Makefile ++++ jellyfin-ffmpeg/libavfilter/Makefile @@ -516,6 +516,7 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o @@ -128,12 +80,23 @@ Index: FFmpeg/libavfilter/Makefile OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ opencl/tonemap.o opencl/colorspace_common.o OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o cuda/tonemap.ptx.o \ -Index: FFmpeg/libavfilter/vf_tonemapx.c +Index: jellyfin-ffmpeg/libavfilter/aarch64/Makefile +=================================================================== +--- jellyfin-ffmpeg.orig/libavfilter/aarch64/Makefile ++++ jellyfin-ffmpeg/libavfilter/aarch64/Makefile +@@ -1,3 +1,4 @@ + OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o ++OBJS-$(CONFIG_TONEMAPX_FILTER) += aarch64/vf_tonemapx_intrin_neon.o + + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o +Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/vf_tonemapx.c -@@ -0,0 +1,1263 @@ ++++ jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +@@ -0,0 +1,1229 @@ +/* ++ * Copyright (c) 2024 Gnattu OC ++ * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or @@ -151,426 +114,466 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+/** -+ * @file -+ * tonemap algorithms -+ */ -+ -+#include -+#include -+ -+#include "libavutil/avassert.h" -+#include "libavutil/imgutils.h" -+#include "libavutil/internal.h" -+#include "libavutil/mem_internal.h" -+#include "libavutil/opt.h" -+#include "libavutil/cpu.h" ++#include "vf_tonemapx_intrin_neon.h" + -+#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 10) || (__clang_major__ >= 11) -+# define ENABLE_TONEMAPX_INTRINSICS -+# endif // (__GNUC__ >= 10) || (__clang_major__ >= 11) -+#endif // defined(__GNUC__) || defined(__clang__) ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++# include ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS + -+#ifdef ENABLE_TONEMAPX_INTRINSICS -+# if ARCH_AARCH64 -+# if (HAVE_NEON) && (HAVE_INTRINSICS_NEON) -+# include "libavutil/aarch64/cpu.h" -+# include "aarch64/vf_tonemapx_intrin_neon.h" -+# else -+# undef ENABLE_TONEMAPX_INTRINSICS -+# endif -+# endif // ARCH_AARCH64 -+# if ARCH_X86 -+# include "libavutil/x86/cpu.h" -+# if (HAVE_SSE42) -+# include "x86/vf_tonemapx_intrin_sse.h" -+# define ENABLE_TONEMAPX_SSE_INTRINSICS -+# endif -+# if (HAVE_AVX2) && (HAVE_FMA3) -+# include "x86/vf_tonemapx_intrin_avx.h" -+# define ENABLE_TONEMAPX_AVX_INTRINSICS -+# endif -+# endif // ARCH_X86 -+#endif // ENABLE_TONEMAPX_INTRINSICS ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) ++{ ++ int16x8_t sig8; ++ float32x4_t mapvalx4a; ++ float32x4_t mapvalx4b; ++ float32x4_t r_linx4a; ++ float32x4_t r_linx4b; ++ float32x4_t g_linx4a; ++ float32x4_t g_linx4b; ++ float32x4_t b_linx4a; ++ float32x4_t b_linx4b; ++ float32x4_t offset = vdupq_n_f32(0.5f); ++ int32x4_t output_upper_bound = vdupq_n_s32(32767); ++ int32x4_t zerox4 = vdupq_n_s32(0); ++ int16x8_t input_lut_offset = vdupq_n_s16(2048); ++ int16x8_t input_upper_bound = vdupq_n_s16(32767); ++ int16x8_t r, g, b; ++ int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; + ++ float mapval4a[4], mapval4b[4], r_lin4a[4], r_lin4b[4], g_lin4a[4], g_lin4b[4], b_lin4a[4], b_lin4b[4]; + -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+#include "vf_tonemapx.h" ++ r = vreinterpretq_s16_u16(r_in); ++ g = vreinterpretq_s16_u16(g_in); ++ b = vreinterpretq_s16_u16(b_in); + -+enum TonemapAlgorithm { -+ TONEMAP_NONE, -+ TONEMAP_LINEAR, -+ TONEMAP_GAMMA, -+ TONEMAP_CLIP, -+ TONEMAP_REINHARD, -+ TONEMAP_HABLE, -+ TONEMAP_MOBIUS, -+ TONEMAP_BT2390, -+ TONEMAP_MAX, -+}; ++ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); ++ sig8 = vaddq_s16(sig8, input_lut_offset); ++ sig8 = vminq_s16(sig8, input_upper_bound); ++ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); + -+typedef struct TonemapxContext { -+ const AVClass *class; ++ r = vaddq_s16(r, input_lut_offset); ++ r = vminq_s16(r, input_upper_bound); ++ r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); ++ g = vaddq_s16(g, input_lut_offset); ++ g = vminq_s16(g, input_upper_bound); ++ g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); ++ b = vaddq_s16(b, input_lut_offset); ++ b = vminq_s16(b, input_upper_bound); ++ b 
= vmaxq_s16(b, vreinterpretq_s16_s32(zerox4)); + -+ enum TonemapAlgorithm tonemap; -+ enum AVColorTransferCharacteristic trc; -+ enum AVColorSpace spc; -+ enum AVColorPrimaries pri; -+ enum AVColorRange range; -+ enum AVPixelFormat format; -+ char *format_str; -+ double param; -+ double desat; -+ double peak; ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ ++mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ ++r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ ++r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ ++g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ ++g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ ++b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ ++b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; + -+ const AVLumaCoefficients *coeffs, *ocoeffs; ++ LOAD_LUT(0) ++ LOAD_LUT(1) ++ LOAD_LUT(2) ++ LOAD_LUT(3) + -+ double lut_peak; -+ float *lin_lut; -+ float *tonemap_lut; -+ uint16_t *delin_lut; -+ int in_yuv_off, out_yuv_off; ++#undef LOAD_LUT + -+ DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; -+ DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; -+ DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; ++ mapvalx4a = vld1q_f32(mapval4a); ++ mapvalx4b = vld1q_f32(mapval4b); ++ r_linx4a = vld1q_f32(r_lin4a); ++ r_linx4b = vld1q_f32(r_lin4b); ++ g_linx4a = vld1q_f32(g_lin4a); ++ g_linx4b = vld1q_f32(g_lin4b); ++ b_linx4a = vld1q_f32(b_lin4a); ++ b_linx4b = vld1q_f32(b_lin4b); + -+ int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); ++ if (!rgb2rgb_passthrough) { ++ r_linx4a = vmulq_n_f32(r_linx4a, (float)(*rgb2rgb)[0][0]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, g_linx4a, (float)(*rgb2rgb)[0][1]); ++ r_linx4a = vfmaq_n_f32(r_linx4a, b_linx4a, (float)(*rgb2rgb)[0][2]); ++ r_linx4b = vmulq_n_f32(r_linx4b, (float)(*rgb2rgb)[0][0]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, g_linx4b, (float)(*rgb2rgb)[0][1]); ++ r_linx4b = vfmaq_n_f32(r_linx4b, b_linx4b, (float)(*rgb2rgb)[0][2]); + -+ void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ g_linx4a = vmulq_n_f32(g_linx4a, (float)(*rgb2rgb)[1][1]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, r_linx4a, (float)(*rgb2rgb)[1][0]); ++ g_linx4a = vfmaq_n_f32(g_linx4a, b_linx4a, (float)(*rgb2rgb)[1][2]); ++ g_linx4b = vmulq_n_f32(g_linx4b, (float)(*rgb2rgb)[1][1]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, r_linx4b, (float)(*rgb2rgb)[1][0]); ++ g_linx4b = vfmaq_n_f32(g_linx4b, b_linx4b, (float)(*rgb2rgb)[1][2]); + -+ void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ b_linx4a = vmulq_n_f32(b_linx4a, (float)(*rgb2rgb)[2][2]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, r_linx4a, (float)(*rgb2rgb)[2][0]); ++ b_linx4a = vfmaq_n_f32(b_linx4a, g_linx4a, (float)(*rgb2rgb)[2][1]); ++ b_linx4b = vmulq_n_f32(b_linx4b, (float)(*rgb2rgb)[2][2]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, r_linx4b, (float)(*rgb2rgb)[2][0]); ++ b_linx4b = vfmaq_n_f32(b_linx4b, g_linx4b, (float)(*rgb2rgb)[2][1]); ++ } + -+ void 
(*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++ if (desat > 0) { ++ float32x4_t eps_x4 = vdupq_n_f32(FLOAT_EPS); ++ float32x4_t desat4 = vdupq_n_f32((float)desat); ++ float32x4_t luma4 = vdupq_n_f32(0); ++ float32x4_t overbright4; ++ // Group A ++ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4); ++ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4); ++ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4); ++ g_linx4a = vmlaq_f32(g_linx4a, luma4, overbright4); ++ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4); ++ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4); ++ // Group B ++ luma4 = vdupq_n_f32(0); ++ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr)); ++ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg)); ++ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb)); ++ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); ++ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4); ++ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4); ++ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4); ++ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4); ++ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4); ++ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4); ++ } + -+ void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+} TonemapxContext; ++ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a); ++ g_linx4a = vmulq_f32(g_linx4a, mapvalx4a); ++ b_linx4a = vmulq_f32(b_linx4a, mapvalx4a); + -+typedef struct ThreadData { -+ AVFrame *in, *out; -+ const AVPixFmtDescriptor *desc, *odesc; -+ double peak; -+} ThreadData; ++ r_linx4b = vmulq_f32(r_linx4b, mapvalx4b); ++ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); ++ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); + -+static const enum AVPixelFormat in_pix_fmts[] = { -+ AV_PIX_FMT_YUV420P10, -+ AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, -+ AV_PIX_FMT_NONE, -+}; ++ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767); ++ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767); ++ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767); ++ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767); ++ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767); ++ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767); + -+static const enum AVPixelFormat out_pix_fmts[] = { -+ AV_PIX_FMT_YUV420P, -+ AV_PIX_FMT_YUV420P10, -+ AV_PIX_FMT_NV12, -+ AV_PIX_FMT_P010, -+ AV_PIX_FMT_P016, -+}; ++ rx4a = vcvtq_s32_f32(r_linx4a); ++ rx4a = vminq_s32(rx4a, output_upper_bound); ++ rx4a = vmaxq_s32(rx4a, zerox4); ++ gx4a = vcvtq_s32_f32(g_linx4a); ++ gx4a = vminq_s32(gx4a, output_upper_bound); ++ gx4a = vmaxq_s32(gx4a, zerox4); ++ bx4a = vcvtq_s32_f32(b_linx4a); ++ bx4a = vminq_s32(bx4a, output_upper_bound); ++ bx4a = vmaxq_s32(bx4a, zerox4); ++ rx4b = vcvtq_s32_f32(r_linx4b); ++ rx4b = 
vminq_s32(rx4b, output_upper_bound); ++ rx4b = vmaxq_s32(rx4b, zerox4); ++ gx4b = vcvtq_s32_f32(g_linx4b); ++ gx4b = vminq_s32(gx4b, output_upper_bound); ++ gx4b = vmaxq_s32(gx4b, zerox4); ++ bx4b = vcvtq_s32_f32(b_linx4b); ++ bx4b = vminq_s32(bx4b, output_upper_bound); ++ bx4b = vmaxq_s32(bx4b, zerox4); + -+static int out_format_is_supported(enum AVPixelFormat fmt) -+{ -+ int i; ++ r_out[0] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 0)]; ++ r_out[1] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 1)]; ++ r_out[2] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 0)]; ++ r_out[3] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 1)]; ++ r_out[4] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 0)]; ++ r_out[5] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 1)]; ++ r_out[6] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 0)]; ++ r_out[7] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 1)]; + -+ for (i = 0; i < FF_ARRAY_ELEMS(out_pix_fmts); i++) -+ if (out_pix_fmts[i] == fmt) -+ return 1; -+ return 0; -+} ++ g_out[0] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 0)]; ++ g_out[1] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 1)]; ++ g_out[2] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 0)]; ++ g_out[3] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 1)]; ++ g_out[4] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 0)]; ++ g_out[5] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 1)]; ++ g_out[6] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 0)]; ++ g_out[7] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 1)]; + -+static float hable(float in) -+{ -+ float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; -+ return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; ++ b_out[0] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 0)]; ++ b_out[1] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 1)]; ++ b_out[2] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 0)]; ++ b_out[3] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 1)]; ++ b_out[4] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 0)]; ++ b_out[5] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 1)]; ++ b_out[6] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 0)]; ++ b_out[7] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 1)]; +} ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS + -+static float mobius(float in, float j, double peak) ++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ float a, b; ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstu = dstu; ++ uint8_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; + -+ if (in <= j) -+ return in; ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); -+ b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ return (b * b + 2.0f * b * j + j * j) / (b 
- a) * (in + a) / (in + b); -+} ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+static float bt2390(float s, float peak) -+{ -+ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); -+ float scale = 1.0f / peak_pq; ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ // SDR peak -+ float dst_peak = 1.0f; -+ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; -+ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; ++ uint16x4_t ux4, vx4; + -+ float ks = 1.5f * maxLum - 0.5f; -+ float tb = (s_pq - ks) / (1.0f - ks); -+ float tb2 = tb * tb; -+ float tb3 = tb2 * tb; -+ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + -+ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + -+ (-2.0f * tb3 + 3.0f * tb2) * maxLum; -+ float sig = (s_pq < ks) ? s_pq : pb; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; + -+ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); -+} ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) -+{ -+ switch(alg) { -+ default: -+ case TONEMAP_NONE: -+ // do nothing -+ break; -+ case TONEMAP_LINEAR: -+ sig = sig * param / peak; -+ break; -+ case TONEMAP_GAMMA: -+ sig = sig > 0.05f -+ ? 
pow(sig / peak, 1.0f / param) -+ : sig * pow(0.05f / peak, 1.0f / param) / 0.05f; -+ break; -+ case TONEMAP_CLIP: -+ sig = av_clipf(sig * param, 0, 1.0f); -+ break; -+ case TONEMAP_HABLE: -+ sig = hable(sig) / hable(peak); -+ break; -+ case TONEMAP_REINHARD: -+ sig = sig / (sig + param) * (peak + param) / peak; -+ break; -+ case TONEMAP_MOBIUS: -+ sig = mobius(sig, param, peak); -+ break; -+ case TONEMAP_BT2390: -+ sig = bt2390(sig, peak); -+ break; -+ } ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); + -+ return sig; -+} -+ -+static float linearize(float x, enum AVColorTransferCharacteristic trc_src) -+{ -+ if (trc_src == AVCOL_TRC_SMPTE2084) -+ return eotf_st2084(x, REFERENCE_WHITE_ALT); -+ else if (trc_src == AVCOL_TRC_ARIB_STD_B67) -+ return eotf_arib_b67(x); -+ else -+ return x; -+} -+ -+static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst) -+{ -+ if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10) -+ return inverse_eotf_bt1886(x); -+ else -+ return x; -+} -+ -+static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, -+ enum AVColorTransferCharacteristic trc_dst) -+{ -+ int i; ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) -+ return AVERROR(ENOMEM); -+ if (!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t)))) -+ return AVERROR(ENOMEM); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ for (i = 0; i < 32768; i++) { -+ double v1 = (i - 2048.0f) / 28672.0f; -+ double v2 = i / 32767.0f; -+ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); -+ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); -+ } ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ return 0; -+} ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) -+{ -+ int i; -+ double peak = s->lut_peak; ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) -+ return AVERROR(ENOMEM); ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ for (i = 0; i < 32768; i++) { -+ double v = (i - 2048.0f) / 28672.0f; -+ double sig = linearize(v, trc_src); -+ float mapped = mapsig(s->tonemap, sig, peak, s->param); -+ s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? 
mapped / sig : 0.0f; -+ } ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ return 0; -+} ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+static int compute_yuv_coeffs(TonemapxContext *s, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, -+ const AVPixFmtDescriptor *idesc, -+ const AVPixFmtDescriptor *odesc, -+ enum AVColorRange irng, -+ enum AVColorRange orng) -+{ -+ double rgb2yuv[3][3], yuv2rgb[3][3]; -+ int res; -+ int y_rng, uv_rng; ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng, -+ irng, idesc->comp[0].depth); -+ if (res < 0) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported input color range %d (%s)\n", -+ irng, av_color_range_name(irng)); -+ return res; -+ } ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ ff_fill_rgb2yuv_table(coeffs, rgb2yuv); -+ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); -+ ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, -+ y_rng, uv_rng, 1); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, -+ orng, odesc->comp[0].depth); -+ if (res < 0) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported output color range %d (%s)\n", -+ orng, av_color_range_name(orng)); -+ return res; -+ } ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); + -+ ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, -+ y_rng, uv_rng, 0); ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + -+ return 0; -+} ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+static int compute_rgb_coeffs(TonemapxContext *s, -+ enum AVColorPrimaries iprm, -+ enum AVColorPrimaries oprm) -+{ -+ double rgb2xyz[3][3], xyz2rgb[3][3]; -+ const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm); -+ const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ if (!iprm_desc) { -+ av_log(s, AV_LOG_ERROR, -+ "Unsupported input color primaries %d (%s)\n", -+ iprm, av_color_primaries_name(iprm)); -+ return AVERROR(EINVAL); -+ } -+ if (!oprm_desc) { -+ av_log(s, 
AV_LOG_ERROR, -+ "Unsupported output color primaries %d (%s)\n", -+ oprm, av_color_primaries_name(oprm)); -+ return AVERROR(EINVAL); -+ } ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); -+ ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); -+ ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); -+ ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); + -+ return 0; -+} ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); + -+static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ int16_t sig; -+ float mapval, r_lin, g_lin, b_lin; ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); + -+ /* load values */ -+ *r_out = r_in; -+ *g_out = g_in; -+ *b_out = b_in; ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ /* pick the brightest component, reducing the value range as necessary -+ * to keep the entire signal in range and preventing discoloration due to -+ * out-of-bounds clipping */ -+ sig = FFMAX3(r_in, g_in, b_in); ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; -+ g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)]; -+ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); 
++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_lane_u32((uint32_t *) &dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); + -+ if (!rgb2rgb_passthrough) { -+ r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; -+ g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin; -+ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ vst1_lane_u32((uint32_t *) &dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); ++ } + } + -+#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) -+ /* desaturate to prevent unnatural colors */ -+ if (desat > 0) { -+ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; -+ float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); -+ r_lin = MIX(r_lin, luma, overbright); -+ g_lin = MIX(g_lin, luma, overbright); -+ b_lin = MIX(b_lin, luma, overbright); ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } -+ -+ r_lin *= mapval; -+ g_lin *= mapval; -+ b_lin *= mapval; -+#undef MIX -+ -+ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; -+ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; -+ *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+// See also libavfilter/colorspacedsp_template.c -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint8_t *rdsty = dsty; ++ uint8_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); @@ -592,171 +595,238 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ int16_t 
r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ int16_t r[4], g[4], b[4]; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ int16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ int16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; -+ -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = 
vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); -+#undef AVG -+ } -+ } -+} ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; 
-+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ int16_t r[4], g[4], b[4]; -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ y0oax4 = vshrq_n_s32(y0oax4, 21); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 21); ++ y0obx4 = vaddq_s32(y0obx4, 
out_yuv_offx4); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); ++ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); -+ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); -+ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); -+ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); -+#undef AVG ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 21); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 21); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); ++ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = 
vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 21); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 21); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); ++ ++ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); + } + } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} + -+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstu = dstu; ++ uint16_t *rdstv = dstv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcu = srcu; ++ const uint16_t *rsrcv = srcv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; @@ -782,84 +852,229 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t 
cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x4_t ux4, vx4; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ int16_t r[4], g[4], b[4]; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] ) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] ) - params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; -+ int u = (srcu[x >> 1]) - in_uv_offset; -+ int v = (srcv[x >> 1]) - in_uv_offset; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ ux4 = vld1_u16(srcu + (x >> 1)); ++ vx4 = vld1_u16(srcv + (x >> 1)); ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); ++ ux8 = vsubq_u16(ux8, in_uv_offx8); ++ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); ++ vx8 = vsubq_u16(vx8, in_uv_offx8); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, 
params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); -+ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); -+ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); -+#undef AVG -+ } -+ } -+} ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); -+ const int in_sh2 = 16 - in_depth; ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); ++ ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = 
vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ vst1q_u16(&dsty[x], y0ox8); ++ ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); ++ ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ uox4 = vshrq_n_s32(uox4, 19); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); ++ ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ vox4 = vshrq_n_s32(vox4, 19); ++ vox4 = vaddq_s32(vox4, 
out_uv_offsetx4); ++ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); ++ } ++ } ++ ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstu += offset >> 1; ++ rdstv += offset >> 1; ++ rsrcy += offset; ++ rsrcu += offset >> 1; ++ rsrcv += offset >> 1; ++ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, ++ rsrcy, rsrcu, rsrcv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); ++ } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++} ++ ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ uint16_t *rdsty = dsty; ++ uint16_t *rdstuv = dstuv; ++ const uint16_t *rsrcy = srcy; ++ const uint16_t *rsrcuv = srcuv; ++ int rheight = height; ++ // not zero when not divisible by 8 ++ // intentionally leave last pixel emtpy when input is odd ++ int remainw = width & 6; ++ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); ++ ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); + const int out_sh2 = 16 - out_depth; @@ -879,540 +1094,389 @@ Index: FFmpeg/libavfilter/vf_tonemapx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int r00, g00, b00; -+ int r01, g01, b01; -+ int r10, g10, b10; -+ int r11, g11, b11; ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; ++ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); ++ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); ++ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); ++ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); ++ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); ++ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); ++ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); ++ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); ++ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); ++ uint16x8_t uvx8; ++ uint16x4_t ux2a, vx2a, ux2b, vx2b; ++ uint16x8_t y0x8, y1x8, ux8, vx8; ++ uint16x8_t r0x8, g0x8, b0x8; ++ uint16x8_t r1x8, g1x8, b1x8; + -+ int16_t r[4], g[4], b[4]; ++ int16x8_t r0ox8, g0ox8, b0ox8; ++ uint16x8_t y0ox8; ++ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; ++ int32x4_t y0oax4, y0obx4; ++ ++ int16x8_t r1ox8, g1ox8, b1ox8; ++ uint16x8_t y1ox8; ++ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ int32x4_t y1oax4, y1obx4; ++ int32x4_t uvoax4, uvobx4; ++ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; ++ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; ++ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); ++ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); ++ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); ++ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); ++ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int x = 0; x < width; x += 2) { -+ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; -+ int y01 = (srcy[x + 1] >> in_sh2) 
- params->in_yuv_off; -+ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; -+ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; -+ int u = (srcuv[x] >> in_sh2) - in_uv_offset; -+ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); -+ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); -+ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); -+ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); ++ y0x8 = vld1q_u16(srcy + x); ++ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); ++ uvx8 = vld1q_u16(srcuv + x); ++ if (in_depth == 10) { ++ // shift to low10bits for 10bit input ++ // shift bit has to be compile-time constant ++ y0x8 = vshrq_n_u16(y0x8, 6); ++ y1x8 = vshrq_n_u16(y1x8, 6); ++ uvx8 = vshrq_n_u16(uvx8, 6); ++ } ++ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); ++ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ uvx8 = vsubq_u16(uvx8, in_uv_offx8); + -+ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); ++ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); ++ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); ++ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); + -+ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); -+ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); -+ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); -+ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); ++ ux8 = vcombine_u16(ux2a, ux2b); ++ vx8 = vcombine_u16(vx2a, vx2b); + -+ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); -+ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); ++ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); ++ r0x8 = vaddq_u16(r0x8, rndx8); + -+ r00 = r[0], g00 = g[0], b00 = b[0]; -+ r01 = r[1], g01 = g[1], b01 = b[1]; -+ r10 = r[2], g10 = g[2], b10 = b[2]; -+ r11 = r[3], g11 = g[3], b11 = b[3]; ++ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); ++ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); ++ g0x8 = vaddq_u16(g0x8, rndx8); + -+ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[x + 1] = 
av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); -+ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); ++ b0x8 = vaddq_u16(b0x8, rndx8); + -+#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) -+ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); -+ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); -+#undef AVG -+ } -+ } -+} -+ -+#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ -+ThreadData *td = arg; \ -+AVFrame *in = td->in; \ -+AVFrame *out = td->out; \ -+const AVPixFmtDescriptor *desc = td->desc; \ -+const AVPixFmtDescriptor *odesc = td->odesc; \ -+const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ -+const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ -+const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ -+TonemapIntParams params = { \ -+.lut_peak = s->lut_peak, \ -+.lin_lut = s->lin_lut, \ -+.tonemap_lut = s->tonemap_lut, \ -+.delin_lut = s->delin_lut, \ -+.in_yuv_off = s->in_yuv_off, \ -+.out_yuv_off = s->out_yuv_off, \ -+.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ -+.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ -+.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ -+.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ -+.coeffs = s->coeffs, \ -+.ocoeffs = s->ocoeffs, \ -+.desat = s->desat, \ -+}; ++ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); ++ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); ++ r1x8 = vaddq_u16(r1x8, rndx8); + -+static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); ++ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); ++ g1x8 = vaddq_u16(g1x8, rndx8); + -+ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); ++ b1x8 = vaddq_u16(b1x8, rndx8); + -+ return 0; -+} ++ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); ++ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ 
params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, ++ params->rgb2rgb_passthrough); + -+static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ r0ox8 = vld1q_s16(r); ++ g0ox8 = vld1q_s16(g); ++ b0ox8 = vld1q_s16(b); + -+ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, -+ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); ++ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); ++ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); + -+ return 0; -+} ++ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); ++ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); ++ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); + -+static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ y0oax4 = vmulq_n_s32(r0oax4, cry); ++ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); ++ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); ++ y0oax4 = vaddq_s32(y0oax4, out_rndx4); + -+ s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), -+ (uint16_t *) (out->data[1] + -+ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), -+ (uint16_t *) (out->data[2] + -+ out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ y0obx4 = vmulq_n_s32(r0obx4, cry); ++ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); ++ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); ++ y0obx4 = vaddq_s32(y0obx4, out_rndx4); + -+ return 0; -+} ++ r1ox8 = vld1q_s16(r1); ++ g1ox8 = vld1q_s16(g1); ++ b1ox8 = vld1q_s16(b1); + -+static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) -+{ -+ LOAD_TONEMAP_PARAMS -+ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); ++ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); ++ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); ++ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); + -+ s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), -+ (uint16_t *) (out->data[1] + -+ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), -+ (void*)(in->data[0] + in->linesize[0] * slice_start), -+ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), -+ out->linesize, in->linesize, -+ odesc->comp[0].depth, desc->comp[0].depth, -+ out->width, slice_end - slice_start, -+ ¶ms); ++ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); ++ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); ++ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); + -+ return 0; -+} ++ y1oax4 = vmulq_n_s32(r1oax4, cry); ++ y1oax4 = 
vmlaq_n_s32(y1oax4, g1oax4, cgy); ++ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); ++ y1oax4 = vaddq_s32(y1oax4, out_rndx4); + -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterContext *ctx = link->dst; -+ TonemapxContext *s = ctx->priv; -+ AVFilterLink *outlink = ctx->outputs[0]; -+ AVFrame *out; -+ const AVPixFmtDescriptor *desc; -+ const AVPixFmtDescriptor *odesc; -+ int ret; -+ double peak = s->peak; -+ const AVLumaCoefficients *coeffs; -+ ThreadData td; ++ y1obx4 = vmulq_n_s32(r1obx4, cry); ++ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); ++ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); ++ y1obx4 = vaddq_s32(y1obx4, out_rndx4); + -+ desc = av_pix_fmt_desc_get(link->format); -+ odesc = av_pix_fmt_desc_get(outlink->format); -+ if (!desc || !odesc) { -+ av_frame_free(&in); -+ return AVERROR_BUG; -+ } ++ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); ++ ravgx4 = vcombine_s32(ravgax2, ravgbx2); ++ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); ++ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); ++ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); ++ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); ++ ravgx4 = vshrq_n_s32(ravgx4, 2); + -+ switch (odesc->comp[2].plane) { -+ case 1: // biplanar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_biplanar8; -+ } else { -+ s->filter_slice = filter_slice_biplanar10; -+ } -+ break; -+ default: -+ case 2: // planar -+ if (odesc->comp[0].depth == 8) { -+ s->filter_slice = filter_slice_planar8; -+ } else { -+ s->filter_slice = filter_slice_planar10; -+ } -+ break; -+ } ++ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); ++ gavgx4 = vcombine_s32(gavgax2, gavgbx2); ++ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); ++ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); ++ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); ++ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); ++ gavgx4 = vshrq_n_s32(gavgx4, 2); + -+ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); -+ if (!out) { -+ av_frame_free(&in); -+ return AVERROR(ENOMEM); -+ } ++ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); ++ bavgx4 = vcombine_s32(bavgax2, bavgbx2); ++ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); ++ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); ++ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); ++ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); ++ bavgx4 = vshrq_n_s32(bavgx4, 2); + -+ if ((ret = av_frame_copy_props(out, in)) < 0) -+ goto fail; ++ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); ++ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); ++ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); + -+ /* read peak from side data if not passed in */ -+ if (!peak) { -+ peak = ff_determine_signal_peak(in); -+ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); -+ } ++ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); ++ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); ++ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); + -+ out->color_trc = s->trc; -+ out->colorspace = s->spc; -+ out->color_primaries = s->pri; -+ out->color_range = s->range; ++ switch(out_depth) { ++ default: ++ case 10: ++ y0oax4 = vshrq_n_s32(y0oax4, 19); ++ y0obx4 = vshrq_n_s32(y0obx4, 19); ++ y1oax4 = vshrq_n_s32(y1oax4, 19); 
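++                // The rgb2yuv coefficients are scaled by 1 << (29 - depth) in
++                // ff_get_yuv_coeffs, so out_sh is 29 - out_depth: 19 for 10-bit
++                // output and 13 for 16-bit output. vshrq_n_s32 requires an
++                // immediate shift count, hence the switch on out_depth instead
++                // of a variable shift.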
++ y1obx4 = vshrq_n_s32(y1obx4, 19); ++ uox4 = vshrq_n_s32(uox4, 19); ++ vox4 = vshrq_n_s32(vox4, 19); ++ break; ++ case 16: ++ y0oax4 = vshrq_n_s32(y0oax4, 13); ++ y0obx4 = vshrq_n_s32(y0obx4, 13); ++ y1oax4 = vshrq_n_s32(y1oax4, 13); ++ y1obx4 = vshrq_n_s32(y1obx4, 13); ++ uox4 = vshrq_n_s32(uox4, 13); ++ vox4 = vshrq_n_s32(vox4, 13); ++ break; ++ } + -+ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) -+ in->color_trc = AVCOL_TRC_SMPTE2084; -+ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) -+ out->color_trc = AVCOL_TRC_BT709; ++ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ vox4 = vaddq_s32(vox4, out_uv_offsetx4); + -+ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) -+ in->colorspace = AVCOL_SPC_BT2020_NCL; -+ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) -+ out->colorspace = AVCOL_SPC_BT709; ++ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); ++ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); ++ vst1q_u16(&dsty[x], y0ox8); + -+ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ in->color_primaries = AVCOL_PRI_BT2020; -+ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) -+ out->color_primaries = AVCOL_PRI_BT709; ++ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); ++ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); ++ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); + -+ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) -+ in->color_range = AVCOL_RANGE_MPEG; -+ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) -+ out->color_range = AVCOL_RANGE_MPEG; ++ uvoax4 = vzip1q_s32(uox4, vox4); ++ uvobx4 = vzip2q_s32(uox4, vox4); + -+ if (!s->lin_lut || !s->delin_lut) { -+ if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) -+ goto fail; ++ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); ++ } + } + -+ if (!s->tonemap_lut || s->lut_peak != peak) { -+ s->lut_peak = peak; -+ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) -+ goto fail; ++ // Process remaining pixels cannot fill the full simd register with scalar version ++ if (remainw) { ++ int offset = width & (int)0xfffffff8; ++ rdsty += offset; ++ rdstuv += offset; ++ rsrcy += offset; ++ rsrcuv += offset; ++ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, ++ rsrcy, rsrcuv, ++ dstlinesize, srclinesize, ++ dstdepth, srcdepth, ++ remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++} +Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +=================================================================== +--- /dev/null ++++ jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +@@ -0,0 +1,54 @@ ++/* ++ * Copyright (c) 2024 Gnattu OC ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); -+ if (s->coeffs != coeffs) { -+ s->coeffs = coeffs; -+ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); -+ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, -+ in->color_range, out->color_range)) < 0) -+ goto fail; -+ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) -+ goto fail; -+ } ++#ifndef AVFILTER_TONEMAPX_INTRIN_NEON_H ++#define AVFILTER_TONEMAPX_INTRIN_NEON_H + -+ /* do the tonemap */ -+ td.in = in; -+ td.out = out; -+ td.desc = desc; -+ td.odesc = odesc; -+ td.peak = peak; -+ ctx->internal->execute(ctx, s->filter_slice, &td, NULL, -+ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); ++#include "libavfilter/vf_tonemapx.h" + -+ av_frame_free(&in); ++void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ ff_update_hdr_metadata(out, peak); ++void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ return ff_filter_frame(outlink, out); -+fail: -+ av_frame_free(&in); -+ av_frame_free(&out); -+ return ret; -+} ++void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+static void uninit(AVFilterContext *ctx) -+{ -+ TonemapxContext *s = ctx->priv; ++void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ av_freep(&s->lin_lut); -+ av_freep(&s->delin_lut); -+ av_freep(&s->tonemap_lut); -+} ++#endif // AVFILTER_TONEMAPX_INTRIN_NEON_H +Index: jellyfin-ffmpeg/libavfilter/allfilters.c +=================================================================== +--- jellyfin-ffmpeg.orig/libavfilter/allfilters.c ++++ jellyfin-ffmpeg/libavfilter/allfilters.c +@@ -484,6 +484,7 @@ extern const AVFilter ff_vf_tmedian; + extern const AVFilter ff_vf_tmidequalizer; + extern const AVFilter ff_vf_tmix; + extern const AVFilter ff_vf_tonemap; ++extern const AVFilter ff_vf_tonemapx; + extern const AVFilter ff_vf_tonemap_cuda; + extern const AVFilter ff_vf_tonemap_opencl; + extern const AVFilter ff_vf_tonemap_vaapi; +Index: jellyfin-ffmpeg/libavfilter/colorspace.c +=================================================================== +--- jellyfin-ffmpeg.orig/libavfilter/colorspace.c ++++ jellyfin-ffmpeg/libavfilter/colorspace.c +@@ -17,6 +17,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "libavutil/avassert.h" + #include 
"libavutil/frame.h" + #include "libavutil/mastering_display_metadata.h" + #include "libavutil/pixdesc.h" +@@ -354,3 +355,51 @@ float inverse_eotf_arib_b67(float x) { + float inverse_eotf_bt1886(float x) { + return x > 0.0f ? powf(x, 1.0f / 2.4f) : 0.0f; + } + -+static int query_formats(AVFilterContext *ctx) ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth) +{ -+ enum AVPixelFormat valid_in_pix_fmts[4]; -+ AVFilterFormats *formats; -+ const AVPixFmtDescriptor *desc; -+ TonemapxContext *s = ctx->priv; -+ -+ if (!strcmp(s->format_str, "same")) { -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ s->format = AV_PIX_FMT_NONE; -+ } else { -+ int i, j = 0; -+ int res; -+ formats = ff_make_format_list(in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ s->format = av_get_pix_fmt(s->format_str); -+ // Check again in case of the string is invalid -+ if (s->format == AV_PIX_FMT_NONE) { -+ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); -+ return AVERROR(EINVAL); -+ } -+ desc = av_pix_fmt_desc_get(s->format); -+ // Filter out the input formats for requested output formats -+ // The input and output must have the same planar format, either planar or bi-planar packed -+ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { -+ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); -+ if (tdesc->comp[2].plane == desc->comp[2].plane) { -+ valid_in_pix_fmts[j] = in_pix_fmts[i]; -+ j++; -+ } -+ } -+ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; -+ formats = ff_make_format_list(valid_in_pix_fmts); -+ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); -+ if (res < 0) -+ return res; -+ if (out_format_is_supported(s->format)) { -+ formats = NULL; -+ res = ff_add_format(&formats, s->format); -+ if (res < 0) -+ return res; -+ } else { -+ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", -+ av_get_pix_fmt_name(s->format)); -+ return AVERROR(ENOSYS); -+ } ++ switch (rng) { ++ case AVCOL_RANGE_UNSPECIFIED: ++ case AVCOL_RANGE_MPEG: ++ *off = 16 << (depth - 8); ++ *y_rng = 219 << (depth - 8); ++ *uv_rng = 224 << (depth - 8); ++ break; ++ case AVCOL_RANGE_JPEG: ++ *off = 0; ++ *y_rng = *uv_rng = (256 << (depth - 8)) - 1; ++ break; ++ default: ++ return AVERROR(EINVAL); + } + -+ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++ return 0; +} + -+static av_cold int init(AVFilterContext *ctx) ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb) +{ -+ TonemapxContext *s = ctx->priv; -+ int cpu_flags = av_get_cpu_flags(); -+ enum SIMDVariant active_simd = SIMD_NONE; -+ av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", -+ s->format_str); -+ -+#ifdef ENABLE_TONEMAPX_INTRINSICS -+#if ARCH_AARCH64 -+ if (have_neon(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; -+ active_simd = SIMD_NEON; -+ } -+#elif ARCH_X86 -+#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS -+ if (X86_SSE42(cpu_flags)) { -+ 
s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; -+ active_simd = SIMD_SSE; -+ } -+#else -+ av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_SSE_INTRINSICS -+#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS -+ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; -+ active_simd = SIMD_AVX; -+ } -+#else -+ av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_AVX_INTRINSICS -+#endif // ARCH_X86/ARCH_AARCH64 -+#else -+ av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n"); -+#endif // ENABLE_TONEMAPX_INTRINSICS -+ -+ if (!s->tonemap_func_biplanar8) { -+ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++#define N (yuv2rgb ? m : n) ++#define M (yuv2rgb ? n : m) ++ int rng, n, m, o; ++ int bits = 1 << (yuv2rgb ? (depth - 1) : (29 - depth)); ++ for (rng = y_rng, n = 0; n < 3; n++, rng = uv_rng) { ++ for (m = 0; m < 3; m++) { ++ out[N][M][0] = lrint(bits * (yuv2rgb ? 28672 : rng) * table[N][M] / (yuv2rgb ? rng : 28672)); ++ for (o = 1; o < 8; o++) ++ out[N][M][o] = out[N][M][0]; ++ } + } ++#undef N ++#undef M + -+ if (!s->tonemap_func_biplanar10) { -+ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; -+ } -+ -+ if (!s->tonemap_func_planar8) { -+ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; -+ } -+ -+ if (!s->tonemap_func_planar10) { -+ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; -+ } -+ -+ switch(active_simd) { -+ case SIMD_NEON: -+ av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n"); -+ break; -+ case SIMD_SSE: -+ av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n"); -+ break; -+ case SIMD_AVX: -+ av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n"); -+ break; -+ default: -+ case SIMD_NONE: -+ av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n"); -+ break; -+ } -+ -+ switch(s->tonemap) { -+ case TONEMAP_GAMMA: -+ if (isnan(s->param)) -+ s->param = 1.8f; -+ break; -+ case TONEMAP_REINHARD: -+ if (!isnan(s->param)) -+ s->param = (1.0f - s->param) / s->param; -+ break; -+ case TONEMAP_MOBIUS: -+ if (isnan(s->param)) -+ s->param = 0.3f; -+ break; ++ if (yuv2rgb) { ++ av_assert2(out[0][1][0] == 0); ++ av_assert2(out[2][2][0] == 0); ++ av_assert2(out[0][0][0] == out[1][0][0]); ++ av_assert2(out[0][0][0] == out[2][0][0]); ++ } else { ++ av_assert2(out[1][2][0] == out[2][0][0]); + } -+ -+ if (isnan(s->param)) -+ s->param = 1.0f; -+ -+ return 0; +} -+ -+#define OFFSET(x) offsetof(TonemapxContext, x) -+#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM -+static const AVOption tonemapx_options[] = { -+ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" }, -+ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, "tonemap" }, -+ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, "tonemap" }, -+ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, "tonemap" }, -+ { "clip", 0, 0, 
AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, "tonemap" }, -+ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" }, -+ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, "tonemap" }, -+ { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, "tonemap" }, -+ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, "tonemap" }, -+ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, -+ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, "transfer" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" }, -+ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, -+ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, -+ { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, -+ { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, -+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, "primaries" }, -+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" }, -+ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, -+ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, -+ { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, -+ { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, -+ { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, -+ { "full", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, -+ { "format", "output format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, -+ { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, -+ { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, -+ { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, -+ { NULL } -+}; -+ -+AVFILTER_DEFINE_CLASS(tonemapx); -+ -+static const AVFilterPad tonemapx_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+}; -+ -+static const AVFilterPad tonemapx_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ }, -+}; -+ -+AVFilter ff_vf_tonemapx = { -+ .name = "tonemapx", -+ .description = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"), -+ .init = init, -+ .uninit = uninit, -+ .priv_size = sizeof(TonemapxContext), -+ .priv_class = &tonemapx_class, -+ FILTER_INPUTS(tonemapx_inputs), -+ FILTER_OUTPUTS(tonemapx_outputs), -+ FILTER_QUERY_FUNC(query_formats), -+ .flags = AVFILTER_FLAG_SLICE_THREADS, -+}; -Index: 
FFmpeg/libavfilter/aarch64/Makefile +Index: jellyfin-ffmpeg/libavfilter/colorspace.h =================================================================== ---- FFmpeg.orig/libavfilter/aarch64/Makefile -+++ FFmpeg/libavfilter/aarch64/Makefile -@@ -1,3 +1,4 @@ - OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o -+OBJS-$(CONFIG_TONEMAPX_FILTER) += aarch64/vf_tonemapx_intrin_neon.o - - NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o -Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +--- jellyfin-ffmpeg.orig/libavfilter/colorspace.h ++++ jellyfin-ffmpeg/libavfilter/colorspace.h +@@ -85,4 +85,8 @@ float eotf_arib_b67(float x); + float inverse_eotf_arib_b67(float x); + float inverse_eotf_bt1886(float x); + ++int ff_get_range_off(int *off, int *y_rng, int *uv_rng, ++ enum AVColorRange rng, int depth); ++void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], ++ int depth, int y_rng, int uv_rng, int yuv2rgb); + #endif +Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c -@@ -0,0 +1,1215 @@ ++++ jellyfin-ffmpeg/libavfilter/vf_tonemapx.c +@@ -0,0 +1,1267 @@ +/* -+ * Copyright (c) 2024 Gnattu OC -+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or @@ -1430,457 +1494,416 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#include "vf_tonemapx_intrin_neon.h" -+ -+static inline void tonemap_int16x8_neon(uint16x8_t r_in, uint16x8_t g_in, uint16x8_t b_in, -+ int16_t *r_out, int16_t *g_out, int16_t *b_out, -+ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, -+ const AVLumaCoefficients *coeffs, -+ const AVLumaCoefficients *ocoeffs, double desat, -+ double (*rgb2rgb)[3][3], -+ int rgb2rgb_passthrough) -+{ -+ int16x8_t sig8; -+ float32x4_t mapvalx4a; -+ float32x4_t mapvalx4b; -+ float32x4_t r_linx4a; -+ float32x4_t r_linx4b; -+ float32x4_t g_linx4a; -+ float32x4_t g_linx4b; -+ float32x4_t b_linx4a; -+ float32x4_t b_linx4b; -+ float32x4_t offset = vdupq_n_f32(0.5f); -+ int32x4_t output_upper_bound = vdupq_n_s32(32767); -+ int32x4_t zerox4 = vdupq_n_s32(0); -+ int16x8_t input_lut_offset = vdupq_n_s16(2048); -+ int16x8_t input_upper_bound = vdupq_n_s16(32767); -+ int16x8_t r, g, b; -+ int32x4_t rx4a, gx4a, bx4a, rx4b, gx4b, bx4b; ++/** ++ * @file ++ * tonemap algorithms ++ */ + -+ float mapval4a[4], mapval4b[4], r_lin4a[4], r_lin4b[4], g_lin4a[4], g_lin4b[4], b_lin4a[4], b_lin4b[4]; ++#include ++#include + -+ r = vreinterpretq_s16_u16(r_in); -+ g = vreinterpretq_s16_u16(g_in); -+ b = vreinterpretq_s16_u16(b_in); ++#include "libavutil/avassert.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/internal.h" ++#include "libavutil/mem_internal.h" ++#include "libavutil/opt.h" ++#include "libavutil/cpu.h" + -+ sig8 = vmaxq_s16(r, vmaxq_s16(g, b)); -+ sig8 = vaddq_s16(sig8, input_lut_offset); -+ sig8 = vminq_s16(sig8, input_upper_bound); -+ sig8 = vmaxq_s16(sig8, vreinterpretq_s16_s32(zerox4)); ++#include "vf_tonemapx.h" + -+ r = vaddq_s16(r, input_lut_offset); -+ r = vminq_s16(r, input_upper_bound); -+ r = vmaxq_s16(r, vreinterpretq_s16_s32(zerox4)); -+ g = vaddq_s16(g, input_lut_offset); -+ g = vminq_s16(g, input_upper_bound); -+ g = vmaxq_s16(g, vreinterpretq_s16_s32(zerox4)); -+ b = vaddq_s16(b, input_lut_offset); -+ b = vminq_s16(b, input_upper_bound); -+ b = vmaxq_s16(b, 
vreinterpretq_s16_s32(zerox4)); ++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS ++# if ARCH_AARCH64 ++# if HAVE_INTRINSICS_NEON ++# include "libavutil/aarch64/cpu.h" ++# include "aarch64/vf_tonemapx_intrin_neon.h" ++# endif ++# endif // ARCH_AARCH64 ++# if ARCH_X86 ++# include "libavutil/x86/cpu.h" ++# if HAVE_INTRINSICS_SSE42 ++# include "x86/vf_tonemapx_intrin_sse.h" ++# endif ++# if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3 ++# include "x86/vf_tonemapx_intrin_avx.h" ++# endif ++# endif // ARCH_X86 ++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4a[i] = tonemap_lut[vget_lane_s16(vget_low_s16(sig8), i)]; \ -+mapval4b[i] = tonemap_lut[vget_lane_s16(vget_high_s16(sig8), i)]; \ -+r_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(r), i)]; \ -+r_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(r), i)]; \ -+g_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(g), i)]; \ -+g_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(g), i)]; \ -+b_lin4a[i] = lin_lut[vget_lane_s16(vget_low_s16(b), i)]; \ -+b_lin4b[i] = lin_lut[vget_lane_s16(vget_high_s16(b), i)]; ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" + -+ LOAD_LUT(0) -+ LOAD_LUT(1) -+ LOAD_LUT(2) -+ LOAD_LUT(3) ++enum TonemapAlgorithm { ++ TONEMAP_NONE, ++ TONEMAP_LINEAR, ++ TONEMAP_GAMMA, ++ TONEMAP_CLIP, ++ TONEMAP_REINHARD, ++ TONEMAP_HABLE, ++ TONEMAP_MOBIUS, ++ TONEMAP_BT2390, ++ TONEMAP_MAX, ++}; + -+#undef LOAD_LUT ++typedef struct TonemapxContext { ++ const AVClass *class; + -+ mapvalx4a = vld1q_f32(mapval4a); -+ mapvalx4b = vld1q_f32(mapval4b); -+ r_linx4a = vld1q_f32(r_lin4a); -+ r_linx4b = vld1q_f32(r_lin4b); -+ g_linx4a = vld1q_f32(g_lin4a); -+ g_linx4b = vld1q_f32(g_lin4b); -+ b_linx4a = vld1q_f32(b_lin4a); -+ b_linx4b = vld1q_f32(b_lin4b); ++ enum TonemapAlgorithm tonemap; ++ enum AVColorTransferCharacteristic trc; ++ enum AVColorSpace spc; ++ enum AVColorPrimaries pri; ++ enum AVColorRange range; ++ enum AVPixelFormat format; ++ char *format_str; ++ double param; ++ double desat; ++ double peak; + -+ if (!rgb2rgb_passthrough) { -+ r_linx4a = vmulq_n_f32(r_linx4a, (float)(*rgb2rgb)[0][0]); -+ r_linx4a = vfmaq_n_f32(r_linx4a, g_linx4a, (float)(*rgb2rgb)[0][1]); -+ r_linx4a = vfmaq_n_f32(r_linx4a, b_linx4a, (float)(*rgb2rgb)[0][2]); -+ r_linx4b = vmulq_n_f32(r_linx4b, (float)(*rgb2rgb)[0][0]); -+ r_linx4b = vfmaq_n_f32(r_linx4b, g_linx4b, (float)(*rgb2rgb)[0][1]); -+ r_linx4b = vfmaq_n_f32(r_linx4b, b_linx4b, (float)(*rgb2rgb)[0][2]); ++ const AVLumaCoefficients *coeffs, *ocoeffs; + -+ g_linx4a = vmulq_n_f32(g_linx4a, (float)(*rgb2rgb)[1][1]); -+ g_linx4a = vfmaq_n_f32(g_linx4a, r_linx4a, (float)(*rgb2rgb)[1][0]); -+ g_linx4a = vfmaq_n_f32(g_linx4a, b_linx4a, (float)(*rgb2rgb)[1][2]); -+ g_linx4b = vmulq_n_f32(g_linx4b, (float)(*rgb2rgb)[1][1]); -+ g_linx4b = vfmaq_n_f32(g_linx4b, r_linx4b, (float)(*rgb2rgb)[1][0]); -+ g_linx4b = vfmaq_n_f32(g_linx4b, b_linx4b, (float)(*rgb2rgb)[1][2]); ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; + -+ b_linx4a = vmulq_n_f32(b_linx4a, (float)(*rgb2rgb)[2][2]); -+ b_linx4a = vfmaq_n_f32(b_linx4a, r_linx4a, (float)(*rgb2rgb)[2][0]); -+ b_linx4a = vfmaq_n_f32(b_linx4a, g_linx4a, (float)(*rgb2rgb)[2][1]); -+ b_linx4b = vmulq_n_f32(b_linx4b, (float)(*rgb2rgb)[2][2]); -+ b_linx4b = vfmaq_n_f32(b_linx4b, r_linx4b, (float)(*rgb2rgb)[2][0]); -+ b_linx4b = vfmaq_n_f32(b_linx4b, g_linx4b, (float)(*rgb2rgb)[2][1]); -+ 
} ++ DECLARE_ALIGNED(16, int16_t, yuv2rgb_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, int16_t, rgb2yuv_coeffs)[3][3][8]; ++ DECLARE_ALIGNED(16, double, rgb2rgb_coeffs)[3][3]; + -+ if (desat > 0) { -+ float32x4_t eps_x4 = vdupq_n_f32(FLOAT_EPS); -+ float32x4_t desat4 = vdupq_n_f32((float)desat); -+ float32x4_t luma4 = vdupq_n_f32(0); -+ float32x4_t overbright4; -+ // Group A -+ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr)); -+ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg)); -+ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb)); -+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); -+ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4); -+ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4); -+ g_linx4a = vmlsq_f32(g_linx4a, g_linx4a, overbright4); -+ g_linx4a = vmlaq_f32(g_linx4a, luma4, overbright4); -+ b_linx4a = vmlsq_f32(b_linx4a, b_linx4a, overbright4); -+ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4); -+ // Group B -+ luma4 = vdupq_n_f32(0); -+ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr)); -+ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg)); -+ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb)); -+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4)); -+ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4); -+ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4); -+ g_linx4b = vmlsq_f32(g_linx4b, g_linx4b, overbright4); -+ g_linx4b = vmlaq_f32(g_linx4b, luma4, overbright4); -+ b_linx4b = vmlsq_f32(b_linx4b, b_linx4b, overbright4); -+ b_linx4b = vmlaq_f32(b_linx4b, luma4, overbright4); -+ } ++ int (*filter_slice) (AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); + -+ r_linx4a = vmulq_f32(r_linx4a, mapvalx4a); -+ g_linx4a = vmulq_f32(g_linx4a, mapvalx4a); -+ b_linx4a = vmulq_f32(b_linx4a, mapvalx4a); ++ void (*tonemap_func_biplanar8) (uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ r_linx4b = vmulq_f32(r_linx4b, mapvalx4b); -+ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b); -+ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b); ++ void (*tonemap_func_planar8) (uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767); -+ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767); -+ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767); -+ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767); -+ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767); -+ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767); ++ void (*tonemap_func_biplanar10) (uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); + -+ rx4a = vcvtq_s32_f32(r_linx4a); -+ rx4a = vminq_s32(rx4a, output_upper_bound); -+ rx4a = vmaxq_s32(rx4a, zerox4); -+ gx4a = vcvtq_s32_f32(g_linx4a); -+ gx4a = vminq_s32(gx4a, output_upper_bound); -+ gx4a = vmaxq_s32(gx4a, zerox4); -+ bx4a = vcvtq_s32_f32(b_linx4a); -+ bx4a = vminq_s32(bx4a, output_upper_bound); -+ bx4a = vmaxq_s32(bx4a, 
zerox4); -+ rx4b = vcvtq_s32_f32(r_linx4b); -+ rx4b = vminq_s32(rx4b, output_upper_bound); -+ rx4b = vmaxq_s32(rx4b, zerox4); -+ gx4b = vcvtq_s32_f32(g_linx4b); -+ gx4b = vminq_s32(gx4b, output_upper_bound); -+ gx4b = vmaxq_s32(gx4b, zerox4); -+ bx4b = vcvtq_s32_f32(b_linx4b); -+ bx4b = vminq_s32(bx4b, output_upper_bound); -+ bx4b = vmaxq_s32(bx4b, zerox4); ++ void (*tonemap_func_planar10) (uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++} TonemapxContext; + -+ r_out[0] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 0)]; -+ r_out[1] = delin_lut[vget_lane_s32(vget_low_s32(rx4a), 1)]; -+ r_out[2] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 0)]; -+ r_out[3] = delin_lut[vget_lane_s32(vget_high_s32(rx4a), 1)]; -+ r_out[4] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 0)]; -+ r_out[5] = delin_lut[vget_lane_s32(vget_low_s32(rx4b), 1)]; -+ r_out[6] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 0)]; -+ r_out[7] = delin_lut[vget_lane_s32(vget_high_s32(rx4b), 1)]; ++typedef struct ThreadData { ++ AVFrame *in, *out; ++ const AVPixFmtDescriptor *desc, *odesc; ++ double peak; ++} ThreadData; + -+ g_out[0] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 0)]; -+ g_out[1] = delin_lut[vget_lane_s32(vget_low_s32(gx4a), 1)]; -+ g_out[2] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 0)]; -+ g_out[3] = delin_lut[vget_lane_s32(vget_high_s32(gx4a), 1)]; -+ g_out[4] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 0)]; -+ g_out[5] = delin_lut[vget_lane_s32(vget_low_s32(gx4b), 1)]; -+ g_out[6] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 0)]; -+ g_out[7] = delin_lut[vget_lane_s32(vget_high_s32(gx4b), 1)]; ++static const enum AVPixelFormat in_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P10, ++ AV_PIX_FMT_P010, ++ AV_PIX_FMT_P016, ++ AV_PIX_FMT_NONE, ++}; + -+ b_out[0] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 0)]; -+ b_out[1] = delin_lut[vget_lane_s32(vget_low_s32(bx4a), 1)]; -+ b_out[2] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 0)]; -+ b_out[3] = delin_lut[vget_lane_s32(vget_high_s32(bx4a), 1)]; -+ b_out[4] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 0)]; -+ b_out[5] = delin_lut[vget_lane_s32(vget_low_s32(bx4b), 1)]; -+ b_out[6] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 0)]; -+ b_out[7] = delin_lut[vget_lane_s32(vget_high_s32(bx4b), 1)]; -+} ++static const enum AVPixelFormat out_pix_fmts[] = { ++ AV_PIX_FMT_YUV420P, ++ AV_PIX_FMT_YUV420P10, ++ AV_PIX_FMT_NV12, ++ AV_PIX_FMT_P010, ++ AV_PIX_FMT_P016, ++}; + -+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++static int out_format_is_supported(enum AVPixelFormat fmt) +{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstu = dstu; -+ uint8_t *rdstv = dstv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int i; + -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ for (i = 0; i < 
FF_ARRAY_ELEMS(out_pix_fmts); i++) ++ if (out_pix_fmts[i] == fmt) ++ return 1; ++ return 0; ++} + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); ++static float hable(float in) ++{ ++ float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; ++ return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; ++} + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++static float mobius(float in, float j, double peak) ++{ ++ float a, b; + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ if (in <= j) ++ return in; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; -+ uint16x4_t ux4, vx4; ++ a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); ++ b = (j * j - 2.0f * j * peak + peak) / FFMAX(peak - 1.0f, FLOAT_EPS); + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ int16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); ++} + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++static float bt2390(float s, float peak) ++{ ++ float peak_pq = inverse_eotf_st2084(peak, REFERENCE_WHITE_ALT); ++ float scale = 1.0f / peak_pq; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ ux4 = vld1_u16(srcu + (x >> 1)); -+ vx4 = vld1_u16(srcv + (x >> 1)); ++ // SDR peak ++ float dst_peak = 1.0f; ++ float s_pq = inverse_eotf_st2084(s, REFERENCE_WHITE_ALT) * scale; ++ float maxLum = inverse_eotf_st2084(dst_peak, REFERENCE_WHITE_ALT) * scale; + -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); 
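++    // BT.2390 EETF on PQ-encoded values normalized to the source peak: ks is
++    // the knee start, input below ks passes through unchanged, and input at or
++    // above ks is rolled off toward maxLum by the cubic Hermite spline
++    // evaluated below.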
-+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); -+ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); -+ -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); -+ -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ float ks = 1.5f * maxLum - 0.5f; ++ float tb = (s_pq - ks) / (1.0f - ks); ++ float tb2 = tb * tb; ++ float tb3 = tb2 * tb; ++ float pb = (2.0f * tb3 - 3.0f * tb2 + 1.0f) * ks + ++ (tb3 - 2.0f * tb2 + tb) * (1.0f - ks) + ++ (-2.0f * tb3 + 3.0f * tb2) * maxLum; ++ float sig = (s_pq < ks) ? s_pq : pb; + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ return eotf_st2084(sig * peak_pq, REFERENCE_WHITE_ALT); ++} + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++static float mapsig(enum TonemapAlgorithm alg, float sig, double peak, double param) ++{ ++ switch(alg) { ++ default: ++ case TONEMAP_NONE: ++ // do nothing ++ break; ++ case TONEMAP_LINEAR: ++ sig = sig * param / peak; ++ break; ++ case TONEMAP_GAMMA: ++ sig = sig > 0.05f ++ ? pow(sig / peak, 1.0f / param) ++ : sig * pow(0.05f / peak, 1.0f / param) / 0.05f; ++ break; ++ case TONEMAP_CLIP: ++ sig = av_clipf(sig * param, 0, 1.0f); ++ break; ++ case TONEMAP_HABLE: ++ sig = hable(sig) / hable(peak); ++ break; ++ case TONEMAP_REINHARD: ++ sig = sig / (sig + param) * (peak + param) / peak; ++ break; ++ case TONEMAP_MOBIUS: ++ sig = mobius(sig, param, peak); ++ break; ++ case TONEMAP_BT2390: ++ sig = bt2390(sig, peak); ++ break; ++ } + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++ return sig; ++} + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++static float linearize(float x, enum AVColorTransferCharacteristic trc_src) ++{ ++ if (trc_src == AVCOL_TRC_SMPTE2084) ++ return eotf_st2084(x, REFERENCE_WHITE_ALT); ++ else if (trc_src == AVCOL_TRC_ARIB_STD_B67) ++ return eotf_arib_b67(x); ++ else ++ return x; ++} + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++static float delinearize(float x, enum AVColorTransferCharacteristic trc_dst) ++{ ++ if (trc_dst == AVCOL_TRC_BT709 || trc_dst == AVCOL_TRC_BT2020_10) ++ return inverse_eotf_bt1886(x); ++ else ++ return x; ++} + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++static int compute_trc_luts(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src, ++ enum AVColorTransferCharacteristic trc_dst) ++{ ++ int i; + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ if (!s->lin_lut && !(s->lin_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); ++ if 
(!s->delin_lut && !(s->delin_lut = av_calloc(32768, sizeof(uint16_t)))) ++ return AVERROR(ENOMEM); + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ for (i = 0; i < 32768; i++) { ++ double v1 = (i - 2048.0f) / 28672.0f; ++ double v2 = i / 32767.0f; ++ s->lin_lut[i] = FFMAX(linearize(v1, trc_src), 0); ++ s->delin_lut[i] = av_clip_int16(lrint(delinearize(v2, trc_dst) * 28672.0f)); ++ } + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ y0oax4 = vshrq_n_s32(y0oax4, 21); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ return 0; ++} + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 21); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++static int compute_tonemap_lut(TonemapxContext *s, enum AVColorTransferCharacteristic trc_src) ++{ ++ int i; ++ double peak = s->lut_peak; + -+ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); -+ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ if (!s->tonemap_lut && !(s->tonemap_lut = av_calloc(32768, sizeof(float)))) ++ return AVERROR(ENOMEM); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ for (i = 0; i < 32768; i++) { ++ double v = (i - 2048.0f) / 28672.0f; ++ double sig = linearize(v, trc_src); ++ float mapped = mapsig(s->tonemap, sig, peak, s->param); ++ s->tonemap_lut[i] = (sig > 0.0f && mapped > 0.0f) ? mapped / sig : 0.0f; ++ } + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ return 0; ++} + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++static int compute_yuv_coeffs(TonemapxContext *s, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, ++ const AVPixFmtDescriptor *idesc, ++ const AVPixFmtDescriptor *odesc, ++ enum AVColorRange irng, ++ enum AVColorRange orng) ++{ ++ double rgb2yuv[3][3], yuv2rgb[3][3]; ++ int res; ++ int y_rng, uv_rng; + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 21); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++ res = ff_get_range_off(&s->in_yuv_off, &y_rng, &uv_rng, ++ irng, idesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color range %d (%s)\n", ++ irng, av_color_range_name(irng)); ++ return res; ++ } + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 21); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ ff_fill_rgb2yuv_table(coeffs, rgb2yuv); ++ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); ++ ff_fill_rgb2yuv_table(ocoeffs, rgb2yuv); + -+ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); -+ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ ff_get_yuv_coeffs(s->yuv2rgb_coeffs, yuv2rgb, idesc->comp[0].depth, ++ y_rng, uv_rng, 1); + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 
= vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ res = ff_get_range_off(&s->out_yuv_off, &y_rng, &uv_rng, ++ orng, odesc->comp[0].depth); ++ if (res < 0) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color range %d (%s)\n", ++ orng, av_color_range_name(orng)); ++ return res; ++ } + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ ff_get_yuv_coeffs(s->rgb2yuv_coeffs, rgb2yuv, odesc->comp[0].depth, ++ y_rng, uv_rng, 0); + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ return 0; ++} + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 21); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_lane_u32((uint32_t *) &dstu[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(uox4), vdup_n_s16(0)))), 0); ++static int compute_rgb_coeffs(TonemapxContext *s, ++ enum AVColorPrimaries iprm, ++ enum AVColorPrimaries oprm) ++{ ++ double rgb2xyz[3][3], xyz2rgb[3][3]; ++ const AVColorPrimariesDesc *iprm_desc = av_csp_primaries_desc_from_id(iprm); ++ const AVColorPrimariesDesc *oprm_desc = av_csp_primaries_desc_from_id(oprm); + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 21); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_lane_u32((uint32_t *) &dstv[x >> 1], vreinterpret_u32_u8(vqmovun_s16(vcombine_s16(vmovn_s32(vox4), vdup_n_s16(0)))), 0); -+ } ++ if (!iprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported input color primaries %d (%s)\n", ++ iprm, av_color_primaries_name(iprm)); ++ return AVERROR(EINVAL); ++ } ++ if (!oprm_desc) { ++ av_log(s, AV_LOG_ERROR, ++ "Unsupported output color primaries %d (%s)\n", ++ oprm, av_color_primaries_name(oprm)); ++ return AVERROR(EINVAL); + } + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; -+ rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); -+ } ++ ff_fill_rgb2xyz_table(&oprm_desc->prim, &oprm_desc->wp, rgb2xyz); ++ 
ff_matrix_invert_3x3(rgb2xyz, xyz2rgb); ++ ff_fill_rgb2xyz_table(&iprm_desc->prim, &iprm_desc->wp, rgb2xyz); ++ ff_matrix_mul_3x3(s->rgb2rgb_coeffs, rgb2xyz, xyz2rgb); ++ ++ return 0; +} + -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++static void tonemap_int16(int16_t r_in, int16_t g_in, int16_t b_in, ++ int16_t *r_out, int16_t *g_out, int16_t *b_out, ++ float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, ++ const AVLumaCoefficients *coeffs, ++ const AVLumaCoefficients *ocoeffs, double desat, ++ double (*rgb2rgb)[3][3], ++ int rgb2rgb_passthrough) +{ -+ uint8_t *rdsty = dsty; -+ uint8_t *rdstuv = dstuv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int16_t sig; ++ float mapval, r_lin, g_lin, b_lin; ++ ++ /* load values */ ++ *r_out = r_in; ++ *g_out = g_in; ++ *b_out = b_in; ++ ++ /* pick the brightest component, reducing the value range as necessary ++ * to keep the entire signal in range and preventing discoloration due to ++ * out-of-bounds clipping */ ++ sig = FFMAX3(r_in, g_in, b_in); ++ ++ mapval = tonemap_lut[av_clip_uintp2(sig + 2048, 15)]; ++ ++ r_lin = lin_lut[av_clip_uintp2(r_in + 2048, 15)]; ++ g_lin = lin_lut[av_clip_uintp2(g_in + 2048, 15)]; ++ b_lin = lin_lut[av_clip_uintp2(b_in + 2048, 15)]; ++ ++ if (!rgb2rgb_passthrough) { ++ r_lin = (*rgb2rgb)[0][0] * r_lin + (*rgb2rgb)[0][1] * g_lin + (*rgb2rgb)[0][2] * b_lin; ++ g_lin = (*rgb2rgb)[1][0] * r_lin + (*rgb2rgb)[1][1] * g_lin + (*rgb2rgb)[1][2] * b_lin; ++ b_lin = (*rgb2rgb)[2][0] * r_lin + (*rgb2rgb)[2][1] * g_lin + (*rgb2rgb)[2][2] * b_lin; ++ } ++ ++#define MIX(x,y,a) (x) * (1 - (a)) + (y) * (a) ++ /* desaturate to prevent unnatural colors */ ++ if (desat > 0) { ++ float luma = av_q2d(coeffs->cr) * r_lin + av_q2d(coeffs->cg) * g_lin + av_q2d(coeffs->cb) * b_lin; ++ float overbright = FFMAX(luma - desat, FLOAT_EPS) / FFMAX(luma, FLOAT_EPS); ++ r_lin = MIX(r_lin, luma, overbright); ++ g_lin = MIX(g_lin, luma, overbright); ++ b_lin = MIX(b_lin, luma, overbright); ++ } ++ ++ r_lin *= mapval; ++ g_lin *= mapval; ++ b_lin *= mapval; ++#undef MIX + ++ *r_out = delin_lut[av_clip_uintp2(r_lin * 32767 + 0.5, 15)]; ++ *g_out = delin_lut[av_clip_uintp2(g_lin * 32767 + 0.5, 15)]; ++ *b_out = delin_lut[av_clip_uintp2(b_lin * 32767 + 0.5, 15)]; ++} ++ ++// See also libavfilter/colorspacedsp_template.c ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); @@ -1902,245 +1925,277 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> 
in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; -+ -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ int16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ int16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++ b[0] = av_clip_int16((y00 * cy + 
cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstuv[x + 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << (out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) 
&r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ y0oax4 = vshrq_n_s32(y0oax4, 21); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 21); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+ y0ox8 = vcombine_s16(vqmovn_s32(y0oax4), vqmovn_s32(y0obx4)); -+ vst1_u8(&dsty[x], vqmovun_s16(y0ox8)); ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> 
in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ dsty[x] = av_clip_uint8(params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)); ++ dsty[x + 1] = av_clip_uint8(params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x] = av_clip_uint8(params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)); ++ dsty[dstlinesize[0] + x + 1] = av_clip_uint8(params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)); + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 21); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)); ++ dstv[x >> 1] = av_clip_uint8(out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)); ++#undef AVG ++ } ++ } ++} + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 21); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) ++{ ++ const int in_depth = srcdepth; ++ const int in_uv_offset = 128 << (in_depth - 8); ++ const int in_sh = in_depth - 1; ++ const int in_rnd = 1 << (in_sh - 1); + -+ y1ox8 = vcombine_s16(vqmovn_s32(y1oax4), vqmovn_s32(y1obx4)); -+ vst1_u8(&dsty[x + dstlinesize[0]], vqmovun_s16(y1ox8)); ++ const int out_depth = dstdepth; ++ const int out_uv_offset = 128 << 
(out_depth - 8); ++ const int out_sh = 29 - out_depth; ++ const int out_rnd = 1 << (out_sh - 1); + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; ++ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; ++ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; ++ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; ++ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; ++ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; ++ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; ++ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; ++ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; ++ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; ++ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; ++ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 21); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); ++ int16_t r[4], g[4], b[4]; ++ for (; height > 1; height -= 2, ++ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] ) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] ) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] ) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1]) - params->in_yuv_off; ++ int u = (srcu[x >> 1]) - in_uv_offset; ++ int v = (srcv[x >> 1]) - in_uv_offset; + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 21); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = 
vzip2q_s32(uox4, vox4); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+ vst1_u8(&dstuv[x], vqmovun_s16(vcombine_s16(vmovn_s32(uvoax4), vmovn_s32(uvobx4)))); -+ } -+ } ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstuv += offset; -+ rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_nv12(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; ++ ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)), 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)), 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)), 16); ++ ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstu[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)), 16); ++ dstv[x >> 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)), 16); ++#undef AVG ++ } + } +} + -+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ 
int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params) +{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstu = dstu; -+ uint16_t *rdstv = dstv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcu = srcu; -+ const uint16_t *rsrcv = srcv; -+ int rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; -+ + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); + const int in_sh = in_depth - 1; + const int in_rnd = 1 << (in_sh - 1); ++ const int in_sh2 = 16 - in_depth; + + const int out_depth = dstdepth; + const int out_uv_offset = 128 << (out_depth - 8); + const int out_sh = 29 - out_depth; + const int out_rnd = 1 << (out_sh - 1); ++ const int out_sh2 = 16 - out_depth; + + int cy = (*params->yuv2rgb_coeffs)[0][0][0]; + int crv = (*params->yuv2rgb_coeffs)[0][2][0]; @@ -2157,482 +2212,543 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x4_t ux4, vx4; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; -+ -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ uint16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ int r00, g00, b00; ++ int r01, g01, b01; ++ int r10, g10, b10; ++ int r11, g11, b11; + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ uint16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int32x4_t out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); ++ int16_t r[4], g[4], b[4]; + for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; -+ -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ ux4 = vld1_u16(srcu + (x >> 1)); -+ vx4 = vld1_u16(srcv + (x >> 1)); -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); ++ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, ++ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { ++ for (int x = 0; x < width; x += 2) { ++ int y00 = (srcy[x] >> in_sh2) - params->in_yuv_off; ++ int y01 = (srcy[x + 1] >> in_sh2) - params->in_yuv_off; ++ int y10 = (srcy[srclinesize[0] / 2 + x] >> in_sh2) - params->in_yuv_off; ++ int y11 = (srcy[srclinesize[0] / 2 + x + 1] >> in_sh2) - params->in_yuv_off; ++ int u = (srcuv[x] >> in_sh2) - in_uv_offset; ++ 
int v = (srcuv[x + 1] >> in_sh2) - in_uv_offset; + -+ ux8 = vcombine_u16(vzip1_u16(ux4, ux4), vzip2_u16(ux4, ux4)); -+ ux8 = vsubq_u16(ux8, in_uv_offx8); -+ vx8 = vcombine_u16(vzip1_u16(vx4, vx4), vzip2_u16(vx4, vx4)); -+ vx8 = vsubq_u16(vx8, in_uv_offx8); ++ r[0] = av_clip_int16((y00 * cy + crv * v + in_rnd) >> in_sh); ++ r[1] = av_clip_int16((y01 * cy + crv * v + in_rnd) >> in_sh); ++ r[2] = av_clip_int16((y10 * cy + crv * v + in_rnd) >> in_sh); ++ r[3] = av_clip_int16((y11 * cy + crv * v + in_rnd) >> in_sh); + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++ g[0] = av_clip_int16((y00 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[1] = av_clip_int16((y01 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[2] = av_clip_int16((y10 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); ++ g[3] = av_clip_int16((y11 * cy + cgu * u + cgv * v + in_rnd) >> in_sh); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ b[0] = av_clip_int16((y00 * cy + cbu * u + in_rnd) >> in_sh); ++ b[1] = av_clip_int16((y01 * cy + cbu * u + in_rnd) >> in_sh); ++ b[2] = av_clip_int16((y10 * cy + cbu * u + in_rnd) >> in_sh); ++ b[3] = av_clip_int16((y11 * cy + cbu * u + in_rnd) >> in_sh); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ tonemap_int16(r[0], g[0], b[0], &r[0], &g[0], &b[0], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[1], g[1], b[1], &r[1], &g[1], &b[1], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[2], g[2], b[2], &r[2], &g[2], &b[2], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); ++ tonemap_int16(r[3], g[3], b[3], &r[3], &g[3], &b[3], ++ params->lin_lut, params->tonemap_lut, params->delin_lut, ++ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, params->rgb2rgb_passthrough); + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++ r00 = r[0], g00 = g[0], b00 = b[0]; ++ r01 = r[1], g01 = g[1], b01 = b[1]; ++ r10 = r[2], g10 = g[2], b10 = b[2]; ++ r11 = r[3], g11 = g[3], b11 = b[3]; + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); ++ dsty[x] = av_clip_uintp2((params->out_yuv_off + ((r00 * cry + g00 * cgy + b00 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[x + 1] = av_clip_uintp2((params->out_yuv_off + ((r01 * cry + g01 * cgy + b01 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x] = av_clip_uintp2((params->out_yuv_off + ((r10 * cry + g10 * cgy + b10 * cby + out_rnd) >> out_sh)) << out_sh2, 16); ++ dsty[dstlinesize[0] / 2 + x + 1] = av_clip_uintp2((params->out_yuv_off + ((r11 * cry + g11 * cgy + b11 * cby + out_rnd) >> out_sh)) << out_sh2, 16); + -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); ++#define AVG(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2) ++ dstuv[x] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cru + AVG(g00, g01, g10, g11) * ocgu + 
AVG(b00, b01, b10, b11) * cburv + out_rnd) >> out_sh)) << out_sh2, 16); ++ dstuv[x + 1] = av_clip_uintp2((out_uv_offset + ((AVG(r00, r01, r10, r11) * cburv + AVG(g00, g01, g10, g11) * ocgv + AVG(b00, b01, b10, b11) * cbv + out_rnd) >> out_sh)) << out_sh2, 16); ++#undef AVG ++ } ++ } ++} + -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); ++#define LOAD_TONEMAP_PARAMS TonemapxContext *s = ctx->priv; \ ++ThreadData *td = arg; \ ++AVFrame *in = td->in; \ ++AVFrame *out = td->out; \ ++const AVPixFmtDescriptor *desc = td->desc; \ ++const AVPixFmtDescriptor *odesc = td->odesc; \ ++const int ss = 1 << FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h); \ ++const int slice_start = (in->height / ss * jobnr ) / nb_jobs * ss; \ ++const int slice_end = (in->height / ss * (jobnr + 1)) / nb_jobs * ss; \ ++TonemapIntParams params = { \ ++.lut_peak = s->lut_peak, \ ++.lin_lut = s->lin_lut, \ ++.tonemap_lut = s->tonemap_lut, \ ++.delin_lut = s->delin_lut, \ ++.in_yuv_off = s->in_yuv_off, \ ++.out_yuv_off = s->out_yuv_off, \ ++.yuv2rgb_coeffs = &s->yuv2rgb_coeffs, \ ++.rgb2yuv_coeffs = &s->rgb2yuv_coeffs, \ ++.rgb2rgb_coeffs = &s->rgb2rgb_coeffs, \ ++.rgb2rgb_passthrough = in->color_primaries == out->color_primaries, \ ++.coeffs = s->coeffs, \ ++.ocoeffs = s->ocoeffs, \ ++.desat = s->desat, \ ++}; + -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); ++static int filter_slice_planar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++ s->tonemap_func_planar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ out->data[2] + out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ return 0; ++} + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); ++static int filter_slice_biplanar8(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, 
g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); ++ s->tonemap_func_biplanar8(out->data[0] + out->linesize[0] * slice_start, ++ out->data[1] + out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ vst1q_u16(&dsty[x], y0ox8); ++ return 0; ++} + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++static int filter_slice_planar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "planar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++ s->tonemap_func_planar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (uint16_t *) (out->data[2] + ++ out->linesize[2] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ (void*)(in->data[2] + in->linesize[2] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ r1obx4 = vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++ return 0; ++} + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); ++static int filter_slice_biplanar10(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) ++{ ++ LOAD_TONEMAP_PARAMS ++ av_log(s, AV_LOG_DEBUG, "biplanar dst depth: %d, src depth: %d\n", odesc->comp[0].depth, desc->comp[0].depth); + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); ++ s->tonemap_func_biplanar10((uint16_t *) (out->data[0] + out->linesize[0] * slice_start), ++ (uint16_t *) (out->data[1] + ++ out->linesize[1] * AV_CEIL_RSHIFT(slice_start, desc->log2_chroma_h)), ++ (void*)(in->data[0] + in->linesize[0] * slice_start), ++ (void*)(in->data[1] + in->linesize[1] * AV_CEIL_RSHIFT(slice_start, odesc->log2_chroma_h)), ++ out->linesize, in->linesize, ++ odesc->comp[0].depth, desc->comp[0].depth, ++ out->width, slice_end - slice_start, ++ ¶ms); + -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++ return 0; ++} + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, 
ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); -+ -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); -+ -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); -+ -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); -+ uox4 = vshrq_n_s32(uox4, 19); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vst1_u16(&dstu[x >> 1], vqmovun_s32(uox4)); ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterContext *ctx = link->dst; ++ TonemapxContext *s = ctx->priv; ++ AVFilterLink *outlink = ctx->outputs[0]; ++ AVFrame *out; ++ const AVPixFmtDescriptor *desc; ++ const AVPixFmtDescriptor *odesc; ++ int ret; ++ double peak = s->peak; ++ const AVLumaCoefficients *coeffs; ++ ThreadData td; + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); -+ vox4 = vshrq_n_s32(vox4, 19); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); -+ vst1_u16(&dstv[x >> 1], vqmovun_s32(vox4)); -+ } ++ desc = av_pix_fmt_desc_get(link->format); ++ odesc = av_pix_fmt_desc_get(outlink->format); ++ if (!desc || !odesc) { ++ av_frame_free(&in); ++ return AVERROR_BUG; + } + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstu += offset >> 1; -+ rdstv += offset >> 1; -+ rsrcy += offset; -+ rsrcu += offset >> 1; -+ rsrcv += offset >> 1; -+ tonemap_frame_420p10_2_420p10(rdsty, rdstu, rdstv, -+ rsrcy, rsrcu, rsrcv, -+ dstlinesize, srclinesize, -+ dstdepth, srcdepth, -+ remainw, rheight, params); ++ switch (odesc->comp[2].plane) { ++ case 1: // biplanar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_biplanar8; ++ } else { ++ s->filter_slice = filter_slice_biplanar10; ++ } ++ break; ++ default: ++ case 2: // planar ++ if (odesc->comp[0].depth == 8) { ++ s->filter_slice = filter_slice_planar8; ++ } else { ++ s->filter_slice = filter_slice_planar10; ++ } ++ break; + } -+} + -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params) -+{ -+ uint16_t *rdsty = dsty; -+ uint16_t *rdstuv = dstuv; -+ const uint16_t *rsrcy = srcy; -+ const uint16_t *rsrcuv = srcuv; -+ int 
rheight = height; -+ // not zero when not divisible by 8 -+ // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ out = ff_get_video_buffer(outlink, outlink->w, outlink->h); ++ if (!out) { ++ av_frame_free(&in); ++ return AVERROR(ENOMEM); ++ } + -+ const int in_depth = srcdepth; -+ const int in_uv_offset = 128 << (in_depth - 8); -+ const int in_sh = in_depth - 1; -+ const int in_rnd = 1 << (in_sh - 1); ++ if ((ret = av_frame_copy_props(out, in)) < 0) ++ goto fail; + -+ const int out_depth = dstdepth; -+ const int out_uv_offset = 128 << (out_depth - 8); -+ const int out_sh = 29 - out_depth; -+ const int out_rnd = 1 << (out_sh - 1); -+ const int out_sh2 = 16 - out_depth; ++ /* read peak from side data if not passed in */ ++ if (!peak) { ++ peak = ff_determine_signal_peak(in); ++ av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", peak); ++ } + -+ int cy = (*params->yuv2rgb_coeffs)[0][0][0]; -+ int crv = (*params->yuv2rgb_coeffs)[0][2][0]; -+ int cgu = (*params->yuv2rgb_coeffs)[1][1][0]; -+ int cgv = (*params->yuv2rgb_coeffs)[1][2][0]; -+ int cbu = (*params->yuv2rgb_coeffs)[2][1][0]; ++ out->color_trc = s->trc; ++ out->colorspace = s->spc; ++ out->color_primaries = s->pri; ++ out->color_range = s->range; + -+ int cry = (*params->rgb2yuv_coeffs)[0][0][0]; -+ int cgy = (*params->rgb2yuv_coeffs)[0][1][0]; -+ int cby = (*params->rgb2yuv_coeffs)[0][2][0]; -+ int cru = (*params->rgb2yuv_coeffs)[1][0][0]; -+ int ocgu = (*params->rgb2yuv_coeffs)[1][1][0]; -+ int cburv = (*params->rgb2yuv_coeffs)[1][2][0]; -+ int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; -+ int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; ++ if (in->color_trc == AVCOL_TRC_UNSPECIFIED) ++ in->color_trc = AVCOL_TRC_SMPTE2084; ++ if (out->color_trc == AVCOL_TRC_UNSPECIFIED) ++ out->color_trc = AVCOL_TRC_BT709; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ uint16_t cy_shifted = av_clip_int16(cy >> in_sh); -+ uint16_t rnd_shifted = av_clip_int16(in_rnd >> in_sh); -+ uint16_t crv_shifted = av_clip_int16(crv >> in_sh); -+ uint16_t cgu_shifted = av_clip_int16(cgu >> in_sh); -+ uint16_t cgv_shifted = av_clip_int16(cgv >> in_sh); -+ uint16_t cbu_shifted = av_clip_int16(cbu >> in_sh); -+ uint16x8_t rndx8 = vdupq_n_u16(rnd_shifted); -+ uint16x8_t in_yuv_offx8 = vdupq_n_u16(av_clip_int16(params->in_yuv_off)); -+ uint16x8_t in_uv_offx8 = vdupq_n_u16(av_clip_int16(in_uv_offset)); -+ uint16x8_t uvx8; -+ uint16x4_t ux2a, vx2a, ux2b, vx2b; -+ uint16x8_t y0x8, y1x8, ux8, vx8; -+ uint16x8_t r0x8, g0x8, b0x8; -+ uint16x8_t r1x8, g1x8, b1x8; ++ if (in->colorspace == AVCOL_SPC_UNSPECIFIED) ++ in->colorspace = AVCOL_SPC_BT2020_NCL; ++ if (out->colorspace == AVCOL_SPC_UNSPECIFIED) ++ out->colorspace = AVCOL_SPC_BT709; + -+ int16x8_t r0ox8, g0ox8, b0ox8; -+ uint16x8_t y0ox8; -+ int32x4_t r0oax4, r0obx4, g0oax4, g0obx4, b0oax4, b0obx4; -+ int32x4_t y0oax4, y0obx4; ++ if (in->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ in->color_primaries = AVCOL_PRI_BT2020; ++ if (out->color_primaries == AVCOL_PRI_UNSPECIFIED) ++ out->color_primaries = AVCOL_PRI_BT709; + -+ int16x8_t r1ox8, g1ox8, b1ox8; -+ uint16x8_t y1ox8; -+ int32x4_t r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ int32x4_t y1oax4, y1obx4; -+ int32x4_t uvoax4, uvobx4; -+ int32x2_t ravgax2, gavgax2, bavgax2, ravgbx2, gavgbx2, bavgbx2; -+ int32x4_t ravgx4, gavgx4, bavgx4, uox4, vox4; -+ int32x4_t out_yuv_offx4 = vdupq_n_s32(params->out_yuv_off); -+ int32x4_t out_rndx4 = vdupq_n_s32(out_rnd); -+ int16x8_t out_sh2x8 = vdupq_n_s16(out_sh2); -+ int32x4_t 
out_uv_offsetx4 = vdupq_n_s32(out_uv_offset); -+ int32x4_t rgb_avg_rndx4 = vdupq_n_s32(2); -+ for (; height > 1; height -= 2, -+ dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, -+ srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ if (in->color_range == AVCOL_RANGE_UNSPECIFIED) ++ in->color_range = AVCOL_RANGE_MPEG; ++ if (out->color_range == AVCOL_RANGE_UNSPECIFIED) ++ out->color_range = AVCOL_RANGE_MPEG; + -+ y0x8 = vld1q_u16(srcy + x); -+ y1x8 = vld1q_u16(srcy + (srclinesize[0] / 2 + x)); -+ uvx8 = vld1q_u16(srcuv + x); -+ if (in_depth == 10) { -+ // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = vshrq_n_u16(y0x8, 6); -+ y1x8 = vshrq_n_u16(y1x8, 6); -+ uvx8 = vshrq_n_u16(uvx8, 6); -+ } -+ y0x8 = vsubq_u16(y0x8, in_yuv_offx8); -+ y1x8 = vsubq_u16(y1x8, in_yuv_offx8); -+ uvx8 = vsubq_u16(uvx8, in_uv_offx8); ++ if (!s->lin_lut || !s->delin_lut) { ++ if ((ret = compute_trc_luts(s, in->color_trc, out->color_trc)) < 0) ++ goto fail; ++ } + -+ ux2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 0), vdup_lane_u16(vget_low_u16(uvx8), 2), 2); -+ vx2a = vext_u16(vdup_lane_u16(vget_low_u16(uvx8), 1), vdup_lane_u16(vget_low_u16(uvx8), 3), 2); -+ ux2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 0), vdup_lane_u16(vget_high_u16(uvx8), 2), 2); -+ vx2b = vext_u16(vdup_lane_u16(vget_high_u16(uvx8), 1), vdup_lane_u16(vget_high_u16(uvx8), 3), 2); ++ if (!s->tonemap_lut || s->lut_peak != peak) { ++ s->lut_peak = peak; ++ if ((ret = compute_tonemap_lut(s, out->color_trc)) < 0) ++ goto fail; ++ } + -+ ux8 = vcombine_u16(ux2a, ux2b); -+ vx8 = vcombine_u16(vx2a, vx2b); ++ coeffs = av_csp_luma_coeffs_from_avcsp(in->colorspace); ++ if (s->coeffs != coeffs) { ++ s->coeffs = coeffs; ++ s->ocoeffs = av_csp_luma_coeffs_from_avcsp(out->colorspace); ++ if ((ret = compute_yuv_coeffs(s, coeffs, s->ocoeffs, desc, odesc, ++ in->color_range, out->color_range)) < 0) ++ goto fail; ++ if ((ret = compute_rgb_coeffs(s, in->color_primaries, out->color_primaries)) < 0) ++ goto fail; ++ } + -+ r0x8 = g0x8 = b0x8 = vmulq_n_u16(y0x8, cy_shifted); -+ r0x8 = vmlaq_n_u16(r0x8, vx8, crv_shifted); -+ r0x8 = vaddq_u16(r0x8, rndx8); ++ /* do the tonemap */ ++ td.in = in; ++ td.out = out; ++ td.desc = desc; ++ td.odesc = odesc; ++ td.peak = peak; ++ ctx->internal->execute(ctx, s->filter_slice, &td, NULL, ++ FFMIN(outlink->h >> FFMAX(desc->log2_chroma_h, odesc->log2_chroma_h), ff_filter_get_nb_threads(ctx))); + -+ g0x8 = vmlaq_n_u16(g0x8, ux8, cgu_shifted); -+ g0x8 = vmlaq_n_u16(g0x8, vx8, cgv_shifted); -+ g0x8 = vaddq_u16(g0x8, rndx8); ++ av_frame_free(&in); + -+ b0x8 = vmlaq_n_u16(b0x8, ux8, cbu_shifted); -+ b0x8 = vaddq_u16(b0x8, rndx8); ++ ff_update_hdr_metadata(out, peak); + -+ r1x8 = g1x8 = b1x8 = vmulq_n_u16(y1x8, cy_shifted); -+ r1x8 = vmlaq_n_u16(r1x8, vx8, crv_shifted); -+ r1x8 = vaddq_u16(r1x8, rndx8); ++ return ff_filter_frame(outlink, out); ++fail: ++ av_frame_free(&in); ++ av_frame_free(&out); ++ return ret; ++} + -+ g1x8 = vmlaq_n_u16(g1x8, ux8, cgu_shifted); -+ g1x8 = vmlaq_n_u16(g1x8, vx8, cgv_shifted); -+ g1x8 = vaddq_u16(g1x8, rndx8); -+ -+ b1x8 = vmlaq_n_u16(b1x8, ux8, cbu_shifted); -+ b1x8 = vaddq_u16(b1x8, rndx8); -+ -+ tonemap_int16x8_neon(r0x8, g0x8, b0x8, (int16_t *) &r, (int16_t *) &g, (int16_t *) &b, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ tonemap_int16x8_neon(r1x8, 
g1x8, b1x8, (int16_t *) &r1, (int16_t *) &g1, (int16_t *) &b1, -+ params->lin_lut, params->tonemap_lut, params->delin_lut, -+ params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, -+ params->rgb2rgb_passthrough); -+ -+ r0ox8 = vld1q_s16(r); -+ g0ox8 = vld1q_s16(g); -+ b0ox8 = vld1q_s16(b); -+ -+ r0oax4 = vmovl_s16(vget_low_s16(r0ox8)); -+ g0oax4 = vmovl_s16(vget_low_s16(g0ox8)); -+ b0oax4 = vmovl_s16(vget_low_s16(b0ox8)); ++static void uninit(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; + -+ r0obx4 = vmovl_s16(vget_high_s16(r0ox8)); -+ g0obx4 = vmovl_s16(vget_high_s16(g0ox8)); -+ b0obx4 = vmovl_s16(vget_high_s16(b0ox8)); ++ av_freep(&s->lin_lut); ++ av_freep(&s->delin_lut); ++ av_freep(&s->tonemap_lut); ++} + -+ y0oax4 = vmulq_n_s32(r0oax4, cry); -+ y0oax4 = vmlaq_n_s32(y0oax4, g0oax4, cgy); -+ y0oax4 = vmlaq_n_s32(y0oax4, b0oax4, cby); -+ y0oax4 = vaddq_s32(y0oax4, out_rndx4); ++static int query_formats(AVFilterContext *ctx) ++{ ++ enum AVPixelFormat valid_in_pix_fmts[4]; ++ AVFilterFormats *formats; ++ const AVPixFmtDescriptor *desc; ++ TonemapxContext *s = ctx->priv; + -+ y0obx4 = vmulq_n_s32(r0obx4, cry); -+ y0obx4 = vmlaq_n_s32(y0obx4, g0obx4, cgy); -+ y0obx4 = vmlaq_n_s32(y0obx4, b0obx4, cby); -+ y0obx4 = vaddq_s32(y0obx4, out_rndx4); ++ if (!strcmp(s->format_str, "same")) { ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ s->format = AV_PIX_FMT_NONE; ++ } else { ++ int i, j = 0; ++ int res; ++ formats = ff_make_format_list(in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ s->format = av_get_pix_fmt(s->format_str); ++ // Check again in case of the string is invalid ++ if (s->format == AV_PIX_FMT_NONE) { ++ av_log(ctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str); ++ return AVERROR(EINVAL); ++ } ++ desc = av_pix_fmt_desc_get(s->format); ++ // Filter out the input formats for requested output formats ++ // The input and output must have the same planar format, either planar or bi-planar packed ++ for (i = 0; in_pix_fmts[i] != AV_PIX_FMT_NONE; i++) { ++ const AVPixFmtDescriptor *tdesc = av_pix_fmt_desc_get(in_pix_fmts[i]); ++ if (tdesc->comp[2].plane == desc->comp[2].plane) { ++ valid_in_pix_fmts[j] = in_pix_fmts[i]; ++ j++; ++ } ++ } ++ valid_in_pix_fmts[j] = AV_PIX_FMT_NONE; ++ formats = ff_make_format_list(valid_in_pix_fmts); ++ res = ff_formats_ref(formats, &ctx->inputs[0]->outcfg.formats); ++ if (res < 0) ++ return res; ++ if (out_format_is_supported(s->format)) { ++ formats = NULL; ++ res = ff_add_format(&formats, s->format); ++ if (res < 0) ++ return res; ++ } else { ++ av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", ++ av_get_pix_fmt_name(s->format)); ++ return AVERROR(ENOSYS); ++ } ++ } + -+ r1ox8 = vld1q_s16(r1); -+ g1ox8 = vld1q_s16(g1); -+ b1ox8 = vld1q_s16(b1); ++ return ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats); ++} + -+ r1oax4 = vmovl_s16(vget_low_s16(r1ox8)); -+ g1oax4 = vmovl_s16(vget_low_s16(g1ox8)); -+ b1oax4 = vmovl_s16(vget_low_s16(b1ox8)); ++static av_cold int init(AVFilterContext *ctx) ++{ ++ TonemapxContext *s = ctx->priv; ++ enum SIMDVariant active_simd = SIMD_NONE; ++ av_log(s, AV_LOG_DEBUG, "Requested output format: %s\n", ++ s->format_str); + -+ r1obx4 = 
vmovl_s16(vget_high_s16(r1ox8)); -+ g1obx4 = vmovl_s16(vget_high_s16(g1ox8)); -+ b1obx4 = vmovl_s16(vget_high_s16(b1ox8)); ++#if ARCH_AARCH64 ++#ifdef ENABLE_TONEMAPX_NEON_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (have_neon(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_neon; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_neon; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_neon; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_neon; ++ active_simd = SIMD_NEON; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "NEON optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_NEON_INTRINSICS ++#elif ARCH_X86 ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (X86_SSE42(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_sse; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_sse; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_sse; ++ active_simd = SIMD_SSE; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "SSE optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++ { ++ int cpu_flags = av_get_cpu_flags(); ++ if (X86_AVX2(cpu_flags) && X86_FMA3(cpu_flags)) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx; ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010_avx; ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p_avx; ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10_avx; ++ active_simd = SIMD_AVX; ++ } ++ } ++#else ++ av_log(s, AV_LOG_WARNING, "AVX optimization disabled at compile time\n"); ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS ++#endif // ARCH_X86/ARCH_AARCH64 + -+ y1oax4 = vmulq_n_s32(r1oax4, cry); -+ y1oax4 = vmlaq_n_s32(y1oax4, g1oax4, cgy); -+ y1oax4 = vmlaq_n_s32(y1oax4, b1oax4, cby); -+ y1oax4 = vaddq_s32(y1oax4, out_rndx4); ++#if !defined(ENABLE_TONEMAPX_NEON_INTRINSICS) && \ ++ !defined(ENABLE_TONEMAPX_SSE_INTRINSICS) && \ ++ !defined(ENABLE_TONEMAPX_AVX_INTRINSICS) ++ av_log(s, AV_LOG_WARNING, "SIMD optimization disabled at compile time\n"); ++#endif + -+ y1obx4 = vmulq_n_s32(r1obx4, cry); -+ y1obx4 = vmlaq_n_s32(y1obx4, g1obx4, cgy); -+ y1obx4 = vmlaq_n_s32(y1obx4, b1obx4, cby); -+ y1obx4 = vaddq_s32(y1obx4, out_rndx4); ++ if (!s->tonemap_func_biplanar8) { ++ s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12; ++ } + -+ ravgax2 = vpadd_s32(vget_low_s32(r0oax4), vget_high_s32(r0oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r0obx4), vget_high_s32(r0obx4)); -+ ravgx4 = vcombine_s32(ravgax2, ravgbx2); -+ ravgax2 = vpadd_s32(vget_low_s32(r1oax4), vget_high_s32(r1oax4)); -+ ravgbx2 = vpadd_s32(vget_low_s32(r1obx4), vget_high_s32(r1obx4)); -+ ravgx4 = vaddq_s32(ravgx4, vcombine_s32(ravgax2, ravgbx2)); -+ ravgx4 = vaddq_s32(ravgx4, rgb_avg_rndx4); -+ ravgx4 = vshrq_n_s32(ravgx4, 2); ++ if (!s->tonemap_func_biplanar10) { ++ s->tonemap_func_biplanar10 = tonemap_frame_p016_p010_2_p016_p010; ++ } + -+ gavgax2 = vpadd_s32(vget_low_s32(g0oax4), vget_high_s32(g0oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g0obx4), vget_high_s32(g0obx4)); -+ gavgx4 = vcombine_s32(gavgax2, gavgbx2); -+ gavgax2 = vpadd_s32(vget_low_s32(g1oax4), vget_high_s32(g1oax4)); -+ gavgbx2 = vpadd_s32(vget_low_s32(g1obx4), vget_high_s32(g1obx4)); -+ gavgx4 = vaddq_s32(gavgx4, vcombine_s32(gavgax2, gavgbx2)); -+ gavgx4 = vaddq_s32(gavgx4, 
rgb_avg_rndx4); -+ gavgx4 = vshrq_n_s32(gavgx4, 2); ++ if (!s->tonemap_func_planar8) { ++ s->tonemap_func_planar8 = tonemap_frame_420p10_2_420p; ++ } + -+ bavgax2 = vpadd_s32(vget_low_s32(b0oax4), vget_high_s32(b0oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b0obx4), vget_high_s32(b0obx4)); -+ bavgx4 = vcombine_s32(bavgax2, bavgbx2); -+ bavgax2 = vpadd_s32(vget_low_s32(b1oax4), vget_high_s32(b1oax4)); -+ bavgbx2 = vpadd_s32(vget_low_s32(b1obx4), vget_high_s32(b1obx4)); -+ bavgx4 = vaddq_s32(bavgx4, vcombine_s32(bavgax2, bavgbx2)); -+ bavgx4 = vaddq_s32(bavgx4, rgb_avg_rndx4); -+ bavgx4 = vshrq_n_s32(bavgx4, 2); ++ if (!s->tonemap_func_planar10) { ++ s->tonemap_func_planar10 = tonemap_frame_420p10_2_420p10; ++ } + -+ uox4 = vmlaq_n_s32(out_rndx4, ravgx4, cru); -+ uox4 = vmlaq_n_s32(uox4, gavgx4, ocgu); -+ uox4 = vmlaq_n_s32(uox4, bavgx4, cburv); ++ switch(active_simd) { ++ case SIMD_NEON: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: NEON\n"); ++ break; ++ case SIMD_SSE: ++ av_log(s, AV_LOG_INFO, "Using CPU capability: SSE4.2\n"); ++ break; ++ case SIMD_AVX: ++ av_log(s, AV_LOG_INFO, "Using CPU capabilities: AVX2 FMA3\n"); ++ break; ++ default: ++ case SIMD_NONE: ++ av_log(s, AV_LOG_INFO, "No CPU SIMD extension available\n"); ++ break; ++ } + -+ vox4 = vmlaq_n_s32(out_rndx4, ravgx4, cburv); -+ vox4 = vmlaq_n_s32(vox4, gavgx4, ocgv); -+ vox4 = vmlaq_n_s32(vox4, bavgx4, cbv); ++ switch(s->tonemap) { ++ case TONEMAP_GAMMA: ++ if (isnan(s->param)) ++ s->param = 1.8f; ++ break; ++ case TONEMAP_REINHARD: ++ if (!isnan(s->param)) ++ s->param = (1.0f - s->param) / s->param; ++ break; ++ case TONEMAP_MOBIUS: ++ if (isnan(s->param)) ++ s->param = 0.3f; ++ break; ++ } + -+ switch(out_depth) { -+ default: -+ case 10: -+ y0oax4 = vshrq_n_s32(y0oax4, 19); -+ y0obx4 = vshrq_n_s32(y0obx4, 19); -+ y1oax4 = vshrq_n_s32(y1oax4, 19); -+ y1obx4 = vshrq_n_s32(y1obx4, 19); -+ uox4 = vshrq_n_s32(uox4, 19); -+ vox4 = vshrq_n_s32(vox4, 19); -+ break; -+ case 16: -+ y0oax4 = vshrq_n_s32(y0oax4, 13); -+ y0obx4 = vshrq_n_s32(y0obx4, 13); -+ y1oax4 = vshrq_n_s32(y1oax4, 13); -+ y1obx4 = vshrq_n_s32(y1obx4, 13); -+ uox4 = vshrq_n_s32(uox4, 13); -+ vox4 = vshrq_n_s32(vox4, 13); -+ break; -+ } ++ if (isnan(s->param)) ++ s->param = 1.0f; + -+ y0oax4 = vaddq_s32(y0oax4, out_yuv_offx4); -+ y0obx4 = vaddq_s32(y0obx4, out_yuv_offx4); -+ y1oax4 = vaddq_s32(y1oax4, out_yuv_offx4); -+ y1obx4 = vaddq_s32(y1obx4, out_yuv_offx4); -+ uox4 = vaddq_s32(uox4, out_uv_offsetx4); -+ vox4 = vaddq_s32(vox4, out_uv_offsetx4); ++ return 0; ++} + -+ y0ox8 = vcombine_u16(vqmovun_s32(y0oax4), vqmovun_s32(y0obx4)); -+ y0ox8 = vshlq_u16(y0ox8, out_sh2x8); -+ vst1q_u16(&dsty[x], y0ox8); ++#define OFFSET(x) offsetof(TonemapxContext, x) ++#define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM ++static const AVOption tonemapx_options[] = { ++ { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), AV_OPT_TYPE_INT, {.i64 = TONEMAP_BT2390}, TONEMAP_NONE, TONEMAP_MAX - 1, FLAGS, "tonemap" }, ++ { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, FLAGS, "tonemap" }, ++ { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, FLAGS, "tonemap" }, ++ { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, FLAGS, "tonemap" }, ++ { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, FLAGS, "tonemap" }, ++ { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, FLAGS, "tonemap" }, ++ { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, FLAGS, "tonemap" }, ++ { "mobius", 0, 
0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, FLAGS, "tonemap" }, ++ { "bt2390", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_BT2390}, 0, 0, FLAGS, "tonemap" }, ++ { "transfer", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "t", "set transfer characteristic", OFFSET(trc), AV_OPT_TYPE_INT, {.i64 = AVCOL_TRC_BT709}, -1, INT_MAX, FLAGS, "transfer" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, FLAGS, "transfer" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS, "transfer" }, ++ { "matrix", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "m", "set colorspace matrix", OFFSET(spc), AV_OPT_TYPE_INT, {.i64 = AVCOL_SPC_BT709}, -1, INT_MAX, FLAGS, "matrix" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" }, ++ { "primaries", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "p", "set color primaries", OFFSET(pri), AV_OPT_TYPE_INT, {.i64 = AVCOL_PRI_BT709}, -1, INT_MAX, FLAGS, "primaries" }, ++ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, FLAGS, "primaries" }, ++ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, 0, FLAGS, "primaries" }, ++ { "range", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "r", "set color range", OFFSET(range), AV_OPT_TYPE_INT, {.i64 = AVCOL_RANGE_MPEG}, -1, INT_MAX, FLAGS, "range" }, ++ { "tv", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "pc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "limited", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS, "range" }, ++ { "full", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS, "range" }, ++ { "format", "output format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS }, ++ { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, ++ { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, {.dbl = 0}, 0, DBL_MAX, FLAGS }, ++ { NULL } ++}; + -+ y1ox8 = vcombine_u16(vqmovun_s32(y1oax4), vqmovun_s32(y1obx4)); -+ y1ox8 = vshlq_u16(y1ox8, out_sh2x8); -+ vst1q_u16(&dsty[x + dstlinesize[0] / 2], y1ox8); ++AVFILTER_DEFINE_CLASS(tonemapx); + -+ uvoax4 = vzip1q_s32(uox4, vox4); -+ uvobx4 = vzip2q_s32(uox4, vox4); ++static const AVFilterPad tonemapx_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++}; + -+ vst1q_u16(&dstuv[x], vshlq_u16(vcombine_u16(vqmovun_s32(uvoax4), vqmovun_s32(uvobx4)), out_sh2x8)); -+ } -+ } ++static const AVFilterPad tonemapx_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++}; + -+ // Process remaining pixels cannot fill the full simd register with scalar version -+ if (remainw) { -+ int offset = width & (int)0xfffffff8; -+ rdsty += offset; -+ rdstuv += offset; -+ rsrcy += offset; -+ rsrcuv += offset; -+ tonemap_frame_p016_p010_2_p016_p010(rdsty, rdstuv, -+ rsrcy, rsrcuv, -+ dstlinesize, srclinesize, -+ dstdepth, 
srcdepth, -+ remainw, rheight, params); -+ } -+} -Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h ++AVFilter ff_vf_tonemapx = { ++ .name = "tonemapx", ++ .description = NULL_IF_CONFIG_SMALL("SIMD optimized HDR to SDR tonemapping"), ++ .init = init, ++ .uninit = uninit, ++ .priv_size = sizeof(TonemapxContext), ++ .priv_class = &tonemapx_class, ++ FILTER_INPUTS(tonemapx_inputs), ++ FILTER_OUTPUTS(tonemapx_outputs), ++ FILTER_QUERY_FUNC(query_formats), ++ .flags = AVFILTER_FLAG_SLICE_THREADS, ++}; +Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h -@@ -0,0 +1,56 @@ ++++ jellyfin-ffmpeg/libavfilter/vf_tonemapx.h +@@ -0,0 +1,106 @@ +/* -+ * Copyright (c) 2024 Gnattu OC -+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or @@ -2650,60 +2766,112 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef AVFILTER_TONEMAPX_INTRIN_NEON_H -+#define AVFILTER_TONEMAPX_INTRIN_NEON_H ++#ifndef AVFILTER_TONEMAPX_H ++#define AVFILTER_TONEMAPX_H + -+#include ++#include "config.h" ++#include "colorspace.h" + -+#include "libavfilter/vf_tonemapx.h" ++#if defined(__GNUC__) || defined(__clang__) ++# if (__GNUC__ >= 11) || (__clang_major__ >= 12) ++# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) ++# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) ++# else ++# define X86_64_V2 __attribute__((target("sse4.2"))) ++# define X86_64_V3 __attribute__((target("avx2,fma"))) ++# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) ++#endif // defined(__GNUC__) || defined(__clang__) + -+void tonemap_frame_420p10_2_420p_neon(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++#if defined(__GNUC__) || defined(__clang__) ++# if (__GNUC__ >= 10) || (__clang_major__ >= 11) ++# define CC_SUPPORTS_TONEMAPX_INTRINSICS ++# endif // (__GNUC__ >= 10) || (__clang_major__ >= 11) ++#endif // defined(__GNUC__) || defined(__clang__) + -+void tonemap_frame_p016_p010_2_nv12_neon(uint8_t *dsty, uint8_t *dstuv, ++#ifdef CC_SUPPORTS_TONEMAPX_INTRINSICS ++# if ARCH_AARCH64 ++# if HAVE_INTRINSICS_NEON ++# define ENABLE_TONEMAPX_NEON_INTRINSICS ++# endif ++# endif // ARCH_AARCH64 ++# if ARCH_X86 ++# if HAVE_INTRINSICS_SSE42 ++# define ENABLE_TONEMAPX_SSE_INTRINSICS ++# endif ++# if HAVE_INTRINSICS_AVX2 && HAVE_INTRINSICS_FMA3 ++# define ENABLE_TONEMAPX_AVX_INTRINSICS ++# endif ++# endif // ARCH_X86 ++#endif // CC_SUPPORTS_TONEMAPX_INTRINSICS ++ ++typedef struct TonemapIntParams { ++ double lut_peak; ++ float *lin_lut; ++ float *tonemap_lut; ++ uint16_t *delin_lut; ++ int in_yuv_off, out_yuv_off; ++ int16_t (*yuv2rgb_coeffs)[3][3][8]; ++ int16_t (*rgb2yuv_coeffs)[3][3][8]; ++ double (*rgb2rgb_coeffs)[3][3]; ++ int rgb2rgb_passthrough; ++ const AVLumaCoefficients *coeffs, *ocoeffs; ++ double desat; ++} TonemapIntParams; ++ ++enum SIMDVariant { ++ SIMD_NONE = -1, ++ SIMD_NEON, ++ SIMD_SSE, ++ SIMD_AVX ++}; ++ ++void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int 
dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, ++ const uint16_t *srcy, const uint16_t *srcuv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, ++ const int *dstlinesize, const int *srclinesize, ++ int dstdepth, int srcdepth, ++ int width, int height, ++ const struct TonemapIntParams *params); ++ ++void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+void tonemap_frame_420p10_2_420p10_neon(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+void tonemap_frame_p016_p010_2_p016_p010_neon(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); -+ -+#endif //AVFILTER_TONEMAPX_INTRIN_NEON_H -Index: FFmpeg/libavfilter/x86/Makefile ++#endif //AVFILTER_TONEMAPX_H +Index: jellyfin-ffmpeg/libavfilter/x86/Makefile =================================================================== ---- FFmpeg.orig/libavfilter/x86/Makefile -+++ FFmpeg/libavfilter/x86/Makefile +--- jellyfin-ffmpeg.orig/libavfilter/x86/Makefile ++++ jellyfin-ffmpeg/libavfilter/x86/Makefile @@ -39,6 +39,8 @@ OBJS-$(CONFIG_VOLUME_FILTER) OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o \ + x86/vf_tonemapx_intrin_avx.o - + X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o - -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + +Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c -@@ -0,0 +1,1345 @@ ++++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c +@@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -2724,46 +2892,27 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#include "vf_tonemapx_intrin_sse.h" -+ -+// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction -+// cast the register into float register and store with movss as a workaround -+#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) -+__attribute__((always_inline)) -+X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { -+ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); -+ return; -+} -+#endif -+ -+X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) -+{ -+__m128i mask = _mm_set1_epi32(0x7FFF); -+__m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); -+ -+__m128i zero = _mm_setzero_si128(); 
-+__m128i cmp = _mm_cmpeq_epi32(condition, zero); -+ -+__m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); -+__m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); ++#include "vf_tonemapx_intrin_avx.h" + -+return result; -+} ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++# include ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS + -+X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS ++X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) +{ -+__m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); -+__m128i mask = _mm_set1_epi32(~0xFFFF); -+__m128i condition = _mm_and_si128(add_result, mask); -+__m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); ++__m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); ++__m256i mask = _mm256_set1_epi32(~0xFFFF); ++__m256i condition = _mm256_and_si256(add_result, mask); ++__m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); + -+__m128i shifted = _mm_srai_epi32(a, 31); -+__m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); ++__m256i shifted = _mm256_srai_epi32(a, 31); ++__m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); + -+return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); ++return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); +} + -+X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, ++X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, + int16_t *r_out, int16_t *g_out, int16_t *b_out, + float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, + const AVLumaCoefficients *coeffs, @@ -2771,127 +2920,140 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + double (*rgb2rgb)[3][3], + int rgb2rgb_passthrough) +{ -+ __m128i sig4; -+ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; -+ __m128 offset = _mm_set1_ps(0.5f); -+ __m128i input_lut_offset = _mm_set1_epi32(2048); -+ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); -+ __m128i r, g, b, rx4, gx4, bx4; ++ __m256i sig8; ++ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; ++ __m256 offset = _mm256_set1_ps(0.5f); ++ __m256i zerox8 = _mm256_setzero_si256(); ++ __m256i input_lut_offset = _mm256_set1_epi32(2048); ++ __m256i upper_bound = _mm256_set1_epi32(32767); ++ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); ++ __m256i r, g, b, rx8, gx8, bx8; + -+ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; ++ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; + -+ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); -+ sig4 = _mm_add_epi32(sig4, input_lut_offset); -+ sig4 = av_clip_uint16_sse(sig4); ++ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); ++ sig8 = _mm256_add_epi32(sig8, input_lut_offset); ++ sig8 = _mm256_min_epi32(sig8, upper_bound); ++ sig8 = _mm256_max_epi32(sig8, zerox8); + -+ r = _mm_add_epi32(r_in, input_lut_offset); -+ r = av_clip_uint16_sse(r); -+ g = _mm_add_epi32(g_in, input_lut_offset); -+ g = av_clip_uint16_sse(g); -+ b = _mm_add_epi32(b_in, input_lut_offset); -+ b = av_clip_uint16_sse(b); ++ r = _mm256_add_epi32(r_in, input_lut_offset); ++ r = _mm256_min_epi32(r, upper_bound); ++ r = _mm256_max_epi32(r, zerox8); ++ g = _mm256_add_epi32(g_in, input_lut_offset); ++ g = _mm256_min_epi32(g, upper_bound); ++ g = _mm256_max_epi32(g, zerox8); ++ b = _mm256_add_epi32(b_in, input_lut_offset); ++ b = _mm256_min_epi32(b, 
upper_bound); ++ b = _mm256_max_epi32(b, zerox8); + -+ // Cannot use loop here as the lane has to be compile-time constant -+#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ -+r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ -+g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ -+b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; ++#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ ++r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ ++g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ ++b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; + + LOAD_LUT(0) + LOAD_LUT(1) + LOAD_LUT(2) + LOAD_LUT(3) ++ LOAD_LUT(4) ++ LOAD_LUT(5) ++ LOAD_LUT(6) ++ LOAD_LUT(7) + +#undef LOAD_LUT + -+ mapvalx4 = _mm_loadu_ps(mapval4); -+ r_linx4 = _mm_loadu_ps(r_lin4); -+ g_linx4 = _mm_loadu_ps(g_lin4); -+ b_linx4 = _mm_loadu_ps(b_lin4); ++ mapvalx8 = _mm256_loadu_ps(mapval8); ++ r_linx8 = _mm256_loadu_ps(r_lin8); ++ g_linx8 = _mm256_loadu_ps(g_lin8); ++ b_linx8 = _mm256_loadu_ps(b_lin8); + + if (!rgb2rgb_passthrough) { -+ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); ++ r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); ++ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); + -+ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); -+ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); ++ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); ++ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); + -+ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); ++ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); ++ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); + } + + if (desat > 0) { -+ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); -+ __m128 desat4 = _mm_set1_ps((float)desat); -+ __m128 luma4 = _mm_set1_ps(0); -+ __m128 overbright4; ++ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); ++ __m256 desat8 = _mm256_set1_ps((float)desat); ++ __m256 luma8 = _mm256_set1_ps(0); ++ __m256 overbright8; + -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); -+ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); -+ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); -+ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); -+ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); -+ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); -+ g_linx4 = _mm_add_ps(g_linx4, 
_mm_mul_ps(luma4, overbright4)); -+ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); -+ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); ++ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); ++ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8); ++ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); ++ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); ++ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); ++ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); ++ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); ++ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); ++ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); ++ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); + } + -+ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); -+ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); -+ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); ++ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); ++ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); ++ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); + -+ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); -+ r_linx4 = _mm_add_ps(r_linx4, offset); ++ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); ++ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); ++ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); + -+ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); -+ g_linx4 = _mm_add_ps(g_linx4, offset); ++ rx8 = _mm256_cvttps_epi32(r_linx8); ++ rx8 = _mm256_min_epi32(rx8, upper_bound); ++ rx8 = _mm256_max_epi32(rx8, zerox8); + -+ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); -+ b_linx4 = _mm_add_ps(b_linx4, offset); ++ gx8 = _mm256_cvttps_epi32(g_linx8); ++ gx8 = _mm256_min_epi32(gx8, upper_bound); ++ gx8 = _mm256_max_epi32(gx8, zerox8); + -+ rx4 = _mm_cvttps_epi32(r_linx4); -+ rx4 = av_clip_uint16_sse(rx4); -+ gx4 = _mm_cvttps_epi32(g_linx4); -+ gx4 = av_clip_uint16_sse(gx4); -+ bx4 = _mm_cvttps_epi32(b_linx4); -+ bx4 = av_clip_uint16_sse(bx4); ++ bx8 = _mm256_cvttps_epi32(b_linx8); ++ bx8 = _mm256_min_epi32(bx8, upper_bound); ++ bx8 = _mm256_max_epi32(bx8, zerox8); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ -+g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ -+b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ ++g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ ++b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; + + SAVE_COLOR(0) + SAVE_COLOR(1) + SAVE_COLOR(2) + SAVE_COLOR(3) ++ SAVE_COLOR(4) ++ SAVE_COLOR(5) ++ SAVE_COLOR(6) ++ SAVE_COLOR(7) + +#undef SAVE_COLOR +} ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS + -+X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint8_t *rdsty = dsty; + uint8_t *rdstu = dstu; + uint8_t *rdstv = dstv; -+ + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; -+ + int rheight = height; -+ // not zero when not divisible by 
8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int remainw = width & 14; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -2918,240 +3080,248 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i ux4, vx4; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4; -+ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); -+ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ ux4 = _mm_cvtepu16_epi32(ux4); -+ vx4 = _mm_cvtepu16_epi32(vx4); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); -+ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); + -+ ux4a = _mm_unpacklo_epi32(ux4, ux4); -+ ux4b = _mm_unpackhi_epi32(ux4, ux4); -+ vx4a = _mm_unpacklo_epi32(vx4, vx4); -+ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = 
av_clip_int16_sse(b0x4a); -+ -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = _mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); -+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, 
_mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); ++ ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ yoax4 = _mm_srai_epi32(yoax4, 21); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = 
_mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ 
y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, 21); -+ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); + -+ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, 21); -+ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = 
_mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); ++ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff8; ++ int offset = width & (int)0xfffffff0; + rdsty += offset; + rdstu += offset >> 1; + rdstv += offset >> 1; @@ -3164,15 +3334,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint16_t *rdsty = dsty; + uint16_t *rdstu = dstu; + uint16_t *rdstv = dstv; @@ -3182,7 +3354,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int remainw = width & 14; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -3209,239 +3381,245 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i ux4, vx4; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i ux8, vx8; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4; -+ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8; ++ __m256i uox8, vox8, 
ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; -+ -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); -+ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ ux4 = _mm_cvtepu16_epi32(ux4); -+ vx4 = _mm_cvtepu16_epi32(vx4); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); -+ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); ++ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); + -+ ux4a = _mm_unpacklo_epi32(ux4, ux4); -+ ux4b = _mm_unpackhi_epi32(ux4, ux4); -+ vx4a = _mm_unpacklo_epi32(vx4, vx4); -+ vx4b = _mm_unpackhi_epi32(vx4, vx4); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); ++ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ ++ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); ++ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = 
av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = av_clip_int16_sse(b0x4a); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = _mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); -+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = 
_mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); -+ -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ yoax4 = _mm_mullo_epi32(roax4, 
_mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, out_sh); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, out_sh); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = 
_mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uox4 = _mm_srai_epi32(uox4, out_sh); -+ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ vox4 = 
_mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ vox4 = _mm_srai_epi32(vox4, out_sh); -+ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); -+ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); ++ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); ++ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff8; ++ int offset = width & (int)0xfffffff0; + rdsty += offset; + rdstu += offset >> 1; + rdstv += offset >> 1; @@ -3454,23 +3632,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcuv = srcuv; + int rheight = height; -+ // not zero when not divisible by 8 ++ // not zero when not divisible by 16 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int remainw = width & 14; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -3497,248 +3677,252 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = 
_mm_setzero_si128(); -+ __m128i uvx8, uvx4a, uvx4b; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); + + if (in_depth == 10) { + // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ y0x8 = _mm_srli_epi16(y0x8, 6); -+ y1x8 = _mm_srli_epi16(y1x8, 6); -+ uvx8 = _mm_srli_epi16(uvx8, 6); ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); + } -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ uvx4a = _mm_cvtepu16_epi32(uvx8); -+ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); -+ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, 
in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); ++ ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = av_clip_int16_sse(b0x4a); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = _mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x4b = g0x4b = b0x4b = 
_mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); -+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, 
g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); -+ // output shift bits for 8bit outputs is 29 - 8 = 21 -+ yoax4 = _mm_srai_epi32(yoax4, 21); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, 21); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packs_epi32(yoax4, yobx4); -+ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); ++ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], 
_mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, 21); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, 21); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); -+ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); ++ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, 
r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uoax4 = _mm_srai_epi32(uoax4, 21); -+ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); + -+ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ voax4 = _mm_srai_epi32(voax4, 21); -+ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); + -+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); -+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); -+ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff8; ++ int offset = width & (int)0xfffffff0; + rdsty += offset; + rdstuv += offset; + rsrcy += offset; @@ -3749,15 +3933,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + dstdepth, srcdepth, + remainw, 
rheight, params); + } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} + -+X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_AVX_INTRINSICS + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; @@ -3765,7 +3951,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 6; ++ int remainw = width & 14; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -3793,251 +3979,254 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[8], g[8], b[8]; -+ int16_t r1[8], g1[8], b1[8]; -+ -+ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); -+ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); -+ __m128i cyx4 = _mm_set1_epi32(cy); -+ __m128i rndx4 = _mm_set1_epi32(in_rnd); -+ __m128i zero128 = _mm_setzero_si128(); -+ __m128i uvx8, uvx4a, uvx4b; -+ __m128i y0x8, y1x8; -+ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; -+ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; -+ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; ++ int16_t r[16], g[16], b[16]; ++ int16_t r1[16], g1[16], b1[16]; ++ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); ++ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); ++ __m256i cyx8 = _mm256_set1_epi32(cy); ++ __m256i rndx8 = _mm256_set1_epi32(in_rnd); + -+ __m128i r0ox8, g0ox8, b0ox8; -+ __m128i y0ox8; -+ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; -+ __m128i yoax4, yobx4; ++ __m256i r0ox16, g0ox16, b0ox16; ++ __m256i y0ox16; ++ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; ++ __m256i yoax8, yobx8; ++ __m256i uvx16, uvx8a, uvx8b; ++ __m256i y0x16, y1x16; ++ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; ++ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; ++ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; + -+ __m128i r1ox8, g1ox8, b1ox8; -+ __m128i y1ox8; -+ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; -+ __m128i y1oax4, y1obx4, uvoax4, uvobx4; -+ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4, uvox8; ++ __m256i r1ox16, g1ox16, b1ox16; ++ __m256i y1ox16; ++ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; ++ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; ++ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 3; xx++) { -+ int x = xx << 3; ++ for (int xx = 0; xx < width >> 4; xx++) { ++ int x = xx << 4; + -+ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); -+ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); ++ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); ++ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); + + if (in_depth == 10) { + // shift to low10bits for 10bit input -+ // shift bit has to be compile-time constant -+ 
y0x8 = _mm_srli_epi16(y0x8, 6); -+ y1x8 = _mm_srli_epi16(y1x8, 6); -+ uvx8 = _mm_srli_epi16(uvx8, 6); ++ y0x16 = _mm256_srli_epi16(y0x16, 6); ++ y1x16 = _mm256_srli_epi16(y1x16, 6); ++ uvx16 = _mm256_srli_epi16(uvx16, 6); + } -+ y0x4a = _mm_cvtepu16_epi32(y0x8); -+ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); -+ y1x4a = _mm_cvtepu16_epi32(y1x8); -+ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); -+ uvx4a = _mm_cvtepu16_epi32(uvx8); -+ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); -+ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); -+ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); -+ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); -+ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); -+ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); -+ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); ++ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); ++ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); ++ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); ++ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); ++ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); ++ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); ++ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); ++ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); ++ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); ++ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); ++ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); + -+ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); -+ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r0x4a = _mm_add_epi32(r0x4a, rndx4); -+ r0x4a = _mm_srai_epi32(r0x4a, in_sh); -+ r0x4a = av_clip_int16_sse(r0x4a); ++ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ++ // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); ++ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); ++ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r0x8a = _mm256_add_epi32(r0x8a, rndx8); ++ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); ++ r0x8a = av_clip_int16_avx(r0x8a); + -+ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); -+ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); -+ r1x4a = _mm_add_epi32(r1x4a, rndx4); -+ r1x4a = _mm_srai_epi32(r1x4a, in_sh); -+ r1x4a = av_clip_int16_sse(r1x4a); ++ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); ++ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); ++ r1x8a = _mm256_add_epi32(r1x8a, rndx8); ++ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); ++ r1x8a = av_clip_int16_avx(r1x8a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g0x4a = _mm_add_epi32(g0x4a, rndx4); -+ g0x4a = _mm_srai_epi32(g0x4a, in_sh); -+ g0x4a = av_clip_int16_sse(g0x4a); ++ g0x8a = _mm256_add_epi32(g0x8a, 
_mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g0x8a = _mm256_add_epi32(g0x8a, rndx8); ++ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); ++ g0x8a = av_clip_int16_avx(g0x8a); + -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); -+ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); -+ g1x4a = _mm_add_epi32(g1x4a, rndx4); -+ g1x4a = _mm_srai_epi32(g1x4a, in_sh); -+ g1x4a = av_clip_int16_sse(g1x4a); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); ++ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); ++ g1x8a = _mm256_add_epi32(g1x8a, rndx8); ++ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); ++ g1x8a = av_clip_int16_avx(g1x8a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b0x4a = _mm_add_epi32(b0x4a, rndx4); -+ b0x4a = _mm_srai_epi32(b0x4a, in_sh); -+ b0x4a = av_clip_int16_sse(b0x4a); ++ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b0x8a = _mm256_add_epi32(b0x8a, rndx8); ++ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); ++ b0x8a = av_clip_int16_avx(b0x8a); + -+ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); -+ b1x4a = _mm_add_epi32(b1x4a, rndx4); -+ b1x4a = _mm_srai_epi32(b1x4a, in_sh); -+ b1x4a = av_clip_int16_sse(b1x4a); ++ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); ++ b1x8a = _mm256_add_epi32(b1x8a, rndx8); ++ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); ++ b1x8a = av_clip_int16_avx(b1x8a); + -+ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); -+ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r0x4b = _mm_add_epi32(r0x4b, rndx4); -+ r0x4b = _mm_srai_epi32(r0x4b, in_sh); -+ r0x4b = av_clip_int16_sse(r0x4b); ++ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); ++ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r0x8b = _mm256_add_epi32(r0x8b, rndx8); ++ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); ++ r0x8b = av_clip_int16_avx(r0x8b); + -+ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); -+ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); -+ r1x4b = _mm_add_epi32(r1x4b, rndx4); -+ r1x4b = _mm_srai_epi32(r1x4b, in_sh); -+ r1x4b = av_clip_int16_sse(r1x4b); ++ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); ++ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); ++ r1x8b = _mm256_add_epi32(r1x8b, rndx8); ++ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); ++ r1x8b = av_clip_int16_avx(r1x8b); + -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g0x4b = _mm_add_epi32(g0x4b, rndx4); -+ g0x4b = _mm_srai_epi32(g0x4b, in_sh); -+ g0x4b = av_clip_int16_sse(g0x4b); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g0x8b = _mm256_add_epi32(g0x8b, rndx8); ++ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); ++ g0x8b = av_clip_int16_avx(g0x8b); + -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); -+ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); -+ g1x4b = _mm_add_epi32(g1x4b, rndx4); -+ g1x4b = _mm_srai_epi32(g1x4b, 
in_sh); -+ g1x4b = av_clip_int16_sse(g1x4b); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); ++ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); ++ g1x8b = _mm256_add_epi32(g1x8b, rndx8); ++ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); ++ g1x8b = av_clip_int16_avx(g1x8b); + -+ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b0x4b = _mm_add_epi32(b0x4b, rndx4); -+ b0x4b = _mm_srai_epi32(b0x4b, in_sh); -+ b0x4b = av_clip_int16_sse(b0x4b); ++ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b0x8b = _mm256_add_epi32(b0x8b, rndx8); ++ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); ++ b0x8b = av_clip_int16_avx(b0x8b); + -+ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); -+ b1x4b = _mm_add_epi32(b1x4b, rndx4); -+ b1x4b = _mm_srai_epi32(b1x4b, in_sh); -+ b1x4b = av_clip_int16_sse(b1x4b); ++ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); ++ b1x8b = _mm256_add_epi32(b1x8b, rndx8); ++ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); ++ b1x8b = av_clip_int16_avx(b1x8b); + -+ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, ++ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, ++ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], ++ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], ++ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); -+ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); -+ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); ++ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); ++ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); ++ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); + -+ roax4 = _mm_cvtepi16_epi32(r0ox8); -+ goax4 = _mm_cvtepi16_epi32(g0ox8); -+ boax4 = _mm_cvtepi16_epi32(b0ox8); ++ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); ++ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); ++ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); + -+ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); -+ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); -+ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); ++ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); ++ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); ++ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); + -+ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); -+ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); -+ yoax4 = _mm_add_epi32(yoax4, 
_mm_set1_epi32(out_rnd)); -+ yoax4 = _mm_srai_epi32(yoax4, out_sh); -+ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); ++ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); ++ yoax8 = _mm256_srai_epi32(yoax8, out_sh); ++ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); + -+ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); -+ yobx4 = _mm_srai_epi32(yobx4, out_sh); -+ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); ++ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); ++ yobx8 = _mm256_srai_epi32(yobx8, out_sh); ++ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y0ox8 = _mm_packus_epi32(yoax4, yobx4); -+ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); ++ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); ++ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); + -+ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); -+ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); -+ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); ++ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); ++ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); ++ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); + -+ r1oax4 = _mm_cvtepi16_epi32(r1ox8); -+ g1oax4 = _mm_cvtepi16_epi32(g1ox8); -+ b1oax4 = _mm_cvtepi16_epi32(b1ox8); ++ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); ++ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); ++ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); + -+ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); -+ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); -+ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); ++ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); ++ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); ++ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); + -+ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); -+ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); -+ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); ++ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); ++ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); ++ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); ++ y1oax8 = _mm256_add_epi32(y1oax8, 
_mm256_set1_epi32(params->out_yuv_off)); + -+ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); -+ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); -+ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); ++ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); ++ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); ++ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); + -+ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); -+ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); ++ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); ++ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); ++ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); + -+ ravgx4 = _mm_hadd_epi32(roax4, robx4); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); -+ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); -+ ravgx4 = _mm_srai_epi32(ravgx4, 2); ++ ravgx8 = _mm256_hadd_epi32(roax8, robx8); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); ++ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); ++ ravgx8 = _mm256_srai_epi32(ravgx8, 2); + -+ gavgx4 = _mm_hadd_epi32(goax4, gobx4); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); -+ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); -+ gavgx4 = _mm_srai_epi32(gavgx4, 2); ++ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); ++ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); ++ gavgx8 = _mm256_srai_epi32(gavgx8, 2); + -+ bavgx4 = _mm_hadd_epi32(boax4, bobx4); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); -+ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); -+ bavgx4 = _mm_srai_epi32(bavgx4, 2); ++ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); ++ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); ++ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); ++ bavgx8 = _mm256_srai_epi32(bavgx8, 2); + -+ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); -+ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); -+ uoax4 = _mm_srai_epi32(uoax4, out_sh); -+ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); ++ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); ++ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); ++ uox8 = _mm256_srai_epi32(uox8, out_sh); ++ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); + -+ voax4 = 
_mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); -+ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); -+ voax4 = _mm_srai_epi32(voax4, out_sh); -+ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); ++ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); ++ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); ++ vox8 = _mm256_srai_epi32(vox8, out_sh); ++ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); + -+ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); -+ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); -+ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); -+ uvox8 = _mm_slli_epi16(uvox8, out_sh2); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); ++ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); ++ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); ++ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); ++ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); ++ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff8; ++ int offset = width & (int)0xfffffff0; + rdsty += offset; + rdstuv += offset; + rsrcy += offset; @@ -4048,12 +4237,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h +Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h -@@ -0,0 +1,58 @@ ++++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +@@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -4074,50 +4264,48 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef AVFILTER_TONEMAPX_INTRIN_SSE_H -+#define AVFILTER_TONEMAPX_INTRIN_SSE_H -+ -+#include -+#include -+#include ++#ifndef AVFILTER_TONEMAPX_INTRIN_AVX_H ++#define AVFILTER_TONEMAPX_INTRIN_AVX_H + +#include "libavfilter/vf_tonemapx.h" + -+X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, ++X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int 
dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+#endif //AVFILTER_TONEMAPX_INTRIN_SSE_H -Index: FFmpeg/libavfilter/vf_tonemapx.h ++#endif // AVFILTER_TONEMAPX_INTRIN_AVX_H +Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,83 @@ ++++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +@@ -0,0 +1,1359 @@ +/* ++ * Copyright (c) 2024 Gnattu OC ++ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or @@ -4135,112 +4323,51 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef AVFILTER_TONEMAPX_H -+#define AVFILTER_TONEMAPX_H -+ -+#include "colorspace.h" -+ -+#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 11) || (__clang_major__ >= 12) -+# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) -+# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) -+# else -+# define X86_64_V2 __attribute__((target("sse4.2"))) -+# define X86_64_V3 __attribute__((target("avx2,fma"))) -+# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) -+#endif // defined(__GNUC__) || defined(__clang__) -+ -+typedef struct TonemapIntParams { -+ double lut_peak; -+ float *lin_lut; -+ float *tonemap_lut; -+ uint16_t *delin_lut; -+ int in_yuv_off, out_yuv_off; -+ int16_t (*yuv2rgb_coeffs)[3][3][8]; -+ int16_t (*rgb2yuv_coeffs)[3][3][8]; -+ double (*rgb2rgb_coeffs)[3][3]; -+ int rgb2rgb_passthrough; -+ const AVLumaCoefficients *coeffs, *ocoeffs; -+ double desat; -+} TonemapIntParams; -+ -+enum SIMDVariant { -+ SIMD_NONE = -1, -+ SIMD_NEON, -+ SIMD_SSE, -+ SIMD_AVX -+}; ++#include "vf_tonemapx_intrin_sse.h" + -+void tonemap_frame_420p10_2_420p(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++# include ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS + -+void tonemap_frame_p016_p010_2_nv12(uint8_t *dsty, uint8_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS ++// GCC 10 and below does not implement _mm_storeu_si32 with movd instruction ++// cast the register into float register and store with movss as a workaround ++#if (defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ <= 10) ++__attribute__((always_inline)) ++X86_64_V2 static inline void _mm_storeu_si32(void* mem_addr, __m128i a) { ++ _mm_store_ss((float*)mem_addr, _mm_castsi128_ps(a)); ++ return; ++} ++#endif + -+void tonemap_frame_420p10_2_420p10(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, -+ const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, -+ const int *dstlinesize, const int 
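
The X86_64_V2/X86_64_V3 target attributes on these prototypes let one binary carry the SSE4 and AVX2+FMA variants next to the plain C path and pick between them at runtime. A minimal dispatch sketch under assumptions (the field name tonemap_func_biplanar8 is hypothetical, and the filter may well use av_get_cpu_flags() rather than the compiler builtin; only the function names come from the patch):

    /* Hypothetical init-time selection of the widest supported variant. */
    if (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))
        s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_avx;
    else if (__builtin_cpu_supports("sse4.2"))
        s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12_sse;
    else
        s->tonemap_func_biplanar8 = tonemap_frame_p016_p010_2_nv12;   /* plain C fallback */
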
*srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++X86_64_V2 static inline __m128i av_clip_uint16_sse(__m128i a) ++{ ++__m128i mask = _mm_set1_epi32(0x7FFF); ++__m128i condition = _mm_and_si128(a, _mm_set1_epi32(~0x7FFF)); + -+void tonemap_frame_p016_p010_2_p016_p010(uint16_t *dsty, uint16_t *dstuv, -+ const uint16_t *srcy, const uint16_t *srcuv, -+ const int *dstlinesize, const int *srclinesize, -+ int dstdepth, int srcdepth, -+ int width, int height, -+ const struct TonemapIntParams *params); ++__m128i zero = _mm_setzero_si128(); ++__m128i cmp = _mm_cmpeq_epi32(condition, zero); + -+#endif //AVFILTER_TONEMAPX_H -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -=================================================================== ---- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c -@@ -0,0 +1,1353 @@ -+/* -+ * Copyright (c) 2024 Gnattu OC -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++__m128i neg_a = _mm_and_si128(_mm_srai_epi32(_mm_xor_si128(a, _mm_set1_epi32(-1)), 31), mask); ++__m128i result = _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, neg_a)); + -+#include "vf_tonemapx_intrin_avx.h" ++return result; ++} + -+X86_64_V3 static inline __m256i av_clip_int16_avx(__m256i a) ++X86_64_V2 static inline __m128i av_clip_int16_sse(__m128i a) +{ -+__m256i add_result = _mm256_add_epi32(a, _mm256_set1_epi32(0x8000U)); -+__m256i mask = _mm256_set1_epi32(~0xFFFF); -+__m256i condition = _mm256_and_si256(add_result, mask); -+__m256i cmp = _mm256_cmpeq_epi32(condition, _mm256_setzero_si256()); ++__m128i add_result = _mm_add_epi32(a, _mm_set1_epi32(0x8000U)); ++__m128i mask = _mm_set1_epi32(~0xFFFF); ++__m128i condition = _mm_and_si128(add_result, mask); ++__m128i cmp = _mm_cmpeq_epi32(condition, _mm_setzero_si128()); + -+__m256i shifted = _mm256_srai_epi32(a, 31); -+__m256i xor_result = _mm256_xor_si256(shifted, _mm256_set1_epi32(0x7FFF)); ++__m128i shifted = _mm_srai_epi32(a, 31); ++__m128i xor_result = _mm_xor_si128(shifted, _mm_set1_epi32(0x7FFF)); + -+return _mm256_or_si256(_mm256_and_si256(cmp, a), _mm256_andnot_si256(cmp, xor_result)); ++return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, xor_result)); +} + -+X86_64_V3 static inline void tonemap_int32x8_avx(__m256i r_in, __m256i g_in, __m256i b_in, ++X86_64_V2 static inline void tonemap_int32x4_sse(__m128i r_in, __m128i g_in, __m128i b_in, + int16_t *r_out, int16_t *g_out, int16_t *b_out, + float *lin_lut, float *tonemap_lut, uint16_t *delin_lut, + const AVLumaCoefficients *coeffs, @@ -4248,138 +4375,129 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + double (*rgb2rgb)[3][3], + int rgb2rgb_passthrough) +{ -+ __m256i sig8; -+ __m256 mapvalx8, r_linx8, g_linx8, b_linx8; -+ __m256 offset = 
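
The av_clip_uint16_sse and av_clip_int16_sse helpers apply FFmpeg's branchless scalar clip tricks lane by lane while the data is still in 32-bit form; note that, its name notwithstanding, the unsigned variant here clamps to the 15-bit range [0, 32767], which is exactly the index range of the LUTs used below. The scalar forms of the two bit tricks, as a reference sketch (not part of the patch):

    static inline int clip15(int a)              /* clamp to [0, 0x7FFF] */
    {
        if (a & ~0x7FFF)                         /* any bit outside the 15-bit range? */
            return (~a >> 31) & 0x7FFF;          /* 0 when negative, 32767 when too large */
        return a;
    }

    static inline int clip_int16(int a)          /* clamp to [-0x8000, 0x7FFF] */
    {
        if ((a + 0x8000U) & ~0xFFFF)             /* outside the signed 16-bit range? */
            return (a >> 31) ^ 0x7FFF;           /* -32768 when negative, 32767 when positive */
        return a;
    }
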
_mm256_set1_ps(0.5f); -+ __m256i zerox8 = _mm256_setzero_si256(); -+ __m256i input_lut_offset = _mm256_set1_epi32(2048); -+ __m256i upper_bound = _mm256_set1_epi32(32767); -+ __m256 intermediate_upper_bound = _mm256_set1_ps(32767.0f); -+ __m256i r, g, b, rx8, gx8, bx8; ++ __m128i sig4; ++ __m128 mapvalx4, r_linx4, g_linx4, b_linx4; ++ __m128 offset = _mm_set1_ps(0.5f); ++ __m128i input_lut_offset = _mm_set1_epi32(2048); ++ __m128 intermediate_upper_bound = _mm_set1_ps(32767.0f); ++ __m128i r, g, b, rx4, gx4, bx4; + -+ float mapval8[8], r_lin8[8], g_lin8[8], b_lin8[8]; ++ float mapval4[4], r_lin4[4], g_lin4[4], b_lin4[4]; + -+ sig8 = _mm256_max_epi32(r_in, _mm256_max_epi32(g_in, b_in)); -+ sig8 = _mm256_add_epi32(sig8, input_lut_offset); -+ sig8 = _mm256_min_epi32(sig8, upper_bound); -+ sig8 = _mm256_max_epi32(sig8, zerox8); ++ sig4 = _mm_max_epi32(r_in, _mm_max_epi32(g_in, b_in)); ++ sig4 = _mm_add_epi32(sig4, input_lut_offset); ++ sig4 = av_clip_uint16_sse(sig4); + -+ r = _mm256_add_epi32(r_in, input_lut_offset); -+ r = _mm256_min_epi32(r, upper_bound); -+ r = _mm256_max_epi32(r, zerox8); -+ g = _mm256_add_epi32(g_in, input_lut_offset); -+ g = _mm256_min_epi32(g, upper_bound); -+ g = _mm256_max_epi32(g, zerox8); -+ b = _mm256_add_epi32(b_in, input_lut_offset); -+ b = _mm256_min_epi32(b, upper_bound); -+ b = _mm256_max_epi32(b, zerox8); ++ r = _mm_add_epi32(r_in, input_lut_offset); ++ r = av_clip_uint16_sse(r); ++ g = _mm_add_epi32(g_in, input_lut_offset); ++ g = av_clip_uint16_sse(g); ++ b = _mm_add_epi32(b_in, input_lut_offset); ++ b = av_clip_uint16_sse(b); + -+#define LOAD_LUT(i) mapval8[i] = tonemap_lut[_mm256_extract_epi32(sig8, i)]; \ -+r_lin8[i] = lin_lut[_mm256_extract_epi32(r, i)]; \ -+g_lin8[i] = lin_lut[_mm256_extract_epi32(g, i)]; \ -+b_lin8[i] = lin_lut[_mm256_extract_epi32(b, i)]; ++ // Cannot use loop here as the lane has to be compile-time constant ++#define LOAD_LUT(i) mapval4[i] = tonemap_lut[_mm_extract_epi32(sig4, i)]; \ ++r_lin4[i] = lin_lut[_mm_extract_epi32(r, i)]; \ ++g_lin4[i] = lin_lut[_mm_extract_epi32(g, i)]; \ ++b_lin4[i] = lin_lut[_mm_extract_epi32(b, i)]; + + LOAD_LUT(0) + LOAD_LUT(1) + LOAD_LUT(2) + LOAD_LUT(3) -+ LOAD_LUT(4) -+ LOAD_LUT(5) -+ LOAD_LUT(6) -+ LOAD_LUT(7) + +#undef LOAD_LUT + -+ mapvalx8 = _mm256_loadu_ps(mapval8); -+ r_linx8 = _mm256_loadu_ps(r_lin8); -+ g_linx8 = _mm256_loadu_ps(g_lin8); -+ b_linx8 = _mm256_loadu_ps(b_lin8); ++ mapvalx4 = _mm_loadu_ps(mapval4); ++ r_linx4 = _mm_loadu_ps(r_lin4); ++ g_linx4 = _mm_loadu_ps(g_lin4); ++ b_linx4 = _mm_loadu_ps(b_lin4); + + if (!rgb2rgb_passthrough) { -+ r_linx8 = _mm256_mul_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][0])); -+ r_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][1]), r_linx8); -+ r_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[0][2]), r_linx8); ++ r_linx4 = _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][0])); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][1]))); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[0][2]))); + -+ g_linx8 = _mm256_mul_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][1])); -+ g_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][0]), g_linx8); -+ g_linx8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[1][2]), g_linx8); ++ g_linx4 = _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][1])); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][0]))); ++ g_linx4 = 
_mm_add_ps(g_linx4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[1][2]))); + -+ b_linx8 = _mm256_mul_ps(b_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][2])); -+ b_linx8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][0]), b_linx8); -+ b_linx8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)(*rgb2rgb)[2][1]), b_linx8); ++ b_linx4 = _mm_mul_ps(b_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][2])); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][0]))); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)(*rgb2rgb)[2][1]))); + } + + if (desat > 0) { -+ __m256 eps_x8 = _mm256_set1_ps(FLOAT_EPS); -+ __m256 desat8 = _mm256_set1_ps((float)desat); -+ __m256 luma8 = _mm256_set1_ps(0); -+ __m256 overbright8; ++ __m128 eps_x4 = _mm_set1_ps(FLOAT_EPS); ++ __m128 desat4 = _mm_set1_ps((float)desat); ++ __m128 luma4 = _mm_set1_ps(0); ++ __m128 overbright4; + -+ luma8 = _mm256_fmadd_ps(r_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cr)), luma8); -+ luma8 = _mm256_fmadd_ps(g_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cg)), luma8); -+ luma8 = _mm256_fmadd_ps(b_linx8, _mm256_set1_ps((float)av_q2d(coeffs->cb)), luma8); -+ overbright8 = _mm256_div_ps(_mm256_max_ps(_mm256_sub_ps(luma8, desat8), eps_x8), _mm256_max_ps(luma8, eps_x8)); -+ r_linx8 = _mm256_fnmadd_ps(r_linx8, overbright8, r_linx8); -+ r_linx8 = _mm256_fmadd_ps(luma8, overbright8, r_linx8); -+ g_linx8 = _mm256_fnmadd_ps(g_linx8, overbright8, g_linx8); -+ g_linx8 = _mm256_fmadd_ps(luma8, overbright8, g_linx8); -+ b_linx8 = _mm256_fnmadd_ps(b_linx8, overbright8, b_linx8); -+ b_linx8 = _mm256_fmadd_ps(luma8, overbright8, b_linx8); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(r_linx4, _mm_set1_ps((float)av_q2d(coeffs->cr)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(g_linx4, _mm_set1_ps((float)av_q2d(coeffs->cg)))); ++ luma4 = _mm_add_ps(luma4, _mm_mul_ps(b_linx4, _mm_set1_ps((float)av_q2d(coeffs->cb)))); ++ overbright4 = _mm_div_ps(_mm_max_ps(_mm_sub_ps(luma4, desat4), eps_x4), _mm_max_ps(luma4, eps_x4)); ++ r_linx4 = _mm_sub_ps(r_linx4, _mm_mul_ps(r_linx4, overbright4)); ++ r_linx4 = _mm_add_ps(r_linx4, _mm_mul_ps(luma4, overbright4)); ++ g_linx4 = _mm_sub_ps(g_linx4, _mm_mul_ps(g_linx4, overbright4)); ++ g_linx4 = _mm_add_ps(g_linx4, _mm_mul_ps(luma4, overbright4)); ++ b_linx4 = _mm_sub_ps(b_linx4, _mm_mul_ps(b_linx4, overbright4)); ++ b_linx4 = _mm_add_ps(b_linx4, _mm_mul_ps(luma4, overbright4)); + } + -+ r_linx8 = _mm256_mul_ps(r_linx8, mapvalx8); -+ g_linx8 = _mm256_mul_ps(g_linx8, mapvalx8); -+ b_linx8 = _mm256_mul_ps(b_linx8, mapvalx8); ++ r_linx4 = _mm_mul_ps(r_linx4, mapvalx4); ++ g_linx4 = _mm_mul_ps(g_linx4, mapvalx4); ++ b_linx4 = _mm_mul_ps(b_linx4, mapvalx4); + -+ r_linx8 = _mm256_fmadd_ps(r_linx8, intermediate_upper_bound, offset); -+ g_linx8 = _mm256_fmadd_ps(g_linx8, intermediate_upper_bound, offset); -+ b_linx8 = _mm256_fmadd_ps(b_linx8, intermediate_upper_bound, offset); ++ r_linx4 = _mm_mul_ps(r_linx4, intermediate_upper_bound); ++ r_linx4 = _mm_add_ps(r_linx4, offset); + -+ rx8 = _mm256_cvttps_epi32(r_linx8); -+ rx8 = _mm256_min_epi32(rx8, upper_bound); -+ rx8 = _mm256_max_epi32(rx8, zerox8); ++ g_linx4 = _mm_mul_ps(g_linx4, intermediate_upper_bound); ++ g_linx4 = _mm_add_ps(g_linx4, offset); + -+ gx8 = _mm256_cvttps_epi32(g_linx8); -+ gx8 = _mm256_min_epi32(gx8, upper_bound); -+ gx8 = _mm256_max_epi32(gx8, zerox8); ++ b_linx4 = _mm_mul_ps(b_linx4, intermediate_upper_bound); ++ b_linx4 = _mm_add_ps(b_linx4, offset); + -+ bx8 = _mm256_cvttps_epi32(b_linx8); -+ bx8 = 
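
tonemap_int32x4_sse mirrors the per-pixel recipe of the C path, four pixels at a time: look up the tonemap gain from the peak channel, linearize each channel, optionally convert gamut and desaturate, scale by the gain, and send the result back through the inverse-transfer LUT. A stripped-down scalar sketch of that path (gamut conversion and desaturation omitted; the 2048 bias and 32767 scale are the constants used by the intrinsics, and clip15 is the helper sketched earlier):

    static void tonemap_pixel_sketch(int r_in, int g_in, int b_in,
                                     int16_t *r_out, int16_t *g_out, int16_t *b_out,
                                     const float *lin_lut,       /* signal -> linear light */
                                     const float *tonemap_lut,   /* peak signal -> gain */
                                     const uint16_t *delin_lut)  /* linear light -> signal */
    {
        int sig = r_in > g_in ? r_in : g_in;
        if (b_in > sig) sig = b_in;

        /* LUT indices are biased by 2048 so slightly negative RGB coming out of
         * the YUV->RGB matrix still lands inside the table */
        float gain = tonemap_lut[clip15(sig + 2048)];
        float r = lin_lut[clip15(r_in + 2048)] * gain;
        float g = lin_lut[clip15(g_in + 2048)] * gain;
        float b = lin_lut[clip15(b_in + 2048)] * gain;

        *r_out = delin_lut[clip15((int)(r * 32767.0f + 0.5f))];
        *g_out = delin_lut[clip15((int)(g * 32767.0f + 0.5f))];
        *b_out = delin_lut[clip15((int)(b * 32767.0f + 0.5f))];
    }
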
_mm256_min_epi32(bx8, upper_bound); -+ bx8 = _mm256_max_epi32(bx8, zerox8); ++ rx4 = _mm_cvttps_epi32(r_linx4); ++ rx4 = av_clip_uint16_sse(rx4); ++ gx4 = _mm_cvttps_epi32(g_linx4); ++ gx4 = av_clip_uint16_sse(gx4); ++ bx4 = _mm_cvttps_epi32(b_linx4); ++ bx4 = av_clip_uint16_sse(bx4); + -+#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm256_extract_epi32(rx8, i)]; \ -+g_out[i] = delin_lut[_mm256_extract_epi32(gx8, i)]; \ -+b_out[i] = delin_lut[_mm256_extract_epi32(bx8, i)]; ++#define SAVE_COLOR(i) r_out[i] = delin_lut[_mm_extract_epi32(rx4, i)]; \ ++g_out[i] = delin_lut[_mm_extract_epi32(gx4, i)]; \ ++b_out[i] = delin_lut[_mm_extract_epi32(bx4, i)]; + + SAVE_COLOR(0) + SAVE_COLOR(1) + SAVE_COLOR(2) + SAVE_COLOR(3) -+ SAVE_COLOR(4) -+ SAVE_COLOR(5) -+ SAVE_COLOR(6) -+ SAVE_COLOR(7) + +#undef SAVE_COLOR +} ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS + -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint8_t *rdsty = dsty; + uint8_t *rdstu = dstu; + uint8_t *rdstv = dstv; ++ + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcu = srcu; + const uint16_t *rsrcv = srcv; ++ + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -4406,248 +4524,240 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i ux8, vx8; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, 
y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstu += dstlinesize[1], dstv += dstlinesize[2], + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[2] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; -+ -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); ++ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, 
in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, 
_mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = 
_mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, 
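
For the 4:2:0 output, each U/V pair is computed from a 2x2 box average of the tonemapped RGB rather than from a single sample: the hadd/add/shift sequence here is the vector form of (a + b + c + d + 2) >> 2, and the averages then go through the chroma rows of the RGB->YUV matrix with the 8-bit output shift of 21 (29 - 8). A scalar sketch of one 2x2 block, with the clamp written out instead of relying on the saturating packs:

    /* cru, ocgu, cburv, ocgv, cbv, out_rnd and out_uv_offset are the same
     * coefficients/constants set up at the top of the function. */
    static void rgb_block_to_uv_sketch(const int16_t r[4], const int16_t g[4], const int16_t b[4],
                                       uint8_t *u, uint8_t *v,
                                       int cru, int ocgu, int cburv, int ocgv, int cbv,
                                       int out_rnd, int out_uv_offset)
    {
        int ravg = (r[0] + r[1] + r[2] + r[3] + 2) >> 2;   /* +2 rounds to nearest */
        int gavg = (g[0] + g[1] + g[2] + g[3] + 2) >> 2;
        int bavg = (b[0] + b[1] + b[2] + b[3] + 2) >> 2;

        int uo = ((ravg * cru   + gavg * ocgu + bavg * cburv + out_rnd) >> 21) + out_uv_offset;
        int vo = ((ravg * cburv + gavg * ocgv + bavg * cbv   + out_rnd) >> 21) + out_uv_offset;

        *u = uo < 0 ? 0 : uo > 255 ? 255 : uo;
        *v = vo < 0 ? 0 : vo > 255 ? 255 : vo;
    }
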
_mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packs_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ uox8 = _mm256_packus_epi16(uox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, 21); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstu[x >> 1], _mm_packus_epi16(_mm_packs_epi32(uox4, zero128), zero128)); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packs_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ vox8 = _mm256_packus_epi16(vox8, _mm256_setzero_si256()); -+ _mm_storeu_si64(&dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, 21); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si32(&dstv[x >> 1], _mm_packus_epi16(_mm_packs_epi32(vox4, zero128), zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstu += offset >> 1; + rdstv += offset >> 1; @@ -4660,15 +4770,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint16_t *rdsty = dsty; + uint16_t *rdstu = dstu; + uint16_t *rdstv = dstv; @@ -4678,7 +4790,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -4705,245 +4817,239 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = 
_mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; -+ __m256i ux8, vx8; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i ux4, vx4; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4; ++ __m128i uox4, vox4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstu += dstlinesize[1] / 2, dstv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcu += srclinesize[1] / 2, srcv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; -+ -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ ux8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcu + (x >> 1)))); -+ vx8 = _mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i_u *)(srcv + (x >> 1)))); ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ ux4 = _mm_loadu_si64((__m128i*)(srcu + (x >> 1))); ++ vx4 = _mm_loadu_si64((__m128i*)(srcv + (x >> 1))); + -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ ux8 = _mm256_sub_epi32(ux8, in_uv_offx8); -+ vx8 = _mm256_sub_epi32(vx8, in_uv_offx8); ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ ux4 = _mm_cvtepu16_epi32(ux4); ++ vx4 = _mm_cvtepu16_epi32(vx4); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ ux4 = _mm_sub_epi32(ux4, in_uv_offx4); ++ vx4 = _mm_sub_epi32(vx4, in_uv_offx4); + -+ ux8a = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(3, 3, 2, 
2, 1, 1, 0, 0)); -+ ux8b = _mm256_permutevar8x32_epi32(ux8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); -+ vx8a = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); -+ vx8b = _mm256_permutevar8x32_epi32(vx8, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); ++ ux4a = _mm_unpacklo_epi32(ux4, ux4); ++ ux4b = _mm_unpackhi_epi32(ux4, ux4); ++ vx4a = _mm_unpacklo_epi32(vx4, vx4); ++ vx4b = _mm_unpackhi_epi32(vx4, vx4); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); 
++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, 
params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ _mm_storeu_si128((__m128i_u *) 
&dsty[x], y0ox8); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ gavgx8 = _mm256_hadd_epi32(goax8, 
gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); -+ uox8 = _mm256_packus_epi32(uox8, _mm256_setzero_si256()); -+ uox8 = _mm256_permute4x64_epi64(uox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstu[x >> 1], _mm256_castsi256_si128(uox8)); ++ uox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uox4 = _mm_add_epi32(uox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uox4 = _mm_srai_epi32(uox4, out_sh); ++ uox4 = _mm_add_epi32(uox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstu[x >> 1], _mm_packus_epi32(uox4, zero128)); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); -+ vox8 = _mm256_packus_epi32(vox8, _mm256_setzero_si256()); -+ vox8 = _mm256_permute4x64_epi64(vox8, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dstv[x >> 1], _mm256_castsi256_si128(vox8)); ++ vox4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ vox4 = _mm_add_epi32(vox4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ vox4 = _mm_srai_epi32(vox4, out_sh); ++ vox4 = _mm_add_epi32(vox4, _mm_set1_epi32(out_uv_offset)); ++ _mm_storeu_si64((__m128i_u *) &dstv[x >> 1], _mm_packus_epi32(vox4, zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstu += offset >> 1; + rdstv += offset >> 1; @@ -4956,23 +5062,25 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t 
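
The vector loop only covers width rounded down to a multiple of 8 (16 in the AVX variants); whatever is left is handed to the plain C function on a shifted view of the planes, and an odd trailing pixel is intentionally left untouched. In outline (a sketch of the tail handling above, not part of the patch):

    int done    = width & ~7;   /* same as width & (int)0xfffffff8 */
    int remainw = width & 6;    /* leftover columns, even part only */

    if (remainw) {
        /* advance every dst/src pointer by `done` columns (chroma by done >> 1)
         * and let the scalar tonemap_frame_420p10_2_420p10() finish the tail */
    }
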
*dstuv, ++X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint8_t *rdsty = dsty; + uint8_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; + const uint16_t *rsrcuv = srcuv; + int rheight = height; -+ // not zero when not divisible by 16 ++ // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -4999,252 +5107,248 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4; + for (; height > 1; height -= 2, + dsty += dstlinesize[0] * 2, dstuv += dstlinesize[1], + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); + + if (in_depth == 10) { + // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ // shift bit has to be compile-time constant ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = 
_mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); + } ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); -+ -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); -+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = 
av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = 
_mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 
= _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ // output shift bits for 8bit outputs is 29 - 8 = 21 ++ yoax4 = _mm_srai_epi32(yoax4, 21); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, 21); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packs_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y0ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y0ox8 = _mm_packs_epi32(yoax4, yobx4); ++ _mm_storeu_si64(&dsty[x], _mm_packus_epi16(y0ox8, zero128)); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, 21); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, 
_mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, 21); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packs_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0]], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(y1ox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ y1ox8 = _mm_packs_epi32(y1oax4, y1obx4); ++ _mm_storeu_si64(&dsty[x + dstlinesize[0]], _mm_packus_epi16(y1ox8, zero128)); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, 21); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, 
_mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, 21); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packs_epi32(uvoax8, uvobx8); -+ _mm_storeu_si128((__m128i_u *) &dstuv[x], _mm256_castsi256_si128(_mm256_permute4x64_epi64(_mm256_packus_epi16(uvox16, _mm256_setzero_si256()), _MM_SHUFFLE(3, 1, 2, 0)))); ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ _mm_storeu_si64(&dstuv[x], _mm_packus_epi16(_mm_packs_epi32(uvoax4, uvobx4), zero128)); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstuv += offset; + rsrcy += offset; @@ -5255,15 +5359,17 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} + -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params) +{ ++#ifdef ENABLE_TONEMAPX_SSE_INTRINSICS + uint16_t *rdsty = dsty; + uint16_t *rdstuv = dstuv; + const uint16_t *rsrcy = srcy; @@ -5271,7 +5377,7 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int rheight = height; + // not zero when not divisible by 8 + // intentionally leave last pixel emtpy when input is odd -+ int remainw = width & 14; ++ int remainw = width & 6; + + const int in_depth = srcdepth; + const int in_uv_offset = 128 << (in_depth - 8); @@ -5299,254 +5405,251 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + int ocgv = (*params->rgb2yuv_coeffs)[2][1][0]; + int cbv = (*params->rgb2yuv_coeffs)[2][2][0]; + -+ int16_t r[16], g[16], b[16]; -+ int16_t r1[16], g1[16], b1[16]; -+ __m256i in_yuv_offx8 = _mm256_set1_epi32(params->in_yuv_off); -+ __m256i in_uv_offx8 = _mm256_set1_epi32(in_uv_offset); -+ __m256i cyx8 = _mm256_set1_epi32(cy); -+ __m256i rndx8 = _mm256_set1_epi32(in_rnd); ++ int16_t r[8], g[8], b[8]; ++ int16_t r1[8], g1[8], b1[8]; + -+ __m256i r0ox16, g0ox16, b0ox16; -+ __m256i y0ox16; -+ __m256i roax8, robx8, goax8, gobx8, boax8, bobx8; -+ __m256i yoax8, yobx8; -+ __m256i uvx16, uvx8a, uvx8b; -+ __m256i y0x16, y1x16; -+ __m256i y0x8a, y0x8b, y1x8a, y1x8b, ux8a, ux8b, vx8a, vx8b; -+ __m256i r0x8a, g0x8a, b0x8a, r0x8b, g0x8b, b0x8b; -+ __m256i r1x8a, g1x8a, b1x8a, r1x8b, g1x8b, b1x8b; ++ __m128i in_yuv_offx4 = _mm_set1_epi32(params->in_yuv_off); ++ __m128i in_uv_offx4= _mm_set1_epi32(in_uv_offset); ++ __m128i cyx4 = _mm_set1_epi32(cy); ++ __m128i rndx4 = _mm_set1_epi32(in_rnd); ++ __m128i zero128 = _mm_setzero_si128(); ++ __m128i uvx8, uvx4a, uvx4b; ++ __m128i y0x8, y1x8; ++ __m128i y0x4a, 
y0x4b, y1x4a, y1x4b, ux4a, ux4b, vx4a, vx4b; ++ __m128i r0x4a, g0x4a, b0x4a, r0x4b, g0x4b, b0x4b; ++ __m128i r1x4a, g1x4a, b1x4a, r1x4b, g1x4b, b1x4b; + -+ __m256i r1ox16, g1ox16, b1ox16; -+ __m256i y1ox16; -+ __m256i r1oax8, r1obx8, g1oax8, g1obx8, b1oax8, b1obx8; -+ __m256i y1oax8, y1obx8, uvoax8, uvobx8, uvox16; -+ __m256i uox8, vox8, ravgx8, gavgx8, bavgx8; ++ __m128i r0ox8, g0ox8, b0ox8; ++ __m128i y0ox8; ++ __m128i roax4, robx4, goax4, gobx4, boax4, bobx4; ++ __m128i yoax4, yobx4; ++ ++ __m128i r1ox8, g1ox8, b1ox8; ++ __m128i y1ox8; ++ __m128i r1oax4, r1obx4, g1oax4, g1obx4, b1oax4, b1obx4; ++ __m128i y1oax4, y1obx4, uvoax4, uvobx4; ++ __m128i uoax4, voax4, ravgx4, gavgx4, bavgx4, uvox8; + for (; height > 1; height -= 2, + dsty += dstlinesize[0], dstuv += dstlinesize[1] / 2, + srcy += srclinesize[0], srcuv += srclinesize[1] / 2) { -+ for (int xx = 0; xx < width >> 4; xx++) { -+ int x = xx << 4; ++ for (int xx = 0; xx < width >> 3; xx++) { ++ int x = xx << 3; + -+ y0x16 = _mm256_lddqu_si256((__m256i*)(srcy + x)); -+ y1x16 = _mm256_lddqu_si256((__m256i*)(srcy + (srclinesize[0] / 2 + x))); -+ uvx16 = _mm256_lddqu_si256((__m256i*)(srcuv + x)); ++ y0x8 = _mm_lddqu_si128((__m128i*)(srcy + x)); ++ y1x8 = _mm_lddqu_si128((__m128i*)(srcy + (srclinesize[0] / 2 + x))); ++ uvx8 = _mm_lddqu_si128((__m128i*)(srcuv + x)); + + if (in_depth == 10) { + // shift to low10bits for 10bit input -+ y0x16 = _mm256_srli_epi16(y0x16, 6); -+ y1x16 = _mm256_srli_epi16(y1x16, 6); -+ uvx16 = _mm256_srli_epi16(uvx16, 6); ++ // shift bit has to be compile-time constant ++ y0x8 = _mm_srli_epi16(y0x8, 6); ++ y1x8 = _mm_srli_epi16(y1x8, 6); ++ uvx8 = _mm_srli_epi16(uvx8, 6); + } ++ y0x4a = _mm_cvtepu16_epi32(y0x8); ++ y0x4b = _mm_unpackhi_epi16(y0x8, zero128); ++ y1x4a = _mm_cvtepu16_epi32(y1x8); ++ y1x4b = _mm_unpackhi_epi16(y1x8, zero128); ++ uvx4a = _mm_cvtepu16_epi32(uvx8); ++ uvx4b = _mm_unpackhi_epi16(uvx8, zero128); ++ y0x4a = _mm_sub_epi32(y0x4a, in_yuv_offx4); ++ y1x4a = _mm_sub_epi32(y1x4a, in_yuv_offx4); ++ y0x4b = _mm_sub_epi32(y0x4b, in_yuv_offx4); ++ y1x4b = _mm_sub_epi32(y1x4b, in_yuv_offx4); ++ uvx4a = _mm_sub_epi32(uvx4a, in_uv_offx4); ++ uvx4b = _mm_sub_epi32(uvx4b, in_uv_offx4); + -+ y0x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 0)); -+ y0x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y0x16, 1)); -+ y1x8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 0)); -+ y1x8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(y1x16, 1)); -+ uvx8a = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 0)); -+ uvx8b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(uvx16, 1)); -+ y0x8a = _mm256_sub_epi32(y0x8a, in_yuv_offx8); -+ y1x8a = _mm256_sub_epi32(y1x8a, in_yuv_offx8); -+ y0x8b = _mm256_sub_epi32(y0x8b, in_yuv_offx8); -+ y1x8b = _mm256_sub_epi32(y1x8b, in_yuv_offx8); -+ uvx8a = _mm256_sub_epi32(uvx8a, in_uv_offx8); -+ uvx8b = _mm256_sub_epi32(uvx8b, in_uv_offx8); -+ -+ ux8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(2, 2, 0, 0)); -+ ux8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(2, 2, 0, 0)); -+ vx8a = _mm256_shuffle_epi32(uvx8a, _MM_SHUFFLE(3, 3, 1, 1)); -+ vx8b = _mm256_shuffle_epi32(uvx8b, _MM_SHUFFLE(3, 3, 1, 1)); ++ ux4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(2, 2, 0, 0)); ++ ux4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(2, 2, 0, 0)); ++ vx4a = _mm_shuffle_epi32(uvx4a, _MM_SHUFFLE(3, 3, 1, 1)); ++ vx4b = _mm_shuffle_epi32(uvx4b, _MM_SHUFFLE(3, 3, 1, 1)); + + // r = av_clip_int16((y * cy + crv * v + in_rnd) >> in_sh); -+ r0x8a = g0x8a = b0x8a = _mm256_mullo_epi32(y0x8a, cyx8); 
-+ r0x8a = _mm256_add_epi32(r0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r0x8a = _mm256_add_epi32(r0x8a, rndx8); -+ r0x8a = _mm256_srai_epi32(r0x8a, in_sh); -+ r0x8a = av_clip_int16_avx(r0x8a); ++ r0x4a = g0x4a = b0x4a = _mm_mullo_epi32(y0x4a, cyx4); ++ r0x4a = _mm_add_epi32(r0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r0x4a = _mm_add_epi32(r0x4a, rndx4); ++ r0x4a = _mm_srai_epi32(r0x4a, in_sh); ++ r0x4a = av_clip_int16_sse(r0x4a); + -+ r1x8a = g1x8a = b1x8a = _mm256_mullo_epi32(y1x8a, cyx8); -+ r1x8a = _mm256_add_epi32(r1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(crv))); -+ r1x8a = _mm256_add_epi32(r1x8a, rndx8); -+ r1x8a = _mm256_srai_epi32(r1x8a, in_sh); -+ r1x8a = av_clip_int16_avx(r1x8a); ++ r1x4a = g1x4a = b1x4a = _mm_mullo_epi32(y1x4a, cyx4); ++ r1x4a = _mm_add_epi32(r1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(crv))); ++ r1x4a = _mm_add_epi32(r1x4a, rndx4); ++ r1x4a = _mm_srai_epi32(r1x4a, in_sh); ++ r1x4a = av_clip_int16_sse(r1x4a); + + // g = av_clip_int16((y * cy + cgu * u + cgv * v + in_rnd) >> in_sh); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g0x8a = _mm256_add_epi32(g0x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g0x8a = _mm256_add_epi32(g0x8a, rndx8); -+ g0x8a = _mm256_srai_epi32(g0x8a, in_sh); -+ g0x8a = av_clip_int16_avx(g0x8a); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g0x4a = _mm_add_epi32(g0x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g0x4a = _mm_add_epi32(g0x4a, rndx4); ++ g0x4a = _mm_srai_epi32(g0x4a, in_sh); ++ g0x4a = av_clip_int16_sse(g0x4a); + -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cgu))); -+ g1x8a = _mm256_add_epi32(g1x8a, _mm256_mullo_epi32(vx8a, _mm256_set1_epi32(cgv))); -+ g1x8a = _mm256_add_epi32(g1x8a, rndx8); -+ g1x8a = _mm256_srai_epi32(g1x8a, in_sh); -+ g1x8a = av_clip_int16_avx(g1x8a); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cgu))); ++ g1x4a = _mm_add_epi32(g1x4a, _mm_mullo_epi32(vx4a, _mm_set1_epi32(cgv))); ++ g1x4a = _mm_add_epi32(g1x4a, rndx4); ++ g1x4a = _mm_srai_epi32(g1x4a, in_sh); ++ g1x4a = av_clip_int16_sse(g1x4a); + + // b = av_clip_int16((y * cy + cbu * u + in_rnd) >> in_sh); -+ b0x8a = _mm256_add_epi32(b0x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b0x8a = _mm256_add_epi32(b0x8a, rndx8); -+ b0x8a = _mm256_srai_epi32(b0x8a, in_sh); -+ b0x8a = av_clip_int16_avx(b0x8a); ++ b0x4a = _mm_add_epi32(b0x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b0x4a = _mm_add_epi32(b0x4a, rndx4); ++ b0x4a = _mm_srai_epi32(b0x4a, in_sh); ++ b0x4a = av_clip_int16_sse(b0x4a); + -+ b1x8a = _mm256_add_epi32(b1x8a, _mm256_mullo_epi32(ux8a, _mm256_set1_epi32(cbu))); -+ b1x8a = _mm256_add_epi32(b1x8a, rndx8); -+ b1x8a = _mm256_srai_epi32(b1x8a, in_sh); -+ b1x8a = av_clip_int16_avx(b1x8a); ++ b1x4a = _mm_add_epi32(b1x4a, _mm_mullo_epi32(ux4a, _mm_set1_epi32(cbu))); ++ b1x4a = _mm_add_epi32(b1x4a, rndx4); ++ b1x4a = _mm_srai_epi32(b1x4a, in_sh); ++ b1x4a = av_clip_int16_sse(b1x4a); + -+ r0x8b = g0x8b = b0x8b = _mm256_mullo_epi32(y0x8b, cyx8); -+ r0x8b = _mm256_add_epi32(r0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r0x8b = _mm256_add_epi32(r0x8b, rndx8); -+ r0x8b = _mm256_srai_epi32(r0x8b, in_sh); -+ r0x8b = av_clip_int16_avx(r0x8b); ++ r0x4b = g0x4b = b0x4b = _mm_mullo_epi32(y0x4b, cyx4); ++ r0x4b = _mm_add_epi32(r0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r0x4b = _mm_add_epi32(r0x4b, rndx4); ++ r0x4b = _mm_srai_epi32(r0x4b, 
in_sh); ++ r0x4b = av_clip_int16_sse(r0x4b); + -+ r1x8b = g1x8b = b1x8b = _mm256_mullo_epi32(y1x8b, cyx8); -+ r1x8b = _mm256_add_epi32(r1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(crv))); -+ r1x8b = _mm256_add_epi32(r1x8b, rndx8); -+ r1x8b = _mm256_srai_epi32(r1x8b, in_sh); -+ r1x8b = av_clip_int16_avx(r1x8b); ++ r1x4b = g1x4b = b1x4b = _mm_mullo_epi32(y1x4b, cyx4); ++ r1x4b = _mm_add_epi32(r1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(crv))); ++ r1x4b = _mm_add_epi32(r1x4b, rndx4); ++ r1x4b = _mm_srai_epi32(r1x4b, in_sh); ++ r1x4b = av_clip_int16_sse(r1x4b); + -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g0x8b = _mm256_add_epi32(g0x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g0x8b = _mm256_add_epi32(g0x8b, rndx8); -+ g0x8b = _mm256_srai_epi32(g0x8b, in_sh); -+ g0x8b = av_clip_int16_avx(g0x8b); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g0x4b = _mm_add_epi32(g0x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g0x4b = _mm_add_epi32(g0x4b, rndx4); ++ g0x4b = _mm_srai_epi32(g0x4b, in_sh); ++ g0x4b = av_clip_int16_sse(g0x4b); + -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cgu))); -+ g1x8b = _mm256_add_epi32(g1x8b, _mm256_mullo_epi32(vx8b, _mm256_set1_epi32(cgv))); -+ g1x8b = _mm256_add_epi32(g1x8b, rndx8); -+ g1x8b = _mm256_srai_epi32(g1x8b, in_sh); -+ g1x8b = av_clip_int16_avx(g1x8b); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cgu))); ++ g1x4b = _mm_add_epi32(g1x4b, _mm_mullo_epi32(vx4b, _mm_set1_epi32(cgv))); ++ g1x4b = _mm_add_epi32(g1x4b, rndx4); ++ g1x4b = _mm_srai_epi32(g1x4b, in_sh); ++ g1x4b = av_clip_int16_sse(g1x4b); + -+ b0x8b = _mm256_add_epi32(b0x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b0x8b = _mm256_add_epi32(b0x8b, rndx8); -+ b0x8b = _mm256_srai_epi32(b0x8b, in_sh); -+ b0x8b = av_clip_int16_avx(b0x8b); ++ b0x4b = _mm_add_epi32(b0x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b0x4b = _mm_add_epi32(b0x4b, rndx4); ++ b0x4b = _mm_srai_epi32(b0x4b, in_sh); ++ b0x4b = av_clip_int16_sse(b0x4b); + -+ b1x8b = _mm256_add_epi32(b1x8b, _mm256_mullo_epi32(ux8b, _mm256_set1_epi32(cbu))); -+ b1x8b = _mm256_add_epi32(b1x8b, rndx8); -+ b1x8b = _mm256_srai_epi32(b1x8b, in_sh); -+ b1x8b = av_clip_int16_avx(b1x8b); ++ b1x4b = _mm_add_epi32(b1x4b, _mm_mullo_epi32(ux4b, _mm_set1_epi32(cbu))); ++ b1x4b = _mm_add_epi32(b1x4b, rndx4); ++ b1x4b = _mm_srai_epi32(b1x4b, in_sh); ++ b1x4b = av_clip_int16_sse(b1x4b); + -+ tonemap_int32x8_avx(r0x8a, g0x8a, b0x8a, r, g, b, ++ tonemap_int32x4_sse(r0x4a, g0x4a, b0x4a, r, g, b, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8a, g1x8a, b1x8a, r1, g1, b1, ++ tonemap_int32x4_sse(r1x4a, g1x4a, b1x4a, r1, g1, b1, + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r0x8b, g0x8b, b0x8b, &r[8], &g[8], &b[8], ++ tonemap_int32x4_sse(r0x4b, g0x4b, b0x4b, &r[4], &g[4], &b[4], + params->lin_lut, params->tonemap_lut, params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); -+ tonemap_int32x8_avx(r1x8b, g1x8b, b1x8b, &r1[8], &g1[8], &b1[8], ++ tonemap_int32x4_sse(r1x4b, g1x4b, b1x4b, &r1[4], &g1[4], &b1[4], + params->lin_lut, params->tonemap_lut, 
params->delin_lut, + params->coeffs, params->ocoeffs, params->desat, params->rgb2rgb_coeffs, + params->rgb2rgb_passthrough); + -+ r0ox16 = _mm256_lddqu_si256((const __m256i_u *)r); -+ g0ox16 = _mm256_lddqu_si256((const __m256i_u *)g); -+ b0ox16 = _mm256_lddqu_si256((const __m256i_u *)b); ++ r0ox8 = _mm_lddqu_si128((const __m128i_u *)r); ++ g0ox8 = _mm_lddqu_si128((const __m128i_u *)g); ++ b0ox8 = _mm_lddqu_si128((const __m128i_u *)b); + -+ roax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 0)); -+ goax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 0)); -+ boax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 0)); ++ roax4 = _mm_cvtepi16_epi32(r0ox8); ++ goax4 = _mm_cvtepi16_epi32(g0ox8); ++ boax4 = _mm_cvtepi16_epi32(b0ox8); + -+ robx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r0ox16, 1)); -+ gobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g0ox16, 1)); -+ bobx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b0ox16, 1)); ++ robx4 = _mm_unpackhi_epi16(r0ox8, zero128); ++ gobx4 = _mm_unpackhi_epi16(g0ox8, zero128); ++ bobx4 = _mm_unpackhi_epi16(b0ox8, zero128); + -+ yoax8 = _mm256_mullo_epi32(roax8, _mm256_set1_epi32(cry)); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(goax8, _mm256_set1_epi32(cgy))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_mullo_epi32(boax8, _mm256_set1_epi32(cby))); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(out_rnd)); -+ yoax8 = _mm256_srai_epi32(yoax8, out_sh); -+ yoax8 = _mm256_add_epi32(yoax8, _mm256_set1_epi32(params->out_yuv_off)); ++ yoax4 = _mm_mullo_epi32(roax4, _mm_set1_epi32(cry)); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(goax4, _mm_set1_epi32(cgy))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_mullo_epi32(boax4, _mm_set1_epi32(cby))); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(out_rnd)); ++ yoax4 = _mm_srai_epi32(yoax4, out_sh); ++ yoax4 = _mm_add_epi32(yoax4, _mm_set1_epi32(params->out_yuv_off)); + -+ yobx8 = _mm256_mullo_epi32(robx8, _mm256_set1_epi32(cry)); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(gobx8, _mm256_set1_epi32(cgy))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_mullo_epi32(bobx8, _mm256_set1_epi32(cby))); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(out_rnd)); -+ yobx8 = _mm256_srai_epi32(yobx8, out_sh); -+ yobx8 = _mm256_add_epi32(yobx8, _mm256_set1_epi32(params->out_yuv_off)); ++ yobx4 = _mm_mullo_epi32(robx4, _mm_set1_epi32(cry)); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(gobx4, _mm_set1_epi32(cgy))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_mullo_epi32(bobx4, _mm_set1_epi32(cby))); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(out_rnd)); ++ yobx4 = _mm_srai_epi32(yobx4, out_sh); ++ yobx4 = _mm_add_epi32(yobx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y0ox16 = _mm256_packus_epi32(yoax8, yobx8); -+ y0ox16 = _mm256_permute4x64_epi64(y0ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y0ox16 = _mm256_slli_epi16(y0ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x], y0ox16); ++ y0ox8 = _mm_packus_epi32(yoax4, yobx4); ++ y0ox8 = _mm_slli_epi16(y0ox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dsty[x], y0ox8); + -+ r1ox16 = _mm256_lddqu_si256((const __m256i_u *)r1); -+ g1ox16 = _mm256_lddqu_si256((const __m256i_u *)g1); -+ b1ox16 = _mm256_lddqu_si256((const __m256i_u *)b1); ++ r1ox8 = _mm_lddqu_si128((const __m128i_u *)r1); ++ g1ox8 = _mm_lddqu_si128((const __m128i_u *)g1); ++ b1ox8 = _mm_lddqu_si128((const __m128i_u *)b1); + -+ r1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 0)); -+ g1oax8 = 
_mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 0)); -+ b1oax8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 0)); ++ r1oax4 = _mm_cvtepi16_epi32(r1ox8); ++ g1oax4 = _mm_cvtepi16_epi32(g1ox8); ++ b1oax4 = _mm_cvtepi16_epi32(b1ox8); + -+ r1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(r1ox16, 1)); -+ g1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(g1ox16, 1)); -+ b1obx8 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(b1ox16, 1)); ++ r1obx4 = _mm_unpackhi_epi16(r1ox8, zero128); ++ g1obx4 = _mm_unpackhi_epi16(g1ox8, zero128); ++ b1obx4 = _mm_unpackhi_epi16(b1ox8, zero128); + -+ y1oax8 = _mm256_mullo_epi32(r1oax8, _mm256_set1_epi32(cry)); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(g1oax8, _mm256_set1_epi32(cgy))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_mullo_epi32(b1oax8, _mm256_set1_epi32(cby))); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(out_rnd)); -+ y1oax8 = _mm256_srai_epi32(y1oax8, out_sh); -+ y1oax8 = _mm256_add_epi32(y1oax8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1oax4 = _mm_mullo_epi32(r1oax4, _mm_set1_epi32(cry)); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(g1oax4, _mm_set1_epi32(cgy))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_mullo_epi32(b1oax4, _mm_set1_epi32(cby))); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(out_rnd)); ++ y1oax4 = _mm_srai_epi32(y1oax4, out_sh); ++ y1oax4 = _mm_add_epi32(y1oax4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1obx8 = _mm256_mullo_epi32(r1obx8, _mm256_set1_epi32(cry)); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(g1obx8, _mm256_set1_epi32(cgy))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_mullo_epi32(b1obx8, _mm256_set1_epi32(cby))); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(out_rnd)); -+ y1obx8 = _mm256_srai_epi32(y1obx8, out_sh); -+ y1obx8 = _mm256_add_epi32(y1obx8, _mm256_set1_epi32(params->out_yuv_off)); ++ y1obx4 = _mm_mullo_epi32(r1obx4, _mm_set1_epi32(cry)); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(g1obx4, _mm_set1_epi32(cgy))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_mullo_epi32(b1obx4, _mm_set1_epi32(cby))); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(out_rnd)); ++ y1obx4 = _mm_srai_epi32(y1obx4, out_sh); ++ y1obx4 = _mm_add_epi32(y1obx4, _mm_set1_epi32(params->out_yuv_off)); + -+ y1ox16 = _mm256_packus_epi32(y1oax8, y1obx8); -+ y1ox16 = _mm256_permute4x64_epi64(y1ox16, _MM_SHUFFLE(3, 1, 2, 0)); -+ y1ox16 = _mm256_slli_epi16(y1ox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dsty[x + dstlinesize[0] / 2], y1ox16); ++ y1ox8 = _mm_packus_epi32(y1oax4, y1obx4); ++ y1ox8 = _mm_slli_epi16(y1ox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dsty[x + dstlinesize[0] / 2], y1ox8); + -+ ravgx8 = _mm256_hadd_epi32(roax8, robx8); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_hadd_epi32(r1oax8, r1obx8)); -+ ravgx8 = _mm256_permute4x64_epi64(ravgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ ravgx8 = _mm256_add_epi32(ravgx8, _mm256_set1_epi32(2)); -+ ravgx8 = _mm256_srai_epi32(ravgx8, 2); ++ ravgx4 = _mm_hadd_epi32(roax4, robx4); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_hadd_epi32(r1oax4, r1obx4)); ++ ravgx4 = _mm_add_epi32(ravgx4, _mm_set1_epi32(2)); ++ ravgx4 = _mm_srai_epi32(ravgx4, 2); + -+ gavgx8 = _mm256_hadd_epi32(goax8, gobx8); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_hadd_epi32(g1oax8, g1obx8)); -+ gavgx8 = _mm256_permute4x64_epi64(gavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ gavgx8 = _mm256_add_epi32(gavgx8, _mm256_set1_epi32(2)); -+ gavgx8 = _mm256_srai_epi32(gavgx8, 2); ++ gavgx4 = _mm_hadd_epi32(goax4, gobx4); ++ gavgx4 = _mm_add_epi32(gavgx4, 
_mm_hadd_epi32(g1oax4, g1obx4)); ++ gavgx4 = _mm_add_epi32(gavgx4, _mm_set1_epi32(2)); ++ gavgx4 = _mm_srai_epi32(gavgx4, 2); + -+ bavgx8 = _mm256_hadd_epi32(boax8, bobx8); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_hadd_epi32(b1oax8, b1obx8)); -+ bavgx8 = _mm256_permute4x64_epi64(bavgx8, _MM_SHUFFLE(3, 1, 2, 0)); -+ bavgx8 = _mm256_add_epi32(bavgx8, _mm256_set1_epi32(2)); -+ bavgx8 = _mm256_srai_epi32(bavgx8, 2); ++ bavgx4 = _mm_hadd_epi32(boax4, bobx4); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_hadd_epi32(b1oax4, b1obx4)); ++ bavgx4 = _mm_add_epi32(bavgx4, _mm_set1_epi32(2)); ++ bavgx4 = _mm_srai_epi32(bavgx4, 2); + -+ uox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cru))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgu))); -+ uox8 = _mm256_add_epi32(uox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cburv))); -+ uox8 = _mm256_srai_epi32(uox8, out_sh); -+ uox8 = _mm256_add_epi32(uox8, _mm256_set1_epi32(out_uv_offset)); ++ uoax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cru))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgu))); ++ uoax4 = _mm_add_epi32(uoax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cburv))); ++ uoax4 = _mm_srai_epi32(uoax4, out_sh); ++ uoax4 = _mm_add_epi32(uoax4, _mm_set1_epi32(out_uv_offset)); + -+ vox8 = _mm256_add_epi32(_mm256_set1_epi32(out_rnd), _mm256_mullo_epi32(ravgx8, _mm256_set1_epi32(cburv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(gavgx8, _mm256_set1_epi32(ocgv))); -+ vox8 = _mm256_add_epi32(vox8, _mm256_mullo_epi32(bavgx8, _mm256_set1_epi32(cbv))); -+ vox8 = _mm256_srai_epi32(vox8, out_sh); -+ vox8 = _mm256_add_epi32(vox8, _mm256_set1_epi32(out_uv_offset)); ++ voax4 = _mm_add_epi32(_mm_set1_epi32(out_rnd), _mm_mullo_epi32(ravgx4, _mm_set1_epi32(cburv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(gavgx4, _mm_set1_epi32(ocgv))); ++ voax4 = _mm_add_epi32(voax4, _mm_mullo_epi32(bavgx4, _mm_set1_epi32(cbv))); ++ voax4 = _mm_srai_epi32(voax4, out_sh); ++ voax4 = _mm_add_epi32(voax4, _mm_set1_epi32(out_uv_offset)); + -+ uvoax8 = _mm256_unpacklo_epi32(uox8, vox8); -+ uvobx8 = _mm256_unpackhi_epi32(uox8, vox8); -+ uvox16 = _mm256_packus_epi32(uvoax8, uvobx8); -+ uvox16 = _mm256_slli_epi16(uvox16, out_sh2); -+ _mm256_storeu_si256((__m256i_u *) &dstuv[x], uvox16); ++ uvoax4 = _mm_unpacklo_epi32(uoax4, voax4); ++ uvobx4 = _mm_unpackhi_epi32(uoax4, voax4); ++ uvox8 = _mm_packus_epi32(uvoax4, uvobx4); ++ uvox8 = _mm_slli_epi16(uvox8, out_sh2); ++ _mm_storeu_si128((__m128i_u *) &dstuv[x], uvox8); + } + } + + // Process remaining pixels cannot fill the full simd register with scalar version + if (remainw) { -+ int offset = width & (int)0xfffffff0; ++ int offset = width & (int)0xfffffff8; + rdsty += offset; + rdstuv += offset; + rsrcy += offset; @@ -5557,12 +5660,13 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + dstdepth, srcdepth, + remainw, rheight, params); + } ++#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} -Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h =================================================================== --- /dev/null -+++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h -@@ -0,0 +1,58 @@ ++++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h +@@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Gnattu OC + * @@ -5583,41 +5687,37 @@ Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h + * Foundation, Inc., 51 
Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + -+#ifndef AVFILTER_TONEMAPX_INTRIN_AVX_H -+#define AVFILTER_TONEMAPX_INTRIN_AVX_H -+ -+#include -+#include -+#include ++#ifndef AVFILTER_TONEMAPX_INTRIN_SSE_H ++#define AVFILTER_TONEMAPX_INTRIN_SSE_H + +#include "libavfilter/vf_tonemapx.h" + -+X86_64_V3 void tonemap_frame_420p10_2_420p_avx(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, ++X86_64_V2 void tonemap_frame_420p10_2_420p_sse(uint8_t *dsty, uint8_t *dstu, uint8_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V3 void tonemap_frame_420p10_2_420p10_avx(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, ++X86_64_V2 void tonemap_frame_420p10_2_420p10_sse(uint16_t *dsty, uint16_t *dstu, uint16_t *dstv, + const uint16_t *srcy, const uint16_t *srcu, const uint16_t *srcv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V3 void tonemap_frame_p016_p010_2_nv12_avx(uint8_t *dsty, uint8_t *dstuv, ++X86_64_V2 void tonemap_frame_p016_p010_2_nv12_sse(uint8_t *dsty, uint8_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+X86_64_V3 void tonemap_frame_p016_p010_2_p016_p010_avx(uint16_t *dsty, uint16_t *dstuv, ++X86_64_V2 void tonemap_frame_p016_p010_2_p016_p010_sse(uint16_t *dsty, uint16_t *dstuv, + const uint16_t *srcy, const uint16_t *srcuv, + const int *dstlinesize, const int *srclinesize, + int dstdepth, int srcdepth, + int width, int height, + const struct TonemapIntParams *params); + -+#endif //AVFILTER_TONEMAPX_INTRIN_AVX_H ++#endif // AVFILTER_TONEMAPX_INTRIN_SSE_H From 5f0da24174dfbe6ac59fdd34c7973e970daba7eb Mon Sep 17 00:00:00 2001 From: gnattu Date: Wed, 3 Jul 2024 12:06:47 +0800 Subject: [PATCH 26/27] avfilter/tonemapx: allow gcc9 to use intrinsics --- debian/patches/0080-add-tonemapx-filter.patch | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index ec7516f4eb2..733e96936ad 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -1,21 +1,21 @@ -Index: jellyfin-ffmpeg/configure +Index: FFmpeg/configure =================================================================== ---- jellyfin-ffmpeg.orig/configure -+++ jellyfin-ffmpeg/configure +--- FFmpeg.orig/configure ++++ FFmpeg/configure @@ -2211,6 +2211,9 @@ HEADERS_LIST=" - + INTRINSICS_LIST=" intrinsics_neon + intrinsics_sse42 + intrinsics_avx2 + intrinsics_fma3 " - + MATH_FUNCS=" @@ -2676,6 +2679,10 @@ avx2_deps="avx" avx512_deps="avx2" avx512icl_deps="avx512" - + +intrinsics_sse42_deps="sse42" +intrinsics_fma3_deps="fma3" +intrinsics_avx2_deps="avx2" @@ -32,9 +32,9 @@ Index: jellyfin-ffmpeg/configure tonemap_opencl_filter_deps="opencl const_nan" tonemap_videotoolbox_filter_deps="metal corevideo videotoolbox const_nan" @@ -6230,6 +6238,19 @@ fi - + check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)" - + +disable intrinsics_sse42 && test_cc -msse4.2 < +int main(void) { __m128i t = _mm_cmpgt_epi64(_mm_setzero_si128(), _mm_setzero_si128()); return 0; } @@ -50,7 +50,7 @@ Index: 
jellyfin-ffmpeg/configure + check_ldflags -Wl,--as-needed check_ldflags -Wl,-z,noexecstack - + @@ -7295,6 +7316,16 @@ elif enabled gcc; then check_cflags -mpreferred-stack-boundary=4 ;; @@ -68,10 +68,10 @@ Index: jellyfin-ffmpeg/configure fi elif enabled llvm_gcc; then check_cflags -mllvm -stack-alignment=16 -Index: jellyfin-ffmpeg/libavfilter/Makefile +Index: FFmpeg/libavfilter/Makefile =================================================================== ---- jellyfin-ffmpeg.orig/libavfilter/Makefile -+++ jellyfin-ffmpeg/libavfilter/Makefile +--- FFmpeg.orig/libavfilter/Makefile ++++ FFmpeg/libavfilter/Makefile @@ -516,6 +516,7 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o @@ -80,19 +80,19 @@ Index: jellyfin-ffmpeg/libavfilter/Makefile OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ opencl/tonemap.o opencl/colorspace_common.o OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o cuda/tonemap.ptx.o \ -Index: jellyfin-ffmpeg/libavfilter/aarch64/Makefile +Index: FFmpeg/libavfilter/aarch64/Makefile =================================================================== ---- jellyfin-ffmpeg.orig/libavfilter/aarch64/Makefile -+++ jellyfin-ffmpeg/libavfilter/aarch64/Makefile +--- FFmpeg.orig/libavfilter/aarch64/Makefile ++++ FFmpeg/libavfilter/aarch64/Makefile @@ -1,3 +1,4 @@ OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o +OBJS-$(CONFIG_TONEMAPX_FILTER) += aarch64/vf_tonemapx_intrin_neon.o - + NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o -Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c +Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c ++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c @@ -0,0 +1,1229 @@ +/* + * Copyright (c) 2024 Gnattu OC @@ -1323,10 +1323,10 @@ Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c + } +#endif // ENABLE_TONEMAPX_NEON_INTRINSICS +} -Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h +Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h ++++ FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Gnattu OC @@ -1382,10 +1382,10 @@ Index: jellyfin-ffmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.h + const struct TonemapIntParams *params); + +#endif // AVFILTER_TONEMAPX_INTRIN_NEON_H -Index: jellyfin-ffmpeg/libavfilter/allfilters.c +Index: FFmpeg/libavfilter/allfilters.c =================================================================== ---- jellyfin-ffmpeg.orig/libavfilter/allfilters.c -+++ jellyfin-ffmpeg/libavfilter/allfilters.c +--- FFmpeg.orig/libavfilter/allfilters.c ++++ FFmpeg/libavfilter/allfilters.c @@ -484,6 +484,7 @@ extern const AVFilter ff_vf_tmedian; extern const AVFilter ff_vf_tmidequalizer; extern const AVFilter ff_vf_tmix; @@ -1394,14 +1394,14 @@ Index: jellyfin-ffmpeg/libavfilter/allfilters.c extern const AVFilter ff_vf_tonemap_cuda; extern const AVFilter ff_vf_tonemap_opencl; extern const AVFilter ff_vf_tonemap_vaapi; -Index: jellyfin-ffmpeg/libavfilter/colorspace.c +Index: FFmpeg/libavfilter/colorspace.c =================================================================== ---- 
jellyfin-ffmpeg.orig/libavfilter/colorspace.c -+++ jellyfin-ffmpeg/libavfilter/colorspace.c +--- FFmpeg.orig/libavfilter/colorspace.c ++++ FFmpeg/libavfilter/colorspace.c @@ -17,6 +17,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ - + +#include "libavutil/avassert.h" #include "libavutil/frame.h" #include "libavutil/mastering_display_metadata.h" @@ -1458,23 +1458,23 @@ Index: jellyfin-ffmpeg/libavfilter/colorspace.c + av_assert2(out[1][2][0] == out[2][0][0]); + } +} -Index: jellyfin-ffmpeg/libavfilter/colorspace.h +Index: FFmpeg/libavfilter/colorspace.h =================================================================== ---- jellyfin-ffmpeg.orig/libavfilter/colorspace.h -+++ jellyfin-ffmpeg/libavfilter/colorspace.h +--- FFmpeg.orig/libavfilter/colorspace.h ++++ FFmpeg/libavfilter/colorspace.h @@ -85,4 +85,8 @@ float eotf_arib_b67(float x); float inverse_eotf_arib_b67(float x); float inverse_eotf_bt1886(float x); - + +int ff_get_range_off(int *off, int *y_rng, int *uv_rng, + enum AVColorRange rng, int depth); +void ff_get_yuv_coeffs(int16_t out[3][3][8], double (*table)[3], + int depth, int y_rng, int uv_rng, int yuv2rgb); #endif -Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.c +Index: FFmpeg/libavfilter/vf_tonemapx.c =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/vf_tonemapx.c ++++ FFmpeg/libavfilter/vf_tonemapx.c @@ -0,0 +1,1267 @@ +/* + * This file is part of FFmpeg. @@ -2743,10 +2743,10 @@ Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.c + FILTER_QUERY_FUNC(query_formats), + .flags = AVFILTER_FLAG_SLICE_THREADS, +}; -Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.h +Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/vf_tonemapx.h ++++ FFmpeg/libavfilter/vf_tonemapx.h @@ -0,0 +1,106 @@ +/* + * This file is part of FFmpeg. 
@@ -2783,7 +2783,7 @@ Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.h +#endif // defined(__GNUC__) || defined(__clang__) + +#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 10) || (__clang_major__ >= 11) ++# if (__GNUC__ >= 9) || (__clang_major__ >= 11) +# define CC_SUPPORTS_TONEMAPX_INTRINSICS +# endif // (__GNUC__ >= 10) || (__clang_major__ >= 11) +#endif // defined(__GNUC__) || defined(__clang__) @@ -2854,23 +2854,23 @@ Index: jellyfin-ffmpeg/libavfilter/vf_tonemapx.h + const struct TonemapIntParams *params); + +#endif //AVFILTER_TONEMAPX_H -Index: jellyfin-ffmpeg/libavfilter/x86/Makefile +Index: FFmpeg/libavfilter/x86/Makefile =================================================================== ---- jellyfin-ffmpeg.orig/libavfilter/x86/Makefile -+++ jellyfin-ffmpeg/libavfilter/x86/Makefile +--- FFmpeg.orig/libavfilter/x86/Makefile ++++ FFmpeg/libavfilter/x86/Makefile @@ -39,6 +39,8 @@ OBJS-$(CONFIG_VOLUME_FILTER) OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +OBJS-$(CONFIG_TONEMAPX_FILTER) += x86/vf_tonemapx_intrin_sse.o \ + x86/vf_tonemapx_intrin_avx.o - + X86ASM-OBJS-$(CONFIG_SCENE_SAD) += x86/scene_sad.o - -Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2024 Gnattu OC @@ -4239,10 +4239,10 @@ Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.c + } +#endif // ENABLE_TONEMAPX_AVX_INTRINSICS +} -Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Gnattu OC @@ -4298,10 +4298,10 @@ Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_avx.h + const struct TonemapIntParams *params); + +#endif // AVFILTER_TONEMAPX_INTRIN_AVX_H -Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c @@ -0,0 +1,1359 @@ +/* + * Copyright (c) 2024 Gnattu OC @@ -5662,10 +5662,10 @@ Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.c + } +#endif // ENABLE_TONEMAPX_SSE_INTRINSICS +} -Index: jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h +Index: FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h =================================================================== --- /dev/null -+++ jellyfin-ffmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h ++++ FFmpeg/libavfilter/x86/vf_tonemapx_intrin_sse.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024 Gnattu OC From 10cd1a13c741e99630b7c13bf9a8adb4fe58a844 Mon Sep 17 00:00:00 2001 From: gnattu Date: Sun, 7 Jul 2024 00:55:17 +0800 Subject: [PATCH 27/27] avfilter/tonemapx: always target specific extension --- debian/patches/0080-add-tonemapx-filter.patch | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git 
a/debian/patches/0080-add-tonemapx-filter.patch b/debian/patches/0080-add-tonemapx-filter.patch index 733e96936ad..ff6492200f8 100644 --- a/debian/patches/0080-add-tonemapx-filter.patch +++ b/debian/patches/0080-add-tonemapx-filter.patch @@ -2747,7 +2747,7 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h =================================================================== --- /dev/null +++ FFmpeg/libavfilter/vf_tonemapx.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,99 @@ +/* + * This file is part of FFmpeg. + * @@ -2772,15 +2772,8 @@ Index: FFmpeg/libavfilter/vf_tonemapx.h +#include "config.h" +#include "colorspace.h" + -+#if defined(__GNUC__) || defined(__clang__) -+# if (__GNUC__ >= 11) || (__clang_major__ >= 12) -+# define X86_64_V2 __attribute__((target("arch=x86-64-v2"))) -+# define X86_64_V3 __attribute__((target("arch=x86-64-v3"))) -+# else -+# define X86_64_V2 __attribute__((target("sse4.2"))) -+# define X86_64_V3 __attribute__((target("avx2,fma"))) -+# endif // (__GNUC__ >= 11) || (__clang_major__ >= 12) -+#endif // defined(__GNUC__) || defined(__clang__) ++#define X86_64_V2 __attribute__((target("sse4.2"))) ++#define X86_64_V3 __attribute__((target("avx2,fma"))) + +#if defined(__GNUC__) || defined(__clang__) +# if (__GNUC__ >= 9) || (__clang_major__ >= 11)