From 47552c8ce3ae7bd64d74b812aa71c7d52866bb85 Mon Sep 17 00:00:00 2001
From: Julek <31626592+dnjulek@users.noreply.github.com>
Date: Fri, 19 Jan 2024 12:59:50 -0300
Subject: [PATCH] back to true gauss because the recursive is causing some
 issues with video

---
 src/blur.zig        | 111 +++++++++++++++++++++
 src/rblur.zig       | 231 --------------------------------------------
 src/ssimulacra2.zig |  50 +++-------
 3 files changed, 122 insertions(+), 270 deletions(-)
 create mode 100644 src/blur.zig
 delete mode 100644 src/rblur.zig

diff --git a/src/blur.zig b/src/blur.zig
new file mode 100644
index 0000000..796b7f1
--- /dev/null
+++ b/src/blur.zig
@@ -0,0 +1,111 @@
+const std = @import("std");
+const allocator = std.heap.c_allocator;
+
+inline fn blur_h(srcp: anytype, dstp: [*]f32, kernel: [9]f32, width: usize) void {
+    const ksize: usize = 9;
+    const radius: usize = ksize >> 1;
+
+    var j: usize = 0;
+    while (j < @min(width, radius)) : (j += 1) {
+        const dist_from_right: usize = width - 1 - j;
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const idx: usize = if (j < radius - k) (@min(radius - k - j, width - 1)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const idx: usize = if (dist_from_right < k - radius) (j - @min(k - radius - dist_from_right, j)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        dstp[j] = accum;
+    }
+
+    j = radius;
+    while (j < width - @min(width, radius)) : (j += 1) {
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < ksize) : (k += 1) {
+            accum += kernel[k] * srcp[j - radius + k];
+        }
+
+        dstp[j] = accum;
+    }
+
+    j = @max(radius, width - @min(width, radius));
+    while (j < width) : (j += 1) {
+        const dist_from_right: usize = width - 1 - j;
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const idx: usize = if (j < radius - k) (@min(radius - k - j, width - 1)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const idx: usize = if (dist_from_right < k - radius) (j - @min(k - radius - dist_from_right, j)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        dstp[j] = accum;
+    }
+}
+
+inline fn blur_v(src: anytype, dstp: [*]f32, kernel: [9]f32, width: usize) void {
+    var j: usize = 0;
+    while (j < width) : (j += 1) {
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < 9) : (k += 1) {
+            accum += kernel[k] * src[k][j];
+        }
+
+        dstp[j] = accum;
+    }
+}
+
+pub inline fn process(src: [*]const f32, dst: [*]f32, stride: usize, width: usize, height: usize) void {
+    const kernel = [9]f32{
+        0.0076144188642501831054687500,
+        0.0360749699175357818603515625,
+        0.1095860823988914489746093750,
+        0.2134445458650588989257812500,
+        0.2665599882602691650390625000,
+        0.2134445458650588989257812500,
+        0.1095860823988914489746093750,
+        0.0360749699175357818603515625,
+        0.0076144188642501831054687500,
+    };
+
+    const ksize: usize = 9;
+    const radius: usize = ksize >> 1;
+    var i: usize = 0;
+    while (i < height) : (i += 1) {
+        var srcp: [9][*]const f32 = undefined;
+        const dstp: [*]f32 = dst + i * stride;
+        const dist_from_bottom: usize = height - 1 - i;
+
+        const tmp_arr = allocator.alignedAlloc(f32, 64, width) catch unreachable;
+        defer allocator.free(tmp_arr);
+        const tmp: [*]f32 = tmp_arr.ptr;
+
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const row: usize = if (i < radius - k) (@min(radius - k - i, height - 1)) else (i - radius + k);
+            srcp[k] = src + row * stride;
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const row: usize = if (dist_from_bottom < k - radius) (i - @min(k - radius - dist_from_bottom, i)) else (i - radius + k);
+            srcp[k] = src + row * stride;
+        }
+
+        blur_v(srcp, tmp, kernel, width);
+        blur_h(tmp, dstp, kernel, width);
+    }
+}
diff --git a/src/rblur.zig b/src/rblur.zig
deleted file mode 100644
index a8fe0d7..0000000
--- a/src/rblur.zig
+++ /dev/null
@@ -1,231 +0,0 @@
-const std = @import("std");
-const ssimulacra2 = @import("ssimulacra2.zig");
-const allocator = std.heap.c_allocator;
-
-const vec_t = @Vector(16, f32);
-
-inline fn v_pass(src: [*]const f32, dst: [*]f32, stride: usize, d: *ssimulacra2.Ssimulacra2Data, width: usize, height: usize) void {
-    const big_n = d.radius;
-    const mul_in_1: vec_t = @splat(d.mul_in[0]);
-    const mul_in_3: vec_t = @splat(d.mul_in[4]);
-    const mul_in_5: vec_t = @splat(d.mul_in[8]);
-    const mul_prev_1: vec_t = @splat(d.mul_prev[0]);
-    const mul_prev_3: vec_t = @splat(d.mul_prev[4]);
-    const mul_prev_5: vec_t = @splat(d.mul_prev[8]);
-    const mul_prev2_1: vec_t = @splat(d.mul_prev2[0]);
-    const mul_prev2_3: vec_t = @splat(d.mul_prev2[4]);
-    const mul_prev2_5: vec_t = @splat(d.mul_prev2[8]);
-    const v00: vec_t = @splat(@as(f32, 0.0));
-
-    const iheight: i32 = @intCast(height);
-
-    var x: usize = 0;
-    while (x < width) : (x += 16) {
-        const srcp = src + x;
-        var dstp = dst + x;
-        var prev_1: vec_t = v00;
-        var prev_3: vec_t = v00;
-        var prev_5: vec_t = v00;
-        var prev2_1: vec_t = v00;
-        var prev2_3: vec_t = v00;
-        var prev2_5: vec_t = v00;
-
-        var n: i32 = -big_n + 1;
-        while (n < iheight) : (n += 1) {
-            const top: i32 = n - big_n - 1;
-            const bot: i32 = n + big_n - 1;
-            const top_val: vec_t = if (top >= 0) (srcp[(@as(usize, @intCast(top)) * stride)..][0..16].*) else v00;
-            const bot_val: vec_t = if (bot < iheight) (srcp[(@as(usize, @intCast(bot)) * stride)..][0..16].*) else v00;
-            const sum: vec_t = top_val + bot_val;
-
-            var out_1: vec_t = sum * mul_in_1;
-            var out_3: vec_t = sum * mul_in_3;
-            var out_5: vec_t = sum * mul_in_5;
-
-            out_1 = @mulAdd(vec_t, mul_prev2_1, prev2_1, out_1);
-            out_3 = @mulAdd(vec_t, mul_prev2_3, prev2_3, out_3);
-            out_5 = @mulAdd(vec_t, mul_prev2_5, prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = @mulAdd(vec_t, mul_prev_1, prev_1, out_1);
-            out_3 = @mulAdd(vec_t, mul_prev_3, prev_3, out_3);
-            out_5 = @mulAdd(vec_t, mul_prev_5, prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if (n >= 0) {
-                dstp[(@as(usize, @intCast(n)) * stride)..][0..16].* = out_1 + out_3 + out_5;
-            }
-        }
-    }
-}
-
-inline fn h_pass(src: [*]const f32, dst: [*]f32, stride: usize, d: *ssimulacra2.Ssimulacra2Data, width: usize, height: usize) void {
-    const big_n = d.radius;
-    const mul_in_1 = d.mul_in[0];
-    const mul_in_3 = d.mul_in[4];
-    const mul_in_5 = d.mul_in[8];
-    const mul_prev_1 = d.mul_prev[0];
-    const mul_prev_3 = d.mul_prev[4];
-    const mul_prev_5 = d.mul_prev[8];
-    const mul_prev2_1 = d.mul_prev2[0];
-    const mul_prev2_3 = d.mul_prev2[4];
-    const mul_prev2_5 = d.mul_prev2[8];
-
-    const iwidth: i32 = @intCast(width);
-
-    var y: usize = 0;
-    while (y < height) : (y += 1) {
-        const srcp = src + y * stride;
-        var dstp = dst + y * stride;
-        var prev_1: f32 = 0.0;
-        var prev_3: f32 = 0.0;
-        var prev_5: f32 = 0.0;
-        var prev2_1: f32 = 0.0;
-        var prev2_3: f32 = 0.0;
-        var prev2_5: f32 = 0.0;
-
-        var n: i32 = -big_n + 1;
-        while (n < iwidth) : (n += 1) {
-            const left: i32 = n - big_n - 1;
-            const right: i32 = n + big_n - 1;
-            const left_val: f32 = if (left >= 0) (srcp[@intCast(left)]) else 0.0;
-            const right_val: f32 = if (right < iwidth) (srcp[@intCast(right)]) else 0.0;
-            const sum: f32 = left_val + right_val;
-
-            var out_1: f32 = sum * mul_in_1;
-            var out_3: f32 = sum * mul_in_3;
-            var out_5: f32 = sum * mul_in_5;
-
-            out_1 = @mulAdd(f32, mul_prev2_1, prev2_1, out_1);
-            out_3 = @mulAdd(f32, mul_prev2_3, prev2_3, out_3);
-            out_5 = @mulAdd(f32, mul_prev2_5, prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = @mulAdd(f32, mul_prev_1, prev_1, out_1);
-            out_3 = @mulAdd(f32, mul_prev_3, prev_3, out_3);
-            out_5 = @mulAdd(f32, mul_prev_5, prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if (n >= 0) {
-                dstp[@intCast(n)] = out_1 + out_3 + out_5;
-            }
-        }
-    }
-}
-
-pub inline fn process(srcp: [*]const f32, dstp: [*]f32, stride: usize, width: usize, height: usize, d: *ssimulacra2.Ssimulacra2Data) void {
-    if (d.tmp_blur.len == 1) {
-        d.tmp_blur = allocator.alignedAlloc(f32, 64, stride * height) catch unreachable;
-    }
-
-    const tmpp = d.tmp_blur.ptr;
-    h_pass(srcp, tmpp, stride, d, width, height);
-    v_pass(tmpp, dstp, stride, d, width, height);
-}
-
-pub inline fn Inv3x3Matrix(matrix: [*]f64) void {
-    var temp: [9]f64 = undefined;
-    temp[0] = @mulAdd(f64, matrix[4], matrix[8], -(matrix[5] * matrix[7]));
-    temp[1] = @mulAdd(f64, matrix[2], matrix[7], -(matrix[1] * matrix[8]));
-    temp[2] = @mulAdd(f64, matrix[1], matrix[5], -(matrix[2] * matrix[4]));
-    temp[3] = @mulAdd(f64, matrix[5], matrix[6], -(matrix[3] * matrix[8]));
-    temp[4] = @mulAdd(f64, matrix[0], matrix[8], -(matrix[2] * matrix[6]));
-    temp[5] = @mulAdd(f64, matrix[2], matrix[3], -(matrix[0] * matrix[5]));
-    temp[6] = @mulAdd(f64, matrix[3], matrix[7], -(matrix[4] * matrix[6]));
-    temp[7] = @mulAdd(f64, matrix[1], matrix[6], -(matrix[0] * matrix[7]));
-    temp[8] = @mulAdd(f64, matrix[0], matrix[4], -(matrix[1] * matrix[3]));
-    const det: f64 = @mulAdd(f64, matrix[0], temp[0], @mulAdd(f64, matrix[1], temp[3], (matrix[2] * temp[6])));
-
-    const idet: f64 = 1.0 / det;
-    var i: usize = 0;
-    while (i < 9) : (i += 1) {
-        matrix[i] = temp[i] * idet;
-    }
-}
-
-pub inline fn MatMul(a: [*]f64, b: [*]f64, ha: usize, wa: usize, wb: usize, d: [*]f64) void {
-    var temp: [wa]f64 = undefined;
-    var x: usize = 0;
-    while (x < wb) : (x += 1) {
-        var z: usize = 0;
-        while (z < wa) : (z += 1) {
-            temp[z] = b[z * wb + x];
-        }
-
-        var y: usize = 0;
-        while (y < ha) : (y += 1) {
-            var e: f64 = 0.0;
-            var j: usize = 0;
-            while (j < wa) : (j += 1) {
-                e += a[y * wa + j] * temp[j];
-            }
-
-            d[y * wb + x] = e;
-        }
-    }
-}
-
-pub inline fn gauss_init(sigma: f64, d: *ssimulacra2.Ssimulacra2Data) void {
-    const kPi: f64 = 3.141592653589793238;
-    const radius: f64 = @round(3.2795 * sigma + 0.2546);
-    const pi_div_2r: f64 = kPi / (2.0 * radius);
-    const omega = [3]f64{ pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r };
-    const p_1: f64 = 1.0 / @tan(0.5 * omega[0]);
-    const p_3: f64 = -1.0 / @tan(0.5 * omega[1]);
-    const p_5: f64 = 1.0 / @tan(0.5 * omega[2]);
-    const r_1: f64 = p_1 * p_1 / @sin(omega[0]);
-    const r_3: f64 = -p_3 * p_3 / @sin(omega[1]);
-    const r_5: f64 = p_5 * p_5 / @sin(omega[2]);
-    const neg_half_sigma2: f64 = -0.5 * sigma * sigma;
-    const recip_radius: f64 = 1.0 / radius;
-    var rho: [3]f64 = undefined;
-
-    var i: usize = 0;
-    while (i < 3) : (i += 1) {
-        rho[i] = @exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
-    }
-
-    const D_13: f64 = p_1 * r_3 - r_1 * p_3;
-    const D_35: f64 = p_3 * r_5 - r_3 * p_5;
-    const D_51: f64 = p_5 * r_1 - r_5 * p_1;
-    const recip_d13: f64 = 1.0 / D_13;
-    const zeta_15: f64 = D_35 * recip_d13;
-    const zeta_35: f64 = D_51 * recip_d13;
-    var A = [9]f64{ p_1, p_3, p_5, r_1, r_3, r_5, zeta_15, zeta_35, 1.0 };
-    Inv3x3Matrix(&A);
-
-    var gamma = [3]f64{ 1.0, radius * radius - sigma * sigma, zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2] };
-    var beta: [3]f64 = undefined;
-    MatMul(&A, &gamma, 3, 3, 1, &beta);
-    d.radius = @intFromFloat(radius);
-
-    var n2: [3]f64 = undefined;
-    var d1: [3]f64 = undefined;
-    i = 0;
-    while (i < 3) : (i += 1) {
-        n2[i] = -beta[i] * @cos(omega[i] * (radius + 1.0));
-        d1[i] = -2.0 * @cos(omega[i]);
-
-        const d_2: f64 = d1[i] * d1[i];
-        d.mul_prev[4 * i + 0] = @floatCast(-d1[i]);
-        d.mul_prev[4 * i + 1] = @floatCast(d_2 - 1.0);
-        d.mul_prev[4 * i + 2] = @floatCast(-d_2 * d1[i] + 2.0 * d1[i]);
-        d.mul_prev[4 * i + 3] = @floatCast(d_2 * d_2 - 3.0 * d_2 + 1.0);
-        d.mul_prev2[4 * i + 0] = -1.0;
-        d.mul_prev2[4 * i + 1] = @floatCast(d1[i]);
-        d.mul_prev2[4 * i + 2] = @floatCast(-d_2 + 1.0);
-        d.mul_prev2[4 * i + 3] = @floatCast(d_2 * d1[i] - 2.0 * d1[i]);
-        d.mul_in[4 * i + 0] = @floatCast(n2[i]);
-        d.mul_in[4 * i + 1] = @floatCast(-d1[i] * n2[i]);
-        d.mul_in[4 * i + 2] = @floatCast(d_2 * n2[i] - n2[i]);
-        d.mul_in[4 * i + 3] = @floatCast(-d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i]);
-    }
-}
diff --git a/src/ssimulacra2.zig b/src/ssimulacra2.zig
index ae07e45..1229340 100644
--- a/src/ssimulacra2.zig
+++ b/src/ssimulacra2.zig
@@ -2,7 +2,7 @@ const std = @import("std");
 const vs = @import("vapoursynth").vapoursynth4;
 const vsh = @import("vapoursynth").vshelper;
 
-const rblur = @import("rblur.zig");
+const blur = @import("blur.zig");
 const downscale = @import("downscale.zig");
 const multiply = @import("multiply.zig");
 const score = @import("score.zig");
@@ -22,14 +22,6 @@ const allocator = std.heap.c_allocator;
 pub const Ssimulacra2Data = struct {
     node1: ?*vs.Node,
     node2: ?*vs.Node,
-
-    tmp_ss: []f32,
-    tmp_blur: []f32,
-
-    radius: i32,
-    mul_in: [12]f32,
-    mul_prev: [12]f32,
-    mul_prev2: [12]f32,
 };
 
 inline fn copy_data(dst: [3][*]f32, src: [3][*]const f32, stride: usize, width: usize, height: usize) void {
@@ -46,7 +38,7 @@ inline fn copy_data(dst: [3][*]f32, src: [3][*]const f32, stride: usize, width:
     }
 }
 
-inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize, width: usize, height: usize, d: *Ssimulacra2Data) f64 {
+inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize, width: usize, height: usize) f64 {
     const stride: usize = stride8 >> (@sizeOf(f32) >> 1);
 
     const srcp1 = [3][*]const f32{
@@ -62,12 +54,9 @@ inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize,
     };
 
     const wh: usize = stride * height;
-
-    if (d.tmp_ss.len == 1) {
-        d.tmp_ss = allocator.alignedAlloc(f32, 64, wh * 18) catch unreachable;
-    }
-
-    const tempp = d.tmp_ss.ptr;
+    const tmp_arr = allocator.alignedAlloc(f32, 32, width * height * 18) catch unreachable;
+    defer allocator.free(tmp_arr);
+    const tempp = tmp_arr.ptr;
     const srcp1b = [3][*]f32{ tempp, tempp + wh, tempp + (wh * 2) };
     const srcp2b = [3][*]f32{ tempp + (wh * 3), tempp + (wh * 4), tempp + (wh * 5) };
     const tmpp1 = [3][*]f32{ tempp + (wh * 6), tempp + (wh * 7), tempp + (wh * 8) };
@@ -105,16 +94,16 @@ inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize,
         var plane: usize = 0;
         while (plane < 3) : (plane += 1) {
             multiply.process(tmpp1[plane], tmpp1[plane], tmpp3, stride2, width2, height2);
-            rblur.process(tmpp3, tmpps11, stride2, width2, height2, d);
+            blur.process(tmpp3, tmpps11, stride2, width2, height2);
 
             multiply.process(tmpp2[plane], tmpp2[plane], tmpp3, stride2, width2, height2);
-            rblur.process(tmpp3, tmpps22, stride2, width2, height2, d);
+            blur.process(tmpp3, tmpps22, stride2, width2, height2);
 
             multiply.process(tmpp1[plane], tmpp2[plane], tmpp3, stride2, width2, height2);
-            rblur.process(tmpp3, tmpps12, stride2, width2, height2, d);
+            blur.process(tmpp3, tmpps12, stride2, width2, height2);
 
-            rblur.process(tmpp1[plane], tmppmu1, stride2, width2, height2, d);
-            rblur.process(tmpp2[plane], tmpp3, stride2, width2, height2, d);
+            blur.process(tmpp1[plane], tmppmu1, stride2, width2, height2);
+            blur.process(tmpp2[plane], tmpp3, stride2, width2, height2);
 
             score.ssim_map(
                 tmpps11,
@@ -184,7 +173,6 @@ export fn ssimulacra2GetFrame(n: c_int, activation_reason: ar, instance_data: ?*
             stride,
             width,
             height,
-            d,
         );
 
         _ = vsapi.?.mapSetFloat.?(vsapi.?.getFramePropertiesRW.?(dst), "_SSIMULACRA2", val, ma.Replace);
@@ -198,15 +186,6 @@ export fn ssimulacra2Free(instance_data: ?*anyopaque, core: ?*vs.Core, vsapi: ?*
     const d: *Ssimulacra2Data = @ptrCast(@alignCast(instance_data));
     vsapi.?.freeNode.?(d.node1);
     vsapi.?.freeNode.?(d.node2);
-
-    if (d.tmp_ss.len != 1) {
-        allocator.free(d.tmp_ss);
-    }
-
-    if (d.tmp_blur.len != 1) {
-        allocator.free(d.tmp_blur);
-    }
-
     allocator.destroy(d);
 }
 
@@ -233,13 +212,6 @@ export fn ssimulacra2Create(in: ?*const vs.Map, out: ?*vs.Map, user_data: ?*anyo
         return;
     }
 
-    var tmp = [_]f32{0.0};
-    d.tmp_ss = tmp[0..1];
-    d.tmp_blur = tmp[0..1];
-
-    const sigma: f64 = 1.5;
-    rblur.gauss_init(sigma, &d);
-
     const data: *Ssimulacra2Data = allocator.create(Ssimulacra2Data) catch unreachable;
     data.* = d;
 
@@ -258,6 +230,6 @@ export fn ssimulacra2Create(in: ?*const vs.Map, out: ?*vs.Map, user_data: ?*anyo
 }
 
 export fn VapourSynthPluginInit2(plugin: *vs.Plugin, vspapi: *const vs.PLUGINAPI) void {
-    _ = vspapi.configPlugin.?("com.julek.ssimulacra2", "ssimulacra2", "VapourSynth SSIMULACRA2", vs.makeVersion(2, 0), vs.VAPOURSYNTH_API_VERSION, 0, plugin);
+    _ = vspapi.configPlugin.?("com.julek.ssimulacra2", "ssimulacra2", "VapourSynth SSIMULACRA2", vs.makeVersion(3, 0), vs.VAPOURSYNTH_API_VERSION, 0, plugin);
     _ = vspapi.registerFunction.?("SSIMULACRA2", "reference:vnode;distorted:vnode;", "clip:vnode;", ssimulacra2Create, null, plugin);
 }