diff --git a/src/blur.zig b/src/blur.zig
new file mode 100644
index 0000000..796b7f1
--- /dev/null
+++ b/src/blur.zig
@@ -0,0 +1,111 @@
+const std = @import("std");
+const allocator = std.heap.c_allocator;
+
+inline fn blur_h(srcp: anytype, dstp: [*]f32, kernel: [9]f32, width: usize) void {
+    const ksize: usize = 9;
+    const radius: usize = ksize >> 1;
+
+    var j: usize = 0;
+    while (j < @min(width, radius)) : (j += 1) {
+        const dist_from_right: usize = width - 1 - j;
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const idx: usize = if (j < radius - k) (@min(radius - k - j, width - 1)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const idx: usize = if (dist_from_right < k - radius) (j - @min(k - radius - dist_from_right, j)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        dstp[j] = accum;
+    }
+
+    j = radius;
+    while (j < width - @min(width, radius)) : (j += 1) {
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < ksize) : (k += 1) {
+            accum += kernel[k] * srcp[j - radius + k];
+        }
+
+        dstp[j] = accum;
+    }
+
+    j = @max(radius, width - @min(width, radius));
+    while (j < width) : (j += 1) {
+        const dist_from_right: usize = width - 1 - j;
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const idx: usize = if (j < radius - k) (@min(radius - k - j, width - 1)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const idx: usize = if (dist_from_right < k - radius) (j - @min(k - radius - dist_from_right, j)) else (j - radius + k);
+            accum += kernel[k] * srcp[idx];
+        }
+
+        dstp[j] = accum;
+    }
+}
+
+inline fn blur_v(src: anytype, dstp: [*]f32, kernel: [9]f32, width: usize) void {
+    var j: usize = 0;
+    while (j < width) : (j += 1) {
+        var accum: f32 = 0.0;
+        var k: usize = 0;
+        while (k < 9) : (k += 1) {
+            accum += kernel[k] * src[k][j];
+        }
+
+        dstp[j] = accum;
+    }
+}
+
+pub inline fn process(src: [*]const f32, dst: [*]f32, stride: usize, width: usize, height: usize) void {
+    const kernel = [9]f32{
+        0.0076144188642501831054687500,
+        0.0360749699175357818603515625,
+        0.1095860823988914489746093750,
+        0.2134445458650588989257812500,
+        0.2665599882602691650390625000,
+        0.2134445458650588989257812500,
+        0.1095860823988914489746093750,
+        0.0360749699175357818603515625,
+        0.0076144188642501831054687500,
+    };
+
+    const ksize: usize = 9;
+    const radius: usize = ksize >> 1;
+    var i: usize = 0;
+    while (i < height) : (i += 1) {
+        var srcp: [9][*]const f32 = undefined;
+        const dstp: [*]f32 = dst + i * stride;
+        const dist_from_bottom: usize = height - 1 - i;
+
+        const tmp_arr = allocator.alignedAlloc(f32, 64, width) catch unreachable;
+        defer allocator.free(tmp_arr);
+        const tmp: [*]f32 = tmp_arr.ptr;
+
+        var k: usize = 0;
+        while (k < radius) : (k += 1) {
+            const row: usize = if (i < radius - k) (@min(radius - k - i, height - 1)) else (i - radius + k);
+            srcp[k] = src + row * stride;
+        }
+
+        k = radius;
+        while (k < ksize) : (k += 1) {
+            const row: usize = if (dist_from_bottom < k - radius) (i - @min(k - radius - dist_from_bottom, i)) else (i - radius + k);
+            srcp[k] = src + row * stride;
+        }
+
+        blur_v(srcp, tmp, kernel, width);
+        blur_h(tmp, dstp, kernel, width);
+    }
+}
diff --git a/src/rblur.zig b/src/rblur.zig
deleted file mode 100644
index a8fe0d7..0000000
--- a/src/rblur.zig
+++ /dev/null
@@ -1,231 +0,0 @@
-const std = @import("std");
-const ssimulacra2 = @import("ssimulacra2.zig");
-const allocator = std.heap.c_allocator;
-
-const vec_t = @Vector(16, f32);
-
-inline fn v_pass(src: [*]const f32, dst: [*]f32, stride: usize, d: *ssimulacra2.Ssimulacra2Data, width: usize, height: usize) void {
-    const big_n = d.radius;
-    const mul_in_1: vec_t = @splat(d.mul_in[0]);
-    const mul_in_3: vec_t = @splat(d.mul_in[4]);
-    const mul_in_5: vec_t = @splat(d.mul_in[8]);
-    const mul_prev_1: vec_t = @splat(d.mul_prev[0]);
-    const mul_prev_3: vec_t = @splat(d.mul_prev[4]);
-    const mul_prev_5: vec_t = @splat(d.mul_prev[8]);
-    const mul_prev2_1: vec_t = @splat(d.mul_prev2[0]);
-    const mul_prev2_3: vec_t = @splat(d.mul_prev2[4]);
-    const mul_prev2_5: vec_t = @splat(d.mul_prev2[8]);
-    const v00: vec_t = @splat(@as(f32, 0.0));
-
-    const iheight: i32 = @intCast(height);
-
-    var x: usize = 0;
-    while (x < width) : (x += 16) {
-        const srcp = src + x;
-        var dstp = dst + x;
-        var prev_1: vec_t = v00;
-        var prev_3: vec_t = v00;
-        var prev_5: vec_t = v00;
-        var prev2_1: vec_t = v00;
-        var prev2_3: vec_t = v00;
-        var prev2_5: vec_t = v00;
-
-        var n: i32 = -big_n + 1;
-        while (n < iheight) : (n += 1) {
-            const top: i32 = n - big_n - 1;
-            const bot: i32 = n + big_n - 1;
-            const top_val: vec_t = if (top >= 0) (srcp[(@as(usize, @intCast(top)) * stride)..][0..16].*) else v00;
-            const bot_val: vec_t = if (bot < iheight) (srcp[(@as(usize, @intCast(bot)) * stride)..][0..16].*) else v00;
-            const sum: vec_t = top_val + bot_val;
-
-            var out_1: vec_t = sum * mul_in_1;
-            var out_3: vec_t = sum * mul_in_3;
-            var out_5: vec_t = sum * mul_in_5;
-
-            out_1 = @mulAdd(vec_t, mul_prev2_1, prev2_1, out_1);
-            out_3 = @mulAdd(vec_t, mul_prev2_3, prev2_3, out_3);
-            out_5 = @mulAdd(vec_t, mul_prev2_5, prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = @mulAdd(vec_t, mul_prev_1, prev_1, out_1);
-            out_3 = @mulAdd(vec_t, mul_prev_3, prev_3, out_3);
-            out_5 = @mulAdd(vec_t, mul_prev_5, prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if (n >= 0) {
-                dstp[(@as(usize, @intCast(n)) * stride)..][0..16].* = out_1 + out_3 + out_5;
-            }
-        }
-    }
-}
-
-inline fn h_pass(src: [*]const f32, dst: [*]f32, stride: usize, d: *ssimulacra2.Ssimulacra2Data, width: usize, height: usize) void {
-    const big_n = d.radius;
-    const mul_in_1 = d.mul_in[0];
-    const mul_in_3 = d.mul_in[4];
-    const mul_in_5 = d.mul_in[8];
-    const mul_prev_1 = d.mul_prev[0];
-    const mul_prev_3 = d.mul_prev[4];
-    const mul_prev_5 = d.mul_prev[8];
-    const mul_prev2_1 = d.mul_prev2[0];
-    const mul_prev2_3 = d.mul_prev2[4];
-    const mul_prev2_5 = d.mul_prev2[8];
-
-    const iwidth: i32 = @intCast(width);
-
-    var y: usize = 0;
-    while (y < height) : (y += 1) {
-        const srcp = src + y * stride;
-        var dstp = dst + y * stride;
-        var prev_1: f32 = 0.0;
-        var prev_3: f32 = 0.0;
-        var prev_5: f32 = 0.0;
-        var prev2_1: f32 = 0.0;
-        var prev2_3: f32 = 0.0;
-        var prev2_5: f32 = 0.0;
-
-        var n: i32 = -big_n + 1;
-        while (n < iwidth) : (n += 1) {
-            const left: i32 = n - big_n - 1;
-            const right: i32 = n + big_n - 1;
-            const left_val: f32 = if (left >= 0) (srcp[@intCast(left)]) else 0.0;
-            const right_val: f32 = if (right < iwidth) (srcp[@intCast(right)]) else 0.0;
-            const sum: f32 = left_val + right_val;
-
-            var out_1: f32 = sum * mul_in_1;
-            var out_3: f32 = sum * mul_in_3;
-            var out_5: f32 = sum * mul_in_5;
-
-            out_1 = @mulAdd(f32, mul_prev2_1, prev2_1, out_1);
-            out_3 = @mulAdd(f32, mul_prev2_3, prev2_3, out_3);
-            out_5 = @mulAdd(f32, mul_prev2_5, prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = @mulAdd(f32, mul_prev_1, prev_1, out_1);
-            out_3 = @mulAdd(f32, mul_prev_3, prev_3, out_3);
-            out_5 = @mulAdd(f32, mul_prev_5, prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if (n >= 0) {
-                dstp[@intCast(n)] = out_1 + out_3 + out_5;
-            }
-        }
-    }
-}
-
-pub inline fn process(srcp: [*]const f32, dstp: [*]f32, stride: usize, width: usize, height: usize, d: *ssimulacra2.Ssimulacra2Data) void {
-    if (d.tmp_blur.len == 1) {
-        d.tmp_blur = allocator.alignedAlloc(f32, 64, stride * height) catch unreachable;
-    }
-
-    const tmpp = d.tmp_blur.ptr;
-    h_pass(srcp, tmpp, stride, d, width, height);
-    v_pass(tmpp, dstp, stride, d, width, height);
-}
-
-pub inline fn Inv3x3Matrix(matrix: [*]f64) void {
-    var temp: [9]f64 = undefined;
-    temp[0] = @mulAdd(f64, matrix[4], matrix[8], -(matrix[5] * matrix[7]));
-    temp[1] = @mulAdd(f64, matrix[2], matrix[7], -(matrix[1] * matrix[8]));
-    temp[2] = @mulAdd(f64, matrix[1], matrix[5], -(matrix[2] * matrix[4]));
-    temp[3] = @mulAdd(f64, matrix[5], matrix[6], -(matrix[3] * matrix[8]));
-    temp[4] = @mulAdd(f64, matrix[0], matrix[8], -(matrix[2] * matrix[6]));
-    temp[5] = @mulAdd(f64, matrix[2], matrix[3], -(matrix[0] * matrix[5]));
-    temp[6] = @mulAdd(f64, matrix[3], matrix[7], -(matrix[4] * matrix[6]));
-    temp[7] = @mulAdd(f64, matrix[1], matrix[6], -(matrix[0] * matrix[7]));
-    temp[8] = @mulAdd(f64, matrix[0], matrix[4], -(matrix[1] * matrix[3]));
-    const det: f64 = @mulAdd(f64, matrix[0], temp[0], @mulAdd(f64, matrix[1], temp[3], (matrix[2] * temp[6])));
-
-    const idet: f64 = 1.0 / det;
-    var i: usize = 0;
-    while (i < 9) : (i += 1) {
-        matrix[i] = temp[i] * idet;
-    }
-}
-
-pub inline fn MatMul(a: [*]f64, b: [*]f64, ha: usize, wa: usize, wb: usize, d: [*]f64) void {
-    var temp: [wa]f64 = undefined;
-    var x: usize = 0;
-    while (x < wb) : (x += 1) {
-        var z: usize = 0;
-        while (z < wa) : (z += 1) {
-            temp[z] = b[z * wb + x];
-        }
-
-        var y: usize = 0;
-        while (y < ha) : (y += 1) {
-            var e: f64 = 0.0;
-            var j: usize = 0;
-            while (j < wa) : (j += 1) {
-                e += a[y * wa + j] * temp[j];
-            }
-
-            d[y * wb + x] = e;
-        }
-    }
-}
-
-pub inline fn gauss_init(sigma: f64, d: *ssimulacra2.Ssimulacra2Data) void {
-    const kPi: f64 = 3.141592653589793238;
-    const radius: f64 = @round(3.2795 * sigma + 0.2546);
-    const pi_div_2r: f64 = kPi / (2.0 * radius);
-    const omega = [3]f64{ pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r };
-    const p_1: f64 = 1.0 / @tan(0.5 * omega[0]);
-    const p_3: f64 = -1.0 / @tan(0.5 * omega[1]);
-    const p_5: f64 = 1.0 / @tan(0.5 * omega[2]);
-    const r_1: f64 = p_1 * p_1 / @sin(omega[0]);
-    const r_3: f64 = -p_3 * p_3 / @sin(omega[1]);
-    const r_5: f64 = p_5 * p_5 / @sin(omega[2]);
-    const neg_half_sigma2: f64 = -0.5 * sigma * sigma;
-    const recip_radius: f64 = 1.0 / radius;
-    var rho: [3]f64 = undefined;
-
-    var i: usize = 0;
-    while (i < 3) : (i += 1) {
-        rho[i] = @exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
-    }
-
-    const D_13: f64 = p_1 * r_3 - r_1 * p_3;
-    const D_35: f64 = p_3 * r_5 - r_3 * p_5;
-    const D_51: f64 = p_5 * r_1 - r_5 * p_1;
-    const recip_d13: f64 = 1.0 / D_13;
-    const zeta_15: f64 = D_35 * recip_d13;
-    const zeta_35: f64 = D_51 * recip_d13;
-    var A = [9]f64{ p_1, p_3, p_5, r_1, r_3, r_5, zeta_15, zeta_35, 1.0 };
-    Inv3x3Matrix(&A);
-
-    var gamma = [3]f64{ 1.0, radius * radius - sigma * sigma, zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2] };
-    var beta: [3]f64 = undefined;
-    MatMul(&A, &gamma, 3, 3, 1, &beta);
-    d.radius = @intFromFloat(radius);
-
-    var n2: [3]f64 = undefined;
-    var d1: [3]f64 = undefined;
-    i = 0;
-    while (i < 3) : (i += 1) {
-        n2[i] = -beta[i] * @cos(omega[i] * (radius + 1.0));
-        d1[i] = -2.0 * @cos(omega[i]);
-
-        const d_2: f64 = d1[i] * d1[i];
-        d.mul_prev[4 * i + 0] = @floatCast(-d1[i]);
-        d.mul_prev[4 * i + 1] = @floatCast(d_2 - 1.0);
-        d.mul_prev[4 * i + 2] = @floatCast(-d_2 * d1[i] + 2.0 * d1[i]);
-        d.mul_prev[4 * i + 3] = @floatCast(d_2 * d_2 - 3.0 * d_2 + 1.0);
-        d.mul_prev2[4 * i + 0] = -1.0;
-        d.mul_prev2[4 * i + 1] = @floatCast(d1[i]);
-        d.mul_prev2[4 * i + 2] = @floatCast(-d_2 + 1.0);
-        d.mul_prev2[4 * i + 3] = @floatCast(d_2 * d1[i] - 2.0 * d1[i]);
-        d.mul_in[4 * i + 0] = @floatCast(n2[i]);
-        d.mul_in[4 * i + 1] = @floatCast(-d1[i] * n2[i]);
-        d.mul_in[4 * i + 2] = @floatCast(d_2 * n2[i] - n2[i]);
-        d.mul_in[4 * i + 3] = @floatCast(-d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i]);
-    }
-}
diff --git a/src/ssimulacra2.zig b/src/ssimulacra2.zig
index ae07e45..1229340 100644
--- a/src/ssimulacra2.zig
+++ b/src/ssimulacra2.zig
@@ -2,7 +2,7 @@
 const std = @import("std");
 const vs = @import("vapoursynth").vapoursynth4;
 const vsh = @import("vapoursynth").vshelper;
-const rblur = @import("rblur.zig");
+const blur = @import("blur.zig");
 const downscale = @import("downscale.zig");
 const multiply = @import("multiply.zig");
 const score = @import("score.zig");
@@ -22,14 +22,6 @@ const allocator = std.heap.c_allocator;
 pub const Ssimulacra2Data = struct {
     node1: ?*vs.Node,
     node2: ?*vs.Node,
-
-    tmp_ss: []f32,
-    tmp_blur: []f32,
-
-    radius: i32,
-    mul_in: [12]f32,
-    mul_prev: [12]f32,
-    mul_prev2: [12]f32,
 };
 
 inline fn copy_data(dst: [3][*]f32, src: [3][*]const f32, stride: usize, width: usize, height: usize) void {
@@ -46,7 +38,7 @@ inline fn copy_data(dst: [3][*]f32, src: [3][*]const f32, stride: usize, width:
     }
 }
 
-inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize, width: usize, height: usize, d: *Ssimulacra2Data) f64 {
+inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize, width: usize, height: usize) f64 {
     const stride: usize = stride8 >> (@sizeOf(f32) >> 1);
 
     const srcp1 = [3][*]const f32{
@@ -62,12 +54,9 @@ inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize,
     };
 
     const wh: usize = stride * height;
-
-    if (d.tmp_ss.len == 1) {
-        d.tmp_ss = allocator.alignedAlloc(f32, 64, wh * 18) catch unreachable;
-    }
-
-    const tempp = d.tmp_ss.ptr;
+    const tmp_arr = allocator.alignedAlloc(f32, 32, width * height * 18) catch unreachable;
+    defer allocator.free(tmp_arr);
+    const tempp = tmp_arr.ptr;
     const srcp1b = [3][*]f32{ tempp, tempp + wh, tempp + (wh * 2) };
     const srcp2b = [3][*]f32{ tempp + (wh * 3), tempp + (wh * 4), tempp + (wh * 5) };
     const tmpp1 = [3][*]f32{ tempp + (wh * 6), tempp + (wh * 7), tempp + (wh * 8) };
@@ -105,16 +94,16 @@ inline fn process(src8a: [3][*]const u8, src8b: [3][*]const u8, stride8: usize,
     var plane: usize = 0;
     while (plane < 3) : (plane += 1) {
         multiply.process(tmpp1[plane], tmpp1[plane], tmpp3, stride2, width2, height2);
-        rblur.process(tmpp3, tmpps11, stride2, width2, height2, d);
+        blur.process(tmpp3, tmpps11, stride2, width2, height2);
 
         multiply.process(tmpp2[plane], tmpp2[plane], tmpp3, stride2, width2, height2);
-        rblur.process(tmpp3, tmpps22, stride2, width2, height2, d);
+        blur.process(tmpp3, tmpps22, stride2, width2, height2);
 
         multiply.process(tmpp1[plane], tmpp2[plane], tmpp3, stride2, width2, height2);
-        rblur.process(tmpp3, tmpps12, stride2, width2, height2, d);
+        blur.process(tmpp3, tmpps12, stride2, width2, height2);
 
-        rblur.process(tmpp1[plane], tmppmu1, stride2, width2, height2, d);
-        rblur.process(tmpp2[plane], tmpp3, stride2, width2, height2, d);
+        blur.process(tmpp1[plane], tmppmu1, stride2, width2, height2);
+        blur.process(tmpp2[plane], tmpp3, stride2, width2, height2);
 
         score.ssim_map(
             tmpps11,
@@ -184,7 +173,6 @@ export fn ssimulacra2GetFrame(n: c_int, activation_reason: ar, instance_data: ?*
             stride,
             width,
             height,
-            d,
         );
 
         _ = vsapi.?.mapSetFloat.?(vsapi.?.getFramePropertiesRW.?(dst), "_SSIMULACRA2", val, ma.Replace);
@@ -198,15 +186,6 @@ export fn ssimulacra2Free(instance_data: ?*anyopaque, core: ?*vs.Core, vsapi: ?*
     const d: *Ssimulacra2Data = @ptrCast(@alignCast(instance_data));
     vsapi.?.freeNode.?(d.node1);
     vsapi.?.freeNode.?(d.node2);
-
-    if (d.tmp_ss.len != 1) {
-        allocator.free(d.tmp_ss);
-    }
-
-    if (d.tmp_blur.len != 1) {
-        allocator.free(d.tmp_blur);
-    }
-
     allocator.destroy(d);
 }
 
@@ -233,13 +212,6 @@ export fn ssimulacra2Create(in: ?*const vs.Map, out: ?*vs.Map, user_data: ?*anyo
         return;
     }
 
-    var tmp = [_]f32{0.0};
-    d.tmp_ss = tmp[0..1];
-    d.tmp_blur = tmp[0..1];
-
-    const sigma: f64 = 1.5;
-    rblur.gauss_init(sigma, &d);
-
     const data: *Ssimulacra2Data = allocator.create(Ssimulacra2Data) catch unreachable;
     data.* = d;
 
@@ -258,6 +230,6 @@ export fn ssimulacra2Create(in: ?*const vs.Map, out: ?*vs.Map, user_data: ?*anyo
 }
 
 export fn VapourSynthPluginInit2(plugin: *vs.Plugin, vspapi: *const vs.PLUGINAPI) void {
-    _ = vspapi.configPlugin.?("com.julek.ssimulacra2", "ssimulacra2", "VapourSynth SSIMULACRA2", vs.makeVersion(2, 0), vs.VAPOURSYNTH_API_VERSION, 0, plugin);
+    _ = vspapi.configPlugin.?("com.julek.ssimulacra2", "ssimulacra2", "VapourSynth SSIMULACRA2", vs.makeVersion(3, 0), vs.VAPOURSYNTH_API_VERSION, 0, plugin);
     _ = vspapi.registerFunction.?("SSIMULACRA2", "reference:vnode;distorted:vnode;", "clip:vnode;", ssimulacra2Create, null, plugin);
 }
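For reference: the hard-coded 9-tap kernel in blur.zig appears to be a normalized Gaussian with sigma = 1.5 truncated at radius 4, i.e. the same sigma the removed rblur.gauss_init call received, baked into constants. A minimal standalone sketch (not part of the patch; test name and tolerance are illustrative) that regenerates those coefficients:

const std = @import("std");

test "blur.zig kernel matches a normalized sigma = 1.5 Gaussian, radius 4" {
    const sigma: f64 = 1.5;
    const radius: usize = 4;

    // Sample exp(-x^2 / (2 * sigma^2)) at integer offsets -4..4.
    var kernel: [2 * radius + 1]f64 = undefined;
    var sum: f64 = 0.0;
    var k: usize = 0;
    while (k < kernel.len) : (k += 1) {
        const d: f64 = @floatFromInt(if (k >= radius) k - radius else radius - k);
        kernel[k] = @exp(-0.5 * (d / sigma) * (d / sigma));
        sum += kernel[k];
    }

    // Normalize so the taps sum to 1.
    k = 0;
    while (k < kernel.len) : (k += 1) kernel[k] /= sum;

    // Constants copied from src/blur.zig.
    const expected = [9]f64{
        0.0076144188642501831054687500,
        0.0360749699175357818603515625,
        0.1095860823988914489746093750,
        0.2134445458650588989257812500,
        0.2665599882602691650390625000,
        0.2134445458650588989257812500,
        0.1095860823988914489746093750,
        0.0360749699175357818603515625,
        0.0076144188642501831054687500,
    };

    k = 0;
    while (k < kernel.len) : (k += 1) {
        try std.testing.expectApproxEqAbs(expected[k], kernel[k], 1e-6);
    }
}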