From 34d3a8649a05bfde0f9fa9abe8d5ec457f05c26f Mon Sep 17 00:00:00 2001 From: Beinsezii <beinsezii@gmail.com> Date: Sun, 2 Jun 2024 23:31:28 -0700 Subject: [PATCH] Convert lrgb_to_xyz to DType MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Should be a best case scenario. Literally just element-wise FMA. Almost +30%: 107µs to 77µs on arch=native It's *cool* yes but the code quality degrades so much I wonder if it's even worth it. Then when you factor in the complex 3-dimension deinterleave that'll be needed to use it properly... I still have to test it of course, but I just feel it'll eat what little perf I get. I have AVX512 as well, so AVX≤2 will probably end up hurting even more. --- benches/conversions.rs | 5 +++++ src/lib.rs | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/benches/conversions.rs b/benches/conversions.rs index 032e96f..5fe1b89 100644 --- a/benches/conversions.rs +++ b/benches/conversions.rs @@ -1,6 +1,7 @@ #![feature(portable_simd)] use criterion::{black_box, criterion_group, criterion_main, Criterion}; use colcon::{Space, convert_space}; +//use std::simd::prelude::*; fn pixels() -> Box<[f32]> { let size = 512; @@ -25,6 +26,10 @@ pub fn conversions(c: &mut Criterion) { black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap()))); } )); + c.bench_function("lrgb_to_xyz_simd", |b| b.iter(|| { + black_box(pixels.clone().as_simd_mut::<32>().1.chunks_exact_mut(3).for_each(|pixel| colcon::lrgb_to_xyz(pixel.try_into().unwrap()))); + } )); + c.bench_function("xyz_to_cielab", |b| b.iter(|| { black_box(pixels.clone().chunks_exact_mut(3).for_each(|pixel| colcon::xyz_to_cielab(pixel.try_into().unwrap()))); } )); diff --git a/src/lib.rs b/src/lib.rs index 748bc07..b01171e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -288,11 +288,11 @@ fn matmul3t(pixel: [f32; 3], matrix: [[f32; 3]; 3]) -> [f32; 3] { } /// Transposed 3 * 3x3 matrix 
multiply, ie matrix @ pixel -fn matmul3(matrix: [[f32; 3]; 3], pixel: [f32; 3]) -> [f32; 3] { +fn matmul3<T: DType>(matrix: [[f32; 3]; 3], pixel: [T; 3]) -> [T; 3] { [ - pixel[0] * matrix[0][0] + pixel[1] * matrix[0][1] + pixel[2] * matrix[0][2], - pixel[0] * matrix[1][0] + pixel[1] * matrix[1][1] + pixel[2] * matrix[1][2], - pixel[0] * matrix[2][0] + pixel[1] * matrix[2][1] + pixel[2] * matrix[2][2], + pixel[0].fma(DType::f32(matrix[0][0]), pixel[1].fma(DType::f32(matrix[0][1]), pixel[2] * DType::f32(matrix[0][2]))), + pixel[0].fma(DType::f32(matrix[1][0]), pixel[1].fma(DType::f32(matrix[1][1]), pixel[2] * DType::f32(matrix[1][2]))), + pixel[0].fma(DType::f32(matrix[2][0]), pixel[1].fma(DType::f32(matrix[2][1]), pixel[2] * DType::f32(matrix[2][2]))), ] } // ### MATRICES ### }}} @@ -1048,11 +1048,15 @@ pub extern "C" fn srgb_to_lrgb(pixel: &mut [f32; 3]) { /// Convert from Linear Light RGB to CIE XYZ, D65 standard illuminant /// /// <https://en.wikipedia.org/wiki/SRGB#From_sRGB_to_CIE_XYZ> -#[no_mangle] -pub extern "C" fn lrgb_to_xyz(pixel: &mut [f32; 3]) { +pub fn lrgb_to_xyz<T: DType>(pixel: &mut [T; 3]) { *pixel = matmul3(XYZ65_MAT, *pixel) } +#[no_mangle] +extern "C" fn lrgb_to_xyz_f32(pixel: &mut [f32; 3]) { + lrgb_to_xyz(pixel) +} + /// Convert from CIE XYZ to CIE LAB. /// /// <https://en.wikipedia.org/wiki/CIELAB_color_space#From_CIEXYZ_to_CIELAB> @@ -1601,7 +1605,7 @@ mod tests { #[test] fn xyz_forwards() { - func_cmp(LRGB, XYZ, lrgb_to_xyz) + func_cmp(LRGB, XYZ, lrgb_to_xyz_f32) } #[test] fn xyz_backwards() { @@ -1752,7 +1756,7 @@ mod tests { ("hsv_backwards", hsv_to_srgb), ("lrgb_forwards", srgb_to_lrgb), ("lrgb_backwards", lrgb_to_srgb), - ("xyz_forwards", lrgb_to_xyz), + ("xyz_forwards", lrgb_to_xyz_f32), ("xyz_backwards", xyz_to_lrgb), ("lab_forwards", xyz_to_cielab), ("lab_backwards", cielab_to_xyz),