Add #[target_feature(enable = "simd128")]

Cykooz · Jan 18, 2023 · 1a5c1a2 · 1a5c1a2
1 parent e09fa1e
commit 1a5c1a2
Show file tree

Hide file tree

Showing 11 changed files with 36 additions and 8 deletions.
diff --git a/src/alpha/u8x2/wasm32.rs b/src/alpha/u8x2/wasm32.rs
@@ -201,14 +201,14 @@ unsafe fn divide_alpha_8_pixels(pixels: v128) -> v128 {
         9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1,
     );
     let alpha_scale = f32x4_splat(255.0 * 256.0);
-    // sse4 _mm_cvtps_ep32 converts inf to i32::MIN or 2147483648f32 u32.
-    // wasm32 u32x4_trunc_sat_f32x4 converts inf to u32::MAX.
+    // sse4 _mm_cvtps_epi32 converts inf to i32::MIN (2147483648 when reinterpreted as u32).
+    // wasm32 u32x4_trunc_sat_f32x4 on AVX systems converts inf to u32::MAX.
     // Tests pass without capping inf from dividing by zero, but scaled values will not match sse4,
     // and other potential test cases will (probably?) break.
     let alpha_scale_max = f32x4_splat(2147483648f32);
 
     let alpha_lo_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_LO));
-    // trunc_sat will always round down. Adding f32x4_nearest would match _mm_cvtps_ep32 exactly,
+    // trunc_sat truncates toward zero. Adding f32x4_nearest would match _mm_cvtps_epi32 exactly,
     // but would add extra instructions.
     let scaled_alpha_lo_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin(
         f32x4_div(alpha_scale, alpha_lo_f32),

diff --git a/src/convolution/u16x1/wasm32.rs b/src/convolution/u16x1/wasm32.rs
@@ -43,6 +43,7 @@ pub(crate) fn horiz_convolution(
 /// - length of all rows in dst_rows must be equal
 /// - coefficients_chunks.len() == dst_rows.0.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_four_rows(
     src_rows: [&[U16]; 4],
     dst_rows: [&mut &mut [U16]; 4],
@@ -172,6 +173,7 @@ unsafe fn horiz_convolution_four_rows(
 /// - bounds.len() == dst_row.len()
 /// - coefficients_chunks.len() == dst_row.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_one_row(
     src_row: &[U16],
     dst_row: &mut [U16],

diff --git a/src/convolution/u16x2/wasm32.rs b/src/convolution/u16x2/wasm32.rs
@@ -44,6 +44,7 @@ pub(crate) fn horiz_convolution(
 /// - coefficients_chunks.len() == dst_rows.0.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_four_rows(
     src_rows: [&[U16x2]; 4],
     dst_rows: [&mut &mut [U16x2]; 4],
@@ -159,6 +160,7 @@ unsafe fn horiz_convolution_four_rows(
 /// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_one_row(
     src_row: &[U16x2],
     dst_row: &mut [U16x2],

diff --git a/src/convolution/u16x3/wasm32.rs b/src/convolution/u16x3/wasm32.rs
@@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
 /// - coefficients_chunks.len() == dst_rows.0.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u4x(
     src_rows: [&[U16x3]; 4],
     dst_rows: [&mut &mut [U16x3]; 4],
@@ -147,6 +148,7 @@ unsafe fn horiz_convolution_8u4x(
 /// - coefficients_chunks.len() == dst_row.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u(
     src_row: &[U16x3],
     dst_row: &mut [U16x3],

diff --git a/src/convolution/u16x4/wasm32.rs b/src/convolution/u16x4/wasm32.rs
@@ -44,6 +44,7 @@ pub(crate) fn horiz_convolution(
 /// - coefficients_chunks.len() == dst_rows.0.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_four_rows(
     src_rows: [&[U16x4]; 4],
     dst_rows: [&mut &mut [U16x4]; 4],
@@ -117,9 +118,15 @@ unsafe fn horiz_convolution_four_rows(
             for i in 0..4 {
                 let source = wasm32_utils::loadl_i64(src_rows[i], x);
                 let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE);
-                rg_sum[i] = i64x2_add(rg_sum[i], wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2));
+                rg_sum[i] = i64x2_add(
+                    rg_sum[i],
+                    wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2),
+                );
                 let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE);
-                ba_sum[i] = i64x2_add(ba_sum[i], wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2));
+                ba_sum[i] = i64x2_add(
+                    ba_sum[i],
+                    wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2),
+                );
             }
         }
 
@@ -143,6 +150,7 @@ unsafe fn horiz_convolution_four_rows(
 /// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_one_row(
     src_row: &[U16x4],
     dst_row: &mut [U16x4],

diff --git a/src/convolution/u8x1/wasm32.rs b/src/convolution/u8x1/wasm32.rs
@@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_four_rows(
     src_rows: [&[U8]; 4],
     dst_rows: [&mut &mut [U8]; 4],
@@ -112,6 +113,7 @@ unsafe fn horiz_convolution_four_rows(
 /// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_row(
     src_row: &[U8],
     dst_row: &mut [U8],

diff --git a/src/convolution/u8x2/wasm32.rs b/src/convolution/u8x2/wasm32.rs
@@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_four_rows(
     src_rows: [&[U8x2]; 4],
     dst_rows: [&mut &mut [U8x2]; 4],
@@ -144,6 +145,7 @@ unsafe fn horiz_convolution_four_rows(
 }
 
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn set_dst_pixel(
     raw: v128,
     d_row: &mut &mut [U8x2],
@@ -165,6 +167,7 @@ unsafe fn set_dst_pixel(
 /// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_one_row(
     src_row: &[U8x2],
     dst_row: &mut [U8x2],

diff --git a/src/convolution/u8x3/wasm32.rs b/src/convolution/u8x3/wasm32.rs
@@ -47,6 +47,7 @@ pub(crate) fn horiz_convolution(
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u4x(
     src_rows: [&[U8x3]; 4],
     dst_rows: [&mut &mut [U8x3]; 4],
@@ -177,6 +178,7 @@ unsafe fn horiz_convolution_8u4x(
 /// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
 #[inline]
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u(
     src_row: &[U8x3],
     dst_row: &mut [U8x3],

diff --git a/src/convolution/u8x4/wasm32.rs b/src/convolution/u8x4/wasm32.rs
@@ -49,6 +49,7 @@ pub(crate) fn horiz_convolution(
 /// - coefficients_chunks.len() == dst_rows.0.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u4x(
     src_rows: [&[U8x4]; 4],
     dst_rows: [&mut &mut [U8x4]; 4],
@@ -180,6 +181,7 @@ unsafe fn horiz_convolution_8u4x(
 /// - coefficients_chunks.len() == dst_row.len()
 /// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
 /// - precision <= MAX_COEFS_PRECISION
+#[target_feature(enable = "simd128")]
 unsafe fn horiz_convolution_8u(
     src_row: &[U8x4],
     dst_row: &mut [U8x4],

diff --git a/src/convolution/vertical_u16/wasm32.rs b/src/convolution/vertical_u16/wasm32.rs
@@ -25,6 +25,7 @@ pub(crate) fn vert_convolution<T: PixelExt<Component = u16>>(
     }
 }
 
+#[target_feature(enable = "simd128")]
 unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
     src_img: &ImageView<T>,
     dst_row: &mut [T],
@@ -85,7 +86,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
                     let source = wasm32_utils::load_v128(src_rows[r], src_x + x * 8);
                     for i in 0..4 {
                         let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
-                        sums[i][x] = i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
+                        sums[i][x] =
+                            i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
                     }
                 }
             }
@@ -101,7 +103,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
                 let source = wasm32_utils::load_v128(components, src_x + x * 8);
                 for i in 0..4 {
                     let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
-                    sums[i][x] = i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
+                    sums[i][x] =
+                        i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
                 }
             }
         }
@@ -140,7 +143,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
                 let source = wasm32_utils::load_v128(src_rows[r], src_x);
                 for i in 0..4 {
                     let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
-                    sums[i] = i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r]));
+                    sums[i] =
+                        i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r]));
                 }
             }
             y += 2;

diff --git a/src/convolution/vertical_u8/wasm32.rs b/src/convolution/vertical_u8/wasm32.rs
@@ -25,6 +25,7 @@ pub(crate) fn vert_convolution<T: PixelExt<Component = u8>>(
     }
 }
 
+#[target_feature(enable = "simd128")]
 pub(crate) unsafe fn vert_convolution_into_one_row_u8<T: PixelExt<Component = u8>>(
     src_img: &ImageView<T>,
     dst_row: &mut [T],