Skip to content

Commit

Permalink
Add #[target_feature(enable = "simd128")]
Browse files Browse the repository at this point in the history
  • Loading branch information
cdmurph32 committed Jan 18, 2023
1 parent e09fa1e commit 1a5c1a2
Show file tree
Hide file tree
Showing 11 changed files with 36 additions and 8 deletions.
6 changes: 3 additions & 3 deletions src/alpha/u8x2/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,14 +201,14 @@ unsafe fn divide_alpha_8_pixels(pixels: v128) -> v128 {
9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1,
);
let alpha_scale = f32x4_splat(255.0 * 256.0);
// sse4 _mm_cvtps_ep32 converts inf to i32::MIN or 2147483648f32 u32.
// wasm32 u32x4_trunc_sat_f32x4 converts inf to u32::MAX.
// sse4 _mm_cvtps_epi32 converts inf to i32::MIN, i.e. 2147483648 when reinterpreted as u32.
// wasm32 u32x4_trunc_sat_f32x4 on AVX systems converts inf to u32::MAX.
// The tests pass even without capping the inf produced by dividing by zero, but the scaled
// values would then not match sse4, and other potential test cases would probably break.
let alpha_scale_max = f32x4_splat(2147483648f32);

let alpha_lo_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_LO));
// trunc_sat will always round down. Adding f32x4_nearest would match _mm_cvtps_ep32 exactly,
// trunc_sat always truncates toward zero (i.e. rounds down for these non-negative quotients).
// Adding f32x4_nearest would match _mm_cvtps_epi32 exactly, but would add extra instructions.
let scaled_alpha_lo_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin(
f32x4_div(alpha_scale, alpha_lo_f32),
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u16x1/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub(crate) fn horiz_convolution(
/// - length of all rows in dst_rows must be equal
/// - coefficients_chunks.len() == dst_rows.0.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_four_rows(
src_rows: [&[U16]; 4],
dst_rows: [&mut &mut [U16]; 4],
Expand Down Expand Up @@ -172,6 +173,7 @@ unsafe fn horiz_convolution_four_rows(
/// - bounds.len() == dst_row.len()
/// - coefficients_chunks.len() == dst_row.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_one_row(
src_row: &[U16],
dst_row: &mut [U16],
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u16x2/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pub(crate) fn horiz_convolution(
/// - coefficients_chunks.len() == dst_rows.0.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_four_rows(
src_rows: [&[U16x2]; 4],
dst_rows: [&mut &mut [U16x2]; 4],
Expand Down Expand Up @@ -159,6 +160,7 @@ unsafe fn horiz_convolution_four_rows(
/// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_one_row(
src_row: &[U16x2],
dst_row: &mut [U16x2],
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u16x3/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
/// - coefficients_chunks.len() == dst_rows.0.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u4x(
src_rows: [&[U16x3]; 4],
dst_rows: [&mut &mut [U16x3]; 4],
Expand Down Expand Up @@ -147,6 +148,7 @@ unsafe fn horiz_convolution_8u4x(
/// - coefficients_chunks.len() == dst_row.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u(
src_row: &[U16x3],
dst_row: &mut [U16x3],
Expand Down
12 changes: 10 additions & 2 deletions src/convolution/u16x4/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pub(crate) fn horiz_convolution(
/// - coefficients_chunks.len() == dst_rows.0.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_four_rows(
src_rows: [&[U16x4]; 4],
dst_rows: [&mut &mut [U16x4]; 4],
Expand Down Expand Up @@ -117,9 +118,15 @@ unsafe fn horiz_convolution_four_rows(
for i in 0..4 {
let source = wasm32_utils::loadl_i64(src_rows[i], x);
let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE);
rg_sum[i] = i64x2_add(rg_sum[i], wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2));
rg_sum[i] = i64x2_add(
rg_sum[i],
wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2),
);
let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE);
ba_sum[i] = i64x2_add(ba_sum[i], wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2));
ba_sum[i] = i64x2_add(
ba_sum[i],
wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2),
);
}
}

Expand All @@ -143,6 +150,7 @@ unsafe fn horiz_convolution_four_rows(
/// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_one_row(
src_row: &[U16x4],
dst_row: &mut [U16x4],
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u8x1/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_four_rows(
src_rows: [&[U8]; 4],
dst_rows: [&mut &mut [U8]; 4],
Expand Down Expand Up @@ -112,6 +113,7 @@ unsafe fn horiz_convolution_four_rows(
/// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_row(
src_row: &[U8],
dst_row: &mut [U8],
Expand Down
3 changes: 3 additions & 0 deletions src/convolution/u8x2/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub(crate) fn horiz_convolution(
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_four_rows(
src_rows: [&[U8x2]; 4],
dst_rows: [&mut &mut [U8x2]; 4],
Expand Down Expand Up @@ -144,6 +145,7 @@ unsafe fn horiz_convolution_four_rows(
}

#[inline]
#[target_feature(enable = "simd128")]
unsafe fn set_dst_pixel(
raw: v128,
d_row: &mut &mut [U8x2],
Expand All @@ -165,6 +167,7 @@ unsafe fn set_dst_pixel(
/// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_one_row(
src_row: &[U8x2],
dst_row: &mut [U8x2],
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u8x3/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ pub(crate) fn horiz_convolution(
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u4x(
src_rows: [&[U8x3]; 4],
dst_rows: [&mut &mut [U8x3]; 4],
Expand Down Expand Up @@ -177,6 +178,7 @@ unsafe fn horiz_convolution_8u4x(
/// - max(bound.start + bound.size for bound in bounds) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u(
src_row: &[U8x3],
dst_row: &mut [U8x3],
Expand Down
2 changes: 2 additions & 0 deletions src/convolution/u8x4/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ pub(crate) fn horiz_convolution(
/// - coefficients_chunks.len() == dst_rows.0.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u4x(
src_rows: [&[U8x4]; 4],
dst_rows: [&mut &mut [U8x4]; 4],
Expand Down Expand Up @@ -180,6 +181,7 @@ unsafe fn horiz_convolution_8u4x(
/// - coefficients_chunks.len() == dst_row.len()
/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len()
/// - precision <= MAX_COEFS_PRECISION
#[target_feature(enable = "simd128")]
unsafe fn horiz_convolution_8u(
src_row: &[U8x4],
dst_row: &mut [U8x4],
Expand Down
10 changes: 7 additions & 3 deletions src/convolution/vertical_u16/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub(crate) fn vert_convolution<T: PixelExt<Component = u16>>(
}
}

#[target_feature(enable = "simd128")]
unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
src_img: &ImageView<T>,
dst_row: &mut [T],
Expand Down Expand Up @@ -85,7 +86,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
let source = wasm32_utils::load_v128(src_rows[r], src_x + x * 8);
for i in 0..4 {
let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
sums[i][x] = i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
sums[i][x] =
i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
}
}
}
Expand All @@ -101,7 +103,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
let source = wasm32_utils::load_v128(components, src_x + x * 8);
for i in 0..4 {
let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
sums[i][x] = i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
sums[i][x] =
i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2));
}
}
}
Expand Down Expand Up @@ -140,7 +143,8 @@ unsafe fn vert_convolution_into_one_row_u16<T: PixelExt<Component = u16>>(
let source = wasm32_utils::load_v128(src_rows[r], src_x);
for i in 0..4 {
let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]);
sums[i] = i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r]));
sums[i] =
i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r]));
}
}
y += 2;
Expand Down
1 change: 1 addition & 0 deletions src/convolution/vertical_u8/wasm32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub(crate) fn vert_convolution<T: PixelExt<Component = u8>>(
}
}

#[target_feature(enable = "simd128")]
pub(crate) unsafe fn vert_convolution_into_one_row_u8<T: PixelExt<Component = u8>>(
src_img: &ImageView<T>,
dst_row: &mut [T],
Expand Down

0 comments on commit 1a5c1a2

Please sign in to comment.