Skip to content

Commit

Permalink
Improved NEON Vec3A::cross implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
bitshifter committed Apr 1, 2024
1 parent 25ddaa0 commit e4e7b80
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 18 deletions.
29 changes: 20 additions & 9 deletions codegen/templates/vec.rs.tera
Original file line number Diff line number Diff line change
Expand Up @@ -629,15 +629,26 @@ impl {{ self_t }} {
Self(simd_swizzle!(sub, [2, 0, 1, 1]))
{% elif is_neon %}
unsafe {
let lhs = self.into();
let rhs = rhs.into();
let lhszxy = vextq_f32(vuzp1q_f32(lhs, lhs), lhs, 3);
let rhszxy = vextq_f32(vuzp1q_f32(rhs, rhs), rhs, 3);
let lhszxy_rhs = vmulq_f32(lhszxy, rhs);
let rhszxy_lhs = vmulq_f32(lhs, rhszxy);
let sub = vsubq_f32(lhszxy_rhs, rhszxy_lhs);
let result = vzip1q_f32(sub, sub);
let result = vsetq_lane_f32(vgetq_lane_f32(sub, 2), result, 0);
// Implementation taken from Realtime Math
let lhs = self.0;
let rhs = rhs.0;
// cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx)
let lhs_yzwx = vextq_f32(lhs, lhs, 1);
let rhs_wxyz = vextq_f32(rhs, rhs, 3);

let lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2);
let rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0);

// part_a = (a.yzx * b.zxy)
let part_a = vmulq_f32(lhs_yzx, rhs_zxy);

let lhs_wxyz = vextq_f32(lhs, lhs, 3);
let rhs_yzwx = vextq_f32(rhs, rhs, 1);
let lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0);
let rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2);

// result = part_a - (a.zxy * b.yzx)
let result = vmlsq_f32(part_a, lhs_zxy, rhs_yzx);
Self(result)
}
{% endif %}
Expand Down
29 changes: 20 additions & 9 deletions src/f32/neon/vec3a.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,26 @@ impl Vec3A {
/// Computes the cross product of `self` and `rhs` using NEON intrinsics.
///
/// NOTE(review): this listing comes from a rendered commit diff without `+`/`-`
/// markers. The first nine statements inside the `unsafe` block are the
/// implementation the commit removes; everything from the "Realtime Math"
/// comment onwards is the implementation it adds. Only the added part is
/// expected to remain in the committed file.
#[must_use]
pub fn cross(self, rhs: Self) -> Self {
// SAFETY(review): presumably sound because this module is only built for
// targets where NEON is available - confirm against the crate's cfg gating.
unsafe {
// ---- Removed by this commit: previous implementation -------------------
let lhs = self.into();
let rhs = rhs.into();
// vuzp1q(v, v) gathers the even lanes -> (x, z, x, z); extracting from lane 3
// of that vector followed by `v` yields (z, x, y, z), i.e. `v.zxy`.
let lhszxy = vextq_f32(vuzp1q_f32(lhs, lhs), lhs, 3);
let rhszxy = vextq_f32(vuzp1q_f32(rhs, rhs), rhs, 3);
// sub = a.zxy * b - a * b.zxy, which is the cross product with its
// components rotated into (y, z, x) order.
let lhszxy_rhs = vmulq_f32(lhszxy, rhs);
let rhszxy_lhs = vmulq_f32(lhs, rhszxy);
let sub = vsubq_f32(lhszxy_rhs, rhszxy_lhs);
// Rotate back to (x, y, z): zip1 gives (s0, s0, s1, s1), then lane 0 is
// overwritten with s2.
let result = vzip1q_f32(sub, sub);
let result = vsetq_lane_f32(vgetq_lane_f32(sub, 2), result, 0);
// ---- Added by this commit: new implementation --------------------------
// Implementation taken from Realtime Math
let lhs = self.0;
let rhs = rhs.0;
// cross(a, b) = (a.yzx * b.zxy) - (a.zxy * b.yzx)
// vextq(v, v, 1) rotates the lanes left by one -> (y, z, w, x);
// vextq(v, v, 3) rotates them left by three -> (w, x, y, z).
let lhs_yzwx = vextq_f32(lhs, lhs, 1);
let rhs_wxyz = vextq_f32(rhs, rhs, 3);

// Patch the single lane that still holds `w` so the first three lanes form
// the wanted permutation: (y, z, x, x) and (z, x, y, z).
let lhs_yzx = vsetq_lane_f32(vgetq_lane_f32(lhs, 0), lhs_yzwx, 2);
let rhs_zxy = vsetq_lane_f32(vgetq_lane_f32(rhs, 2), rhs_wxyz, 0);

// part_a = (a.yzx * b.zxy)
let part_a = vmulq_f32(lhs_yzx, rhs_zxy);

// Same rotate-and-patch trick with the roles of the two operands swapped.
let lhs_wxyz = vextq_f32(lhs, lhs, 3);
let rhs_yzwx = vextq_f32(rhs, rhs, 1);
let lhs_zxy = vsetq_lane_f32(vgetq_lane_f32(lhs, 2), lhs_wxyz, 0);
let rhs_yzx = vsetq_lane_f32(vgetq_lane_f32(rhs, 0), rhs_yzwx, 2);

// result = part_a - (a.zxy * b.yzx)
// vmlsq computes `part_a - lhs_zxy * rhs_yzx` in a single multiply-subtract.
// Lane 3 ends up as a by-product (a.x * b.z - a.z * b.x); presumably it is
// ignored as padding by `Vec3A` - confirm.
let result = vmlsq_f32(part_a, lhs_zxy, rhs_yzx);
Self(result)
}
}
Expand Down

0 comments on commit e4e7b80

Please sign in to comment.