From d265a7a381f2a015645ec74999693ce12d97ed11 Mon Sep 17 00:00:00 2001 From: Cameron Hart Date: Mon, 15 Apr 2024 23:32:12 +1200 Subject: [PATCH] Use scalar impl of mat4 det instead of neon. --- codegen/templates/mat.rs.tera | 135 +++++++++++++++++----------------- src/f32/neon/mat4.rs | 81 ++++---------------- 2 files changed, 85 insertions(+), 131 deletions(-) diff --git a/codegen/templates/mat.rs.tera b/codegen/templates/mat.rs.tera index 3b8a8e79..c4dfbd11 100644 --- a/codegen/templates/mat.rs.tera +++ b/codegen/templates/mat.rs.tera @@ -1310,72 +1310,75 @@ impl {{ self_t }} { let detcof = addres * f32x4::from_array([1.0, -1.0, 1.0, -1.0]); dot4(self.x_axis.0, detcof) - {% elif self_t == "Mat4" and is_neon %} - unsafe { - let swizz2110 = |x| { - let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1)); - vextq_f32(x, x, 1) - }; - let swizz3323 = |x| { - let xy = vgetq_lane_f32(x, 3); - vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1) - }; - let swizz2100 = |x| { - let y = vuzp1q_f32(x, x); - vuzp1q_f32(vextq_f32(x, y, 3), y) - }; - let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x)); - // let swizz6723 = |x, y| { - // vsetq_lane_f64(vgetq_lane_f64(y, 1), 0) - // }; - let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1)); - let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x)); - let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0); - let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0)); - let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 2); - let swizz2211 = |x| { - let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3); - vzip2q_f32(x, x) - }; - let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1); - let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3)); - let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3); - - // Based on https://github.com/g-truc/glm `glm_mat4_determinant` - let swp2a = swizz2110(self.z_axis.0); - let swp3a = swizz3323(self.w_axis.0); - let 
swp2b = swizz3323(self.z_axis.0); - let swp3b = swizz2110(self.w_axis.0); - let swp2c = swizz2100(self.z_axis.0); - let swp3c = swizz0021(self.w_axis.0); - - let mula = vmulq_f32(swp2a, swp3a); - let mulb = vmulq_f32(swp2b, swp3b); - let mulc = vmulq_f32(swp2c, swp3c); - let sube = vsubq_f32(mula, mulb); - let subf = vsubq_f32(swizz2323(mulc), mulc); - - let subfaca = swizz0012(sube); - let swpfaca = swizz1000(self.y_axis.0); - let mulfaca = vmulq_f32(swpfaca, subfaca); - - let subtmpb = swizz1344(sube, subf); - let subfacb = swizz0113(subtmpb); - let swpfacb = swizz2211(self.y_axis.0); - let mulfacb = vmulq_f32(swpfacb, subfacb); - - let subres = vsubq_f32(mulfaca, mulfacb); - let subtmpc = swizz2245(sube, subf); - let subfacc = swizz0233(subtmpc); - let swpfacc = swizz3332(self.y_axis.0); - let mulfacc = vmulq_f32(swpfacc, subfacc); - - let addres = vaddq_f32(subres, mulfacc); - const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0; - let detcof = vmulq_f32(addres, COF); - - dot4(self.x_axis.0, detcof) - } + {# + // neon implementation is slower than scalar + // {% elif self_t == "Mat4" and is_neon %} + // unsafe { + // let swizz2110 = |x| { + // let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1)); + // vextq_f32(x, x, 1) + // }; + // let swizz3323 = |x| { + // let xy = vgetq_lane_f32(x, 3); + // vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1) + // }; + // let swizz2100 = |x| { + // let y = vuzp1q_f32(x, x); + // vuzp1q_f32(vextq_f32(x, y, 3), y) + // }; + // let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x)); + // // let swizz6723 = |x, y| { + // // vsetq_lane_f64(vgetq_lane_f64(y, 1), 0) + // // }; + // let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1)); + // let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x)); + // let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0); + // let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0)); + // let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), 
x, 2); + // let swizz2211 = |x| { + // let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3); + // vzip2q_f32(x, x) + // }; + // let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1); + // let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3)); + // let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3); + + // // Based on https://github.com/g-truc/glm `glm_mat4_determinant` + // let swp2a = swizz2110(self.z_axis.0); + // let swp3a = swizz3323(self.w_axis.0); + // let swp2b = swizz3323(self.z_axis.0); + // let swp3b = swizz2110(self.w_axis.0); + // let swp2c = swizz2100(self.z_axis.0); + // let swp3c = swizz0021(self.w_axis.0); + + // let mula = vmulq_f32(swp2a, swp3a); + // let mulb = vmulq_f32(swp2b, swp3b); + // let mulc = vmulq_f32(swp2c, swp3c); + // let sube = vsubq_f32(mula, mulb); + // let subf = vsubq_f32(swizz2323(mulc), mulc); + + // let subfaca = swizz0012(sube); + // let swpfaca = swizz1000(self.y_axis.0); + // let mulfaca = vmulq_f32(swpfaca, subfaca); + + // let subtmpb = swizz1344(sube, subf); + // let subfacb = swizz0113(subtmpb); + // let swpfacb = swizz2211(self.y_axis.0); + // let mulfacb = vmulq_f32(swpfacb, subfacb); + + // let subres = vsubq_f32(mulfaca, mulfacb); + // let subtmpc = swizz2245(sube, subf); + // let subfacc = swizz0233(subtmpc); + // let swpfacc = swizz3332(self.y_axis.0); + // let mulfacc = vmulq_f32(swpfacc, subfacc); + + // let addres = vaddq_f32(subres, mulfacc); + // const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0; + // let detcof = vmulq_f32(addres, COF); + + // dot4(self.x_axis.0, detcof) + // } + #} {% elif dim == 2 %} self.x_axis.x * self.y_axis.y - self.x_axis.y * self.y_axis.x {% elif dim == 3 %} diff --git a/src/f32/neon/mat4.rs b/src/f32/neon/mat4.rs index e21337e0..9828f852 100644 --- a/src/f32/neon/mat4.rs +++ b/src/f32/neon/mat4.rs @@ -581,71 +581,22 @@ impl Mat4 { /// Returns the determinant of `self`. 
#[must_use] pub fn determinant(&self) -> f32 { - unsafe { - let swizz2110 = |x| { - let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1)); - vextq_f32(x, x, 1) - }; - let swizz3323 = |x| { - let xy = vgetq_lane_f32(x, 3); - vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1) - }; - let swizz2100 = |x| { - let y = vuzp1q_f32(x, x); - vuzp1q_f32(vextq_f32(x, y, 3), y) - }; - let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x)); - // let swizz6723 = |x, y| { - // vsetq_lane_f64(vgetq_lane_f64(y, 1), 0) - // }; - let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1)); - let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x)); - let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0); - let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0)); - let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 2); - let swizz2211 = |x| { - let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3); - vzip2q_f32(x, x) - }; - let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1); - let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3)); - let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3); - - // Based on https://github.com/g-truc/glm `glm_mat4_determinant` - let swp2a = swizz2110(self.z_axis.0); - let swp3a = swizz3323(self.w_axis.0); - let swp2b = swizz3323(self.z_axis.0); - let swp3b = swizz2110(self.w_axis.0); - let swp2c = swizz2100(self.z_axis.0); - let swp3c = swizz0021(self.w_axis.0); - - let mula = vmulq_f32(swp2a, swp3a); - let mulb = vmulq_f32(swp2b, swp3b); - let mulc = vmulq_f32(swp2c, swp3c); - let sube = vsubq_f32(mula, mulb); - let subf = vsubq_f32(swizz2323(mulc), mulc); - - let subfaca = swizz0012(sube); - let swpfaca = swizz1000(self.y_axis.0); - let mulfaca = vmulq_f32(swpfaca, subfaca); - - let subtmpb = swizz1344(sube, subf); - let subfacb = swizz0113(subtmpb); - let swpfacb = swizz2211(self.y_axis.0); - let mulfacb = vmulq_f32(swpfacb, subfacb); - - let subres = vsubq_f32(mulfaca, 
mulfacb); - let subtmpc = swizz2245(sube, subf); - let subfacc = swizz0233(subtmpc); - let swpfacc = swizz3332(self.y_axis.0); - let mulfacc = vmulq_f32(swpfacc, subfacc); - - let addres = vaddq_f32(subres, mulfacc); - const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0; - let detcof = vmulq_f32(addres, COF); - - dot4(self.x_axis.0, detcof) - } + /* Scalar replacement for the removed intrinsic version: each axis is unpacked into four scalar locals named mAC, where A is the axis (0 = x_axis .. 3 = w_axis) and C is the position in the converted tuple. */ let (m00, m01, m02, m03) = self.x_axis.into(); + let (m10, m11, m12, m13) = self.y_axis.into(); + let (m20, m21, m22, m23) = self.z_axis.into(); + let (m30, m31, m32, m33) = self.w_axis.into(); + /* 2x2 sub-determinants built only from z_axis (m2*) and w_axis (m3*) components; the first two digits of each name are the component indices used. Each one is consumed by two of the four terms below, so they are computed once here. */ + let a2323 = m22 * m33 - m23 * m32; + let a1323 = m21 * m33 - m23 * m31; + let a1223 = m21 * m32 - m22 * m31; + let a0323 = m20 * m33 - m23 * m30; + let a0223 = m20 * m32 - m22 * m30; + let a0123 = m20 * m31 - m21 * m30; + /* Cofactor expansion over the x_axis components m00..m03 with alternating signs (+, -, +, -); every parenthesised factor combines three y_axis components with the 2x2 terms above. The value of this expression is the function result. */ + m00 * (m11 * a2323 - m12 * a1323 + m13 * a1223) + - m01 * (m10 * a2323 - m12 * a0323 + m13 * a0223) + + m02 * (m10 * a1323 - m11 * a0323 + m13 * a0123) + - m03 * (m10 * a1223 - m11 * a0223 + m12 * a0123) } /// Returns the inverse of `self`.