Skip to content

Commit

Permalink
Use scalar impl of mat4 det instead of neon.
Browse files Browse the repository at this point in the history
  • Loading branch information
bitshifter committed Apr 15, 2024
1 parent 9bc020a commit d265a7a
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 131 deletions.
135 changes: 69 additions & 66 deletions codegen/templates/mat.rs.tera
Original file line number Diff line number Diff line change
Expand Up @@ -1310,72 +1310,75 @@ impl {{ self_t }} {
let detcof = addres * f32x4::from_array([1.0, -1.0, 1.0, -1.0]);

dot4(self.x_axis.0, detcof)
{% elif self_t == "Mat4" and is_neon %}
unsafe {
let swizz2110 = |x| {
let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1));
vextq_f32(x, x, 1)
};
let swizz3323 = |x| {
let xy = vgetq_lane_f32(x, 3);
vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1)
};
let swizz2100 = |x| {
let y = vuzp1q_f32(x, x);
vuzp1q_f32(vextq_f32(x, y, 3), y)
};
let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x));
// let swizz6723 = |x, y| {
// vsetq_lane_f64(vgetq_lane_f64(y, 1), 0)
// };
let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1));
let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x));
let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0);
let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0));
let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 2);
let swizz2211 = |x| {
let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3);
vzip2q_f32(x, x)
};
let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1);
let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3));
let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3);

// Based on https://github.com/g-truc/glm `glm_mat4_determinant`
let swp2a = swizz2110(self.z_axis.0);
let swp3a = swizz3323(self.w_axis.0);
let swp2b = swizz3323(self.z_axis.0);
let swp3b = swizz2110(self.w_axis.0);
let swp2c = swizz2100(self.z_axis.0);
let swp3c = swizz0021(self.w_axis.0);

let mula = vmulq_f32(swp2a, swp3a);
let mulb = vmulq_f32(swp2b, swp3b);
let mulc = vmulq_f32(swp2c, swp3c);
let sube = vsubq_f32(mula, mulb);
let subf = vsubq_f32(swizz2323(mulc), mulc);

let subfaca = swizz0012(sube);
let swpfaca = swizz1000(self.y_axis.0);
let mulfaca = vmulq_f32(swpfaca, subfaca);

let subtmpb = swizz1344(sube, subf);
let subfacb = swizz0113(subtmpb);
let swpfacb = swizz2211(self.y_axis.0);
let mulfacb = vmulq_f32(swpfacb, subfacb);

let subres = vsubq_f32(mulfaca, mulfacb);
let subtmpc = swizz2245(sube, subf);
let subfacc = swizz0233(subtmpc);
let swpfacc = swizz3332(self.y_axis.0);
let mulfacc = vmulq_f32(swpfacc, subfacc);

let addres = vaddq_f32(subres, mulfacc);
const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0;
let detcof = vmulq_f32(addres, COF);

dot4(self.x_axis.0, detcof)
}
{#
// The NEON implementation is slower than the scalar one, so it is disabled; the code below is kept for reference only.
// {% elif self_t == "Mat4" and is_neon %}
// unsafe {
// let swizz2110 = |x| {
// let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1));
// vextq_f32(x, x, 1)
// };
// let swizz3323 = |x| {
// let xy = vgetq_lane_f32(x, 3);
// vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1)
// };
// let swizz2100 = |x| {
// let y = vuzp1q_f32(x, x);
// vuzp1q_f32(vextq_f32(x, y, 3), y)
// };
// let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x));
// // let swizz6723 = |x, y| {
// // vsetq_lane_f64(vgetq_lane_f64(y, 1), 0)
// // };
// let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1));
// let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x));
// let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0);
// let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0));
// let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 2);
// let swizz2211 = |x| {
// let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3);
// vzip2q_f32(x, x)
// };
// let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1);
// let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3));
// let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3);

// // Based on https://github.com/g-truc/glm `glm_mat4_determinant`
// let swp2a = swizz2110(self.z_axis.0);
// let swp3a = swizz3323(self.w_axis.0);
// let swp2b = swizz3323(self.z_axis.0);
// let swp3b = swizz2110(self.w_axis.0);
// let swp2c = swizz2100(self.z_axis.0);
// let swp3c = swizz0021(self.w_axis.0);

// let mula = vmulq_f32(swp2a, swp3a);
// let mulb = vmulq_f32(swp2b, swp3b);
// let mulc = vmulq_f32(swp2c, swp3c);
// let sube = vsubq_f32(mula, mulb);
// let subf = vsubq_f32(swizz2323(mulc), mulc);

// let subfaca = swizz0012(sube);
// let swpfaca = swizz1000(self.y_axis.0);
// let mulfaca = vmulq_f32(swpfaca, subfaca);

// let subtmpb = swizz1344(sube, subf);
// let subfacb = swizz0113(subtmpb);
// let swpfacb = swizz2211(self.y_axis.0);
// let mulfacb = vmulq_f32(swpfacb, subfacb);

// let subres = vsubq_f32(mulfaca, mulfacb);
// let subtmpc = swizz2245(sube, subf);
// let subfacc = swizz0233(subtmpc);
// let swpfacc = swizz3332(self.y_axis.0);
// let mulfacc = vmulq_f32(swpfacc, subfacc);

// let addres = vaddq_f32(subres, mulfacc);
// const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0;
// let detcof = vmulq_f32(addres, COF);

// dot4(self.x_axis.0, detcof)
// }
#}
{% elif dim == 2 %}
self.x_axis.x * self.y_axis.y - self.x_axis.y * self.y_axis.x
{% elif dim == 3 %}
Expand Down
81 changes: 16 additions & 65 deletions src/f32/neon/mat4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,71 +581,22 @@ impl Mat4 {
/// Returns the determinant of `self`.
#[must_use]
pub fn determinant(&self) -> f32 {
// NOTE(review): no `SAFETY` justification is visible for this block; presumably it is sound
// because the NEON intrinsics used below are available on every target this file is built
// for -- confirm.
unsafe {
// Lane-shuffle helpers. The digits in each name give the source lane chosen for output
// lanes 0..3 in order; for the two-argument helpers, indices 4 and up refer to lanes of
// the second argument.
let swizz2110 = |x| {
let x = vuzp1q_f32(x, vdupq_laneq_f32(x, 1));
vextq_f32(x, x, 1)
};
let swizz3323 = |x| {
let xy = vgetq_lane_f32(x, 3);
vsetq_lane_f32(xy, vsetq_lane_f32(xy, x, 0), 1)
};
let swizz2100 = |x| {
let y = vuzp1q_f32(x, x);
vuzp1q_f32(vextq_f32(x, y, 3), y)
};
let swizz0021 = |x| vtrn1q_f32(x, vzip1q_f32(x, x));
// let swizz6723 = |x, y| {
// vsetq_lane_f64(vgetq_lane_f64(y, 1), 0)
// };
let swizz2323 = |x| vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(x), 1));
let swizz0012 = |x| vzip1q_f32(x, vuzp1q_f32(x, x));
let swizz1000 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), vdupq_laneq_f32(x, 0), 0);
let swizz1344 = |x, y| vuzp2q_f32(x, vdupq_laneq_f32(y, 0));
let swizz0113 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 2);
let swizz2211 = |x| {
let x = vsetq_lane_f32(vgetq_lane_f32(x, 1), x, 3);
vzip2q_f32(x, x)
};
let swizz2245 = |x, y| vextq_f32(vtrn1q_f32(x, x), y, 1);
let swizz0233 = |x| vuzp1q_f32(x, vdupq_laneq_f32(x, 3));
let swizz3332 = |x| vsetq_lane_f32(vgetq_lane_f32(x, 2), vdupq_laneq_f32(x, 3), 3);

// Based on https://github.com/g-truc/glm `glm_mat4_determinant`
// Shuffled copies of the z and w axes, multiplied pairwise and subtracted below.
let swp2a = swizz2110(self.z_axis.0);
let swp3a = swizz3323(self.w_axis.0);
let swp2b = swizz3323(self.z_axis.0);
let swp3b = swizz2110(self.w_axis.0);
let swp2c = swizz2100(self.z_axis.0);
let swp3c = swizz0021(self.w_axis.0);

let mula = vmulq_f32(swp2a, swp3a);
let mulb = vmulq_f32(swp2b, swp3b);
let mulc = vmulq_f32(swp2c, swp3c);
let sube = vsubq_f32(mula, mulb);
let subf = vsubq_f32(swizz2323(mulc), mulc);

// Three products of shuffled y-axis lanes with shuffled lanes of `sube`/`subf`,
// combined as `mulfaca - mulfacb + mulfacc`.
let subfaca = swizz0012(sube);
let swpfaca = swizz1000(self.y_axis.0);
let mulfaca = vmulq_f32(swpfaca, subfaca);

let subtmpb = swizz1344(sube, subf);
let subfacb = swizz0113(subtmpb);
let swpfacb = swizz2211(self.y_axis.0);
let mulfacb = vmulq_f32(swpfacb, subfacb);

let subres = vsubq_f32(mulfaca, mulfacb);
let subtmpc = swizz2245(sube, subf);
let subfacc = swizz0233(subtmpc);
let swpfacc = swizz3332(self.y_axis.0);
let mulfacc = vmulq_f32(swpfacc, subfacc);

let addres = vaddq_f32(subres, mulfacc);
// Apply alternating signs (+, -, +, -) per lane, then reduce with a dot product against
// the x axis.
const COF: float32x4_t = Vec4::new(1.0, -1.0, 1.0, -1.0).0;
let detcof = vmulq_f32(addres, COF);

dot4(self.x_axis.0, detcof)
}
// Scalar implementation: unpack each axis into its four components. In `mCR`, `C` selects
// the axis (0 = x, 1 = y, 2 = z, 3 = w) and `R` the component within that axis.
let (m00, m01, m02, m03) = self.x_axis.into();
let (m10, m11, m12, m13) = self.y_axis.into();
let (m20, m21, m22, m23) = self.z_axis.into();
let (m30, m31, m32, m33) = self.w_axis.into();

// 2x2 minors taken from the z and w axes: `aIJ23 = m2I * m3J - m2J * m3I`.
// Each one is reused by several of the 3x3 expansions below.
let a2323 = m22 * m33 - m23 * m32;
let a1323 = m21 * m33 - m23 * m31;
let a1223 = m21 * m32 - m22 * m31;
let a0323 = m20 * m33 - m23 * m30;
let a0223 = m20 * m32 - m22 * m30;
let a0123 = m20 * m31 - m21 * m30;

// Cofactor expansion along the x axis with alternating signs; each parenthesised term is
// a 3x3 determinant expanded along the y axis using the minors above.
m00 * (m11 * a2323 - m12 * a1323 + m13 * a1223)
- m01 * (m10 * a2323 - m12 * a0323 + m13 * a0223)
+ m02 * (m10 * a1323 - m11 * a0323 + m13 * a0123)
- m03 * (m10 * a1223 - m11 * a0223 + m12 * a0123)
}

/// Returns the inverse of `self`.
Expand Down

0 comments on commit d265a7a

Please sign in to comment.