Reduce dot threshold at which lerp is used instead of slerp.
Add lerp_impl, which is reused by both slerp and lerp and avoids
calculating the dot product again.
bitshifter committed Aug 18, 2024
1 parent 913395d commit 2c7ae1d
Showing 8 changed files with 160 additions and 112 deletions.
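In short: slerp falls back to normalized linear interpolation (nlerp) when the quaternions are nearly aligned, and this commit tightens that cutoff from a dot product of 0.9995 (angles up to roughly acos(0.9995) ≈ 1.8°) to 1.0 - EPSILON (roughly 0.03°), so the fallback now covers only the region where sin(theta) in the slerp weights would be too small to divide by. The following is a minimal scalar sketch of the post-commit structure using a bare [f32; 4] quaternion; it is an illustration of the technique, not glam's actual code, and the helper names are made up.

fn dot(a: [f32; 4], b: [f32; 4]) -> f32 {
    a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]
}

fn normalize(q: [f32; 4]) -> [f32; 4] {
    let len = dot(q, q).sqrt();
    [q[0] / len, q[1] / len, q[2] / len, q[3] / len]
}

// Shared helper: interpolate linearly, then renormalize ("nlerp").
// Both lerp and slerp call this, so the dot product is computed only once.
fn lerp_impl(start: [f32; 4], end: [f32; 4], s: f32) -> [f32; 4] {
    let mut out = [0.0; 4];
    for i in 0..4 {
        out[i] = start[i] + (end[i] - start[i]) * s;
    }
    normalize(out)
}

fn slerp(start: [f32; 4], mut end: [f32; 4], s: f32) -> [f32; 4] {
    let mut d = dot(start, end);
    // Take the short arc: flip `end` when the dot product is negative.
    if d < 0.0 {
        d = -d;
        for c in end.iter_mut() {
            *c = -*c;
        }
    }
    // Old threshold: 0.9995 (nlerp below ~1.8 degrees).
    // New threshold: 1.0 - EPSILON (nlerp only below ~0.03 degrees).
    const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
    if d > DOT_THRESHOLD {
        lerp_impl(start, end, s)
    } else {
        let theta = d.acos();
        let scale1 = ((1.0 - s) * theta).sin() / theta.sin();
        let scale2 = (s * theta).sin() / theta.sin();
        let mut out = [0.0; 4];
        for i in 0..4 {
            out[i] = start[i] * scale1 + end[i] * scale2;
        }
        out
    }
}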
102 changes: 59 additions & 43 deletions codegen/templates/quat.rs.tera
@@ -750,6 +750,51 @@ impl {{ self_t }} {
{{ vec4_t }}::from(self).abs_diff_eq({{ vec4_t }}::from(rhs), max_abs_diff)
}

+ #[inline(always)]
+ #[must_use]
+ fn lerp_impl(self, end: Self, s: {{ scalar_t }}) -> Self {
+ {% if is_scalar %}
+ let interpolated = self + ((end - self) * s);
+ interpolated.normalize()
+ {% elif is_sse2 %}
+ let start = self.0;
+ let end = end.0;
+ unsafe {
+ let interpolated = _mm_add_ps(
+ _mm_mul_ps(_mm_sub_ps(end, start), _mm_set_ps1(s)),
+ start,
+ );
+ {{ self_t }}(interpolated).normalize()
+ }
+ {% elif is_wasm32 %}
+ let start = self.0;
+ let end = end.0;
+ let interpolated = f32x4_add(
+ f32x4_mul(f32x4_sub(end, start), f32x4_splat(s)),
+ start,
+ );
+ {{ self_t }}(interpolated).normalize()
+ {% elif is_coresimd %}
+ let start = self.0;
+ let end = end.0;
+ let interpolated = start + ((end - start) * f32x4::splat(s));
+ {{ self_t }}(interpolated).normalize()
+ {% elif is_neon %}
+ const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
+ let start = self.0;
+ let end = end.0;
+ unsafe {
+ let interpolated = vaddq_f32(
+ vmulq_f32(vsubq_f32(end, start), vld1q_dup_f32(&s)),
+ start,
+ );
+ {{ self_t }}(interpolated).normalize()
+ }
+ {% else %}
+ unimplemented!()
+ {% endif %}
+ }

/// Performs a linear interpolation between `self` and `rhs` based on
/// the value `s`.
///
@@ -767,69 +812,41 @@ impl {{ self_t }} {
glam_assert!(end.is_normalized());

{% if is_scalar %}
- let start = self;
- let dot = start.dot(end);
+ let dot = self.dot(end);
let bias = if dot >= 0.0 { 1.0 } else { -1.0 };
- let interpolated = start.add(end.mul(bias).sub(start).mul(s));
- interpolated.normalize()
+ self.lerp_impl(end * bias, s)
{% elif is_sse2 %}
const NEG_ZERO: __m128 = m128_from_f32x4([-0.0; 4]);
- let start = self.0;
- let end = end.0;
unsafe {
- let dot = dot4_into_m128(start, end);
+ let dot = dot4_into_m128(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = _mm_and_ps(dot, NEG_ZERO);
- let interpolated = _mm_add_ps(
- _mm_mul_ps(_mm_sub_ps(_mm_xor_ps(end, bias), start), _mm_set_ps1(s)),
- start,
- );
- {{ self_t }}(interpolated).normalize()
+ self.lerp_impl(Self(_mm_xor_ps(end.0, bias)), s)
}
{% elif is_wasm32 %}
const NEG_ZERO: v128 = v128_from_f32x4([-0.0; 4]);
- let start = self.0;
- let end = end.0;
- let dot = dot4_into_v128(start, end);
+ let dot = dot4_into_v128(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = v128_and(dot, NEG_ZERO);
- let interpolated = f32x4_add(
- f32x4_mul(f32x4_sub(v128_xor(end, bias), start), f32x4_splat(s)),
- start,
- );
- {{ self_t }}(interpolated).normalize()
+ self.lerp_impl(Self(v128_xor(end.0, bias)), s)
{% elif is_coresimd %}
const NEG_ZERO: f32x4 = f32x4::from_array([-0.0; 4]);
- let start = self.0;
- let end = end.0;
- let dot = dot4_into_f32x4(start, end);
+ let dot = dot4_into_f32x4(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = f32x4_bitand(dot, NEG_ZERO);
- let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
- {{ self_t }}(interpolated).normalize()
+ self.lerp_impl(Self(f32x4_bitxor(end.0, bias)), s)
{% elif is_neon %}
const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
- let start = self.0;
- let end = end.0;
unsafe {
- let dot = dot4_into_f32x4(start, end);
+ let dot = dot4_into_f32x4(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
- let interpolated = vaddq_f32(
- vmulq_f32(
- vsubq_f32(
- vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
- start,
- ),
- vld1q_dup_f32(&s),
- ),
- start,
- );
- {{ self_t }}(interpolated).normalize()
+ self.lerp_impl(
+ Self(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end.0), bias))), s)
}
{% else %}
unimplemented!()
@@ -852,8 +869,6 @@ impl {{ self_t }} {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- const DOT_THRESHOLD: {{ scalar_t }} = 0.9995;
-
// Note that a rotation can be represented by two quaternions: `q` and
// `-q`. The slerp path between `q` and `end` will be different from the
// path between `-q` and `end`. One path will take the long way around and
@@ -866,9 +881,10 @@
dot = -dot;
}

+ const DOT_THRESHOLD: {{ scalar_t }} = 1.0 - {{ scalar_t }}::EPSILON;
if dot > DOT_THRESHOLD {
- // assumes lerp returns a normalized quaternion
- self.lerp(end, s)
+ // if above threshold perform linear interpolation to avoid divide by zero
+ self.lerp_impl(end, s)
} else {
let theta = math::acos_approx(dot);
{% if is_scalar %}
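Worth noting for all the SIMD branches above: the short-arc flip is done branch-free. ANDing the dot product with -0.0 isolates its sign bit, and XORing that bit into `end` negates every component exactly when the dot product is negative, which is what the scalar `bias` of ±1.0 achieves with a branch. Here is a single-lane sketch of the bit trick; it is illustrative only, not glam code:

// Negates `end_component` iff `d` is negative, with no branch.
fn flip_by_dot_sign(d: f32, end_component: f32) -> f32 {
    // AND with -0.0 keeps only the sign bit of `d`:
    // 0x8000_0000 when d is negative, 0 otherwise.
    let bias = d.to_bits() & (-0.0_f32).to_bits();
    // XOR toggles the component's sign bit iff `bias` is set.
    f32::from_bits(end_component.to_bits() ^ bias)
}

fn main() {
    assert_eq!(flip_by_dot_sign(0.5, 2.0), 2.0); // d >= 0: unchanged
    assert_eq!(flip_by_dot_sign(-0.5, 2.0), -2.0); // d < 0: negated
    assert_eq!(flip_by_dot_sign(-0.5, -2.0), 2.0);
}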
23 changes: 14 additions & 9 deletions src/f32/coresimd/quat.rs
@@ -606,6 +615,15 @@ impl Quat {
Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
}

+ #[inline(always)]
+ #[must_use]
+ fn lerp_impl(self, end: Self, s: f32) -> Self {
+ let start = self.0;
+ let end = end.0;
+ let interpolated = start + ((end - start) * f32x4::splat(s));
+ Quat(interpolated).normalize()
+ }

/// Performs a linear interpolation between `self` and `rhs` based on
/// the value `s`.
///
@@ -623,14 +632,11 @@ impl Quat {
glam_assert!(end.is_normalized());

const NEG_ZERO: f32x4 = f32x4::from_array([-0.0; 4]);
- let start = self.0;
- let end = end.0;
- let dot = dot4_into_f32x4(start, end);
+ let dot = dot4_into_f32x4(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = f32x4_bitand(dot, NEG_ZERO);
- let interpolated = start + ((f32x4_bitxor(end, bias) - start) * f32x4::splat(s));
- Quat(interpolated).normalize()
+ self.lerp_impl(Self(f32x4_bitxor(end.0, bias)), s)
}

/// Performs a spherical linear interpolation between `self` and `end`
@@ -649,8 +655,6 @@ impl Quat {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- const DOT_THRESHOLD: f32 = 0.9995;
-
// Note that a rotation can be represented by two quaternions: `q` and
// `-q`. The slerp path between `q` and `end` will be different from the
// path between `-q` and `end`. One path will take the long way around and
@@ -663,9 +667,10 @@ impl Quat {
dot = -dot;
}

+ const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
if dot > DOT_THRESHOLD {
- // assumes lerp returns a normalized quaternion
- self.lerp(end, s)
+ // if above threshold perform linear interpolation to avoid divide by zero
+ self.lerp_impl(end, s)
} else {
let theta = math::acos_approx(dot);

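A note on why every lerp_impl body ends in normalize(): the straight line between two points on the unit 4-sphere passes through the sphere's interior, so the raw interpolant is shorter than unit length. A tiny self-contained check, using plain Rust arithmetic rather than glam's internals:

fn main() {
    // Unnormalized midpoint of the unit 4-vectors (1, 0, 0, 0) and (0, 1, 0, 0).
    let mid = [0.5_f32, 0.5, 0.0, 0.0];
    let len = mid.iter().map(|c| c * c).sum::<f32>().sqrt();
    // Prints ~0.7071, not 1.0, which is why lerp_impl renormalizes.
    println!("midpoint length = {len}");
}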
42 changes: 24 additions & 18 deletions src/f32/neon/quat.rs
@@ -611,6 +611,19 @@ impl Quat {
Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
}

+ #[inline(always)]
+ #[must_use]
+ fn lerp_impl(self, end: Self, s: f32) -> Self {
+ const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
+ let start = self.0;
+ let end = end.0;
+ unsafe {
+ let interpolated =
+ vaddq_f32(vmulq_f32(vsubq_f32(end, start), vld1q_dup_f32(&s)), start);
+ Quat(interpolated).normalize()
+ }
+ }

/// Performs a linear interpolation between `self` and `rhs` based on
/// the value `s`.
///
@@ -628,24 +641,18 @@ impl Quat {
glam_assert!(end.is_normalized());

const NEG_ZERO: float32x4_t = f32x4_from_array([-0.0; 4]);
- let start = self.0;
- let end = end.0;
unsafe {
- let dot = dot4_into_f32x4(start, end);
+ let dot = dot4_into_f32x4(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = vandq_u32(vreinterpretq_u32_f32(dot), vreinterpretq_u32_f32(NEG_ZERO));
- let interpolated = vaddq_f32(
- vmulq_f32(
- vsubq_f32(
- vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(end), bias)),
- start,
- ),
- vld1q_dup_f32(&s),
- ),
- start,
- );
- Quat(interpolated).normalize()
+ self.lerp_impl(
+ Self(vreinterpretq_f32_u32(veorq_u32(
+ vreinterpretq_u32_f32(end.0),
+ bias,
+ ))),
+ s,
+ )
}
}

@@ -665,8 +672,6 @@ impl Quat {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- const DOT_THRESHOLD: f32 = 0.9995;
-
// Note that a rotation can be represented by two quaternions: `q` and
// `-q`. The slerp path between `q` and `end` will be different from the
// path between `-q` and `end`. One path will take the long way around and
@@ -679,9 +684,10 @@ impl Quat {
dot = -dot;
}

+ const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
if dot > DOT_THRESHOLD {
- // assumes lerp returns a normalized quaternion
- self.lerp(end, s)
+ // if above threshold perform linear interpolation to avoid divide by zero
+ self.lerp_impl(end, s)
} else {
let theta = math::acos_approx(dot);

20 changes: 12 additions & 8 deletions src/f32/scalar/quat.rs
@@ -618,6 +618,13 @@ impl Quat {
Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
}

+ #[inline(always)]
+ #[must_use]
+ fn lerp_impl(self, end: Self, s: f32) -> Self {
+ let interpolated = self + ((end - self) * s);
+ interpolated.normalize()
+ }

/// Performs a linear interpolation between `self` and `rhs` based on
/// the value `s`.
///
@@ -634,11 +641,9 @@ impl Quat {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- let start = self;
- let dot = start.dot(end);
+ let dot = self.dot(end);
let bias = if dot >= 0.0 { 1.0 } else { -1.0 };
- let interpolated = start.add(end.mul(bias).sub(start).mul(s));
- interpolated.normalize()
+ self.lerp_impl(end * bias, s)
}

/// Performs a spherical linear interpolation between `self` and `end`
@@ -657,8 +662,6 @@ impl Quat {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- const DOT_THRESHOLD: f32 = 0.9995;
-
// Note that a rotation can be represented by two quaternions: `q` and
// `-q`. The slerp path between `q` and `end` will be different from the
// path between `-q` and `end`. One path will take the long way around and
@@ -671,9 +674,10 @@ impl Quat {
dot = -dot;
}

+ const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
if dot > DOT_THRESHOLD {
- // assumes lerp returns a normalized quaternion
- self.lerp(end, s)
+ // if above threshold perform linear interpolation to avoid divide by zero
+ self.lerp_impl(end, s)
} else {
let theta = math::acos_approx(dot);

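The practical size of the change is easiest to see in degrees. A quick computation (plain Rust, not part of glam):

fn main() {
    let old_threshold = 0.9995_f32;
    let new_threshold = 1.0_f32 - f32::EPSILON;
    // Angle between the quaternions at which each threshold trips.
    println!("old: nlerp below {:.3} deg", old_threshold.acos().to_degrees());
    println!("new: nlerp below {:.3} deg", new_threshold.acos().to_degrees());
}

This prints roughly 1.812 and 0.028, so slerp now follows the spherical path over almost the whole input range, and the linear fallback only covers angles where sin(theta) is too close to zero to divide by safely.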
29 changes: 17 additions & 12 deletions src/f32/sse2/quat.rs
@@ -614,6 +614,18 @@ impl Quat {
Vec4::from(self).abs_diff_eq(Vec4::from(rhs), max_abs_diff)
}

+ #[inline(always)]
+ #[must_use]
+ fn lerp_impl(self, end: Self, s: f32) -> Self {
+ let start = self.0;
+ let end = end.0;
+ unsafe {
+ let interpolated =
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(end, start), _mm_set_ps1(s)), start);
+ Quat(interpolated).normalize()
+ }
+ }

/// Performs a linear interpolation between `self` and `rhs` based on
/// the value `s`.
///
@@ -631,18 +643,12 @@ impl Quat {
glam_assert!(end.is_normalized());

const NEG_ZERO: __m128 = m128_from_f32x4([-0.0; 4]);
- let start = self.0;
- let end = end.0;
unsafe {
- let dot = dot4_into_m128(start, end);
+ let dot = dot4_into_m128(self.0, end.0);
// Calculate the bias, if the dot product is positive or zero, there is no bias
// but if it is negative, we want to flip the 'end' rotation XYZW components
let bias = _mm_and_ps(dot, NEG_ZERO);
- let interpolated = _mm_add_ps(
- _mm_mul_ps(_mm_sub_ps(_mm_xor_ps(end, bias), start), _mm_set_ps1(s)),
- start,
- );
- Quat(interpolated).normalize()
+ self.lerp_impl(Self(_mm_xor_ps(end.0, bias)), s)
}
}

@@ -662,8 +668,6 @@ impl Quat {
glam_assert!(self.is_normalized());
glam_assert!(end.is_normalized());

- const DOT_THRESHOLD: f32 = 0.9995;
-
// Note that a rotation can be represented by two quaternions: `q` and
// `-q`. The slerp path between `q` and `end` will be different from the
// path between `-q` and `end`. One path will take the long way around and
@@ -676,9 +680,10 @@ impl Quat {
dot = -dot;
}

+ const DOT_THRESHOLD: f32 = 1.0 - f32::EPSILON;
if dot > DOT_THRESHOLD {
- // assumes lerp returns a normalized quaternion
- self.lerp(end, s)
+ // if above threshold perform linear interpolation to avoid divide by zero
+ self.lerp_impl(end, s)
} else {
let theta = math::acos_approx(dot);

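From the caller's side the API is unchanged; slerp simply stays on the spherical path for a wider range of inputs. A small usage example against glam's public Quat API (the rotation values are arbitrary):

use glam::Quat;

fn main() {
    let from = Quat::from_rotation_y(0.0);
    let to = Quat::from_rotation_y(std::f32::consts::FRAC_PI_2);
    // Ordinary case: well below the threshold, so the spherical path is used.
    let half = from.slerp(to, 0.5);
    assert!(half.is_normalized());
    // Nearly identical rotations: dot > 1.0 - EPSILON, so slerp takes the
    // lerp_impl fallback instead of dividing by a vanishing sin(theta).
    let nearly = Quat::from_rotation_y(1.0e-5);
    assert!(from.slerp(nearly, 0.5).is_normalized());
}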