From 071425a0ae8acf0e8555133666d963e5ff2e6eed Mon Sep 17 00:00:00 2001
From: maihd <maihd.dev@gmail.com>
Date: Mon, 1 Apr 2024 17:03:35 +0700
Subject: [PATCH] group common functions in one file

---
 include/vectormath.h                          |   4 +-
 include/vectormath/scalarmath.h               |   2 +-
 include/vectormath/vectormath_clang.h         | 640 +----------------
 include/vectormath/vectormath_common.h        | 489 +++++++++++++
 include/vectormath/vectormath_scalar.h        | 391 +----------
 include/vectormath/vectormath_simd.h          | 647 +-----------------
 include/vectormath/vectormath_simd_utils.h    | 286 ++++++++
 include/vectormath/vectormath_types.h         |   2 +-
 premake5.lua                                  |   4 +-
 scripts/vectormath.premake5.lua               |   4 +
 .../app/build.gradle                          |   2 +-
 11 files changed, 790 insertions(+), 1681 deletions(-)
 create mode 100644 include/vectormath/vectormath_common.h
 create mode 100644 include/vectormath/vectormath_simd_utils.h

diff --git a/include/vectormath.h b/include/vectormath.h
index e80dc9d..29f03a3 100644
--- a/include/vectormath.h
+++ b/include/vectormath.h
@@ -5,9 +5,7 @@
 #if VECTORMATH_ENABLE_CLANG_EXT
 #   include "vectormath/vectormath_clang.h"
 #elif VECTORMATH_SIMD_ENABLE
-#   if VECTORMATH_NEON_SUPPORT
-#       include "vectormath/sse_to_neon.h"
-#   endif
+// #if VECTORMATH_SIMD_ENABLE
 #   include "vectormath/vectormath_simd.h"
 #else
 #   include "vectormath/vectormath_scalar.h"
diff --git a/include/vectormath/scalarmath.h b/include/vectormath/scalarmath.h
index 539fbb3..8a45de5 100644
--- a/include/vectormath/scalarmath.h
+++ b/include/vectormath/scalarmath.h
@@ -418,7 +418,7 @@ __forceinline float float_fast_rsqrt(float x)
     cvt.f = x;
     cvt.i = 0x5f3759df - (cvt.i >> 1); 
     cvt.f = cvt.f * (1.5f - xhalf * cvt.f * cvt.f); // first approximation
-    // cvt.f = cvt.f * (1.5f - xhalf * cvt.f * cvt.f); // second approximation
+    cvt.f = cvt.f * (1.5f - xhalf * cvt.f * cvt.f); // second approximation
     return cvt.f;
 }
 
diff --git a/include/vectormath/vectormath_clang.h b/include/vectormath/vectormath_clang.h
index 910288c..5221a4d 100644
--- a/include/vectormath/vectormath_clang.h
+++ b/include/vectormath/vectormath_clang.h
@@ -2,6 +2,8 @@
 
 #include "scalarmath.h"
 #include "vectormath_types.h"
+#include "vectormath_common.h"
+#include "vectormath_simd_utils.h"
 
 // Make sure clang extensions is enable
 #if !VECTORMATH_ENABLE_CLANG_EXT
@@ -16,264 +18,6 @@
 // Helper for extensions
 #define VECTORMATH_FUNCTIONS_DEFINED
 
-// -------------------------------------------------------------
-// Helper constants
-// -------------------------------------------------------------
-
-// Small epsilon value
-constexpr float SIMD_SLERP_TOL  = 0.999f;
-
-// Common constants used to evaluate simdSinf/cosf4/tanf4
-constexpr float SIMD_SINCOS_CC0 = -0.0013602249f;
-constexpr float SIMD_SINCOS_CC1 =  0.0416566950f;
-constexpr float SIMD_SINCOS_CC2 = -0.4999990225f;
-constexpr float SIMD_SINCOS_SC0 = -0.0001950727f;
-constexpr float SIMD_SINCOS_SC1 =  0.0083320758f;
-constexpr float SIMD_SINCOS_SC2 = -0.1666665247f;
-constexpr float SIMD_SINCOS_KC1 =  1.57079625129f;
-constexpr float SIMD_SINCOS_KC2 =  7.54978995489e-8f;
-
-// Shorthand functions to get the unit vectors as __m128
-__forceinline __m128 m128_unit_1000() { return _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f); }
-__forceinline __m128 m128_unit_0100() { return _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f); }
-__forceinline __m128 m128_unit_0010() { return _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); }
-__forceinline __m128 m128_unit_0001() { return _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); }
-
-// ========================================================
-// Internal helper types and functions
-// ========================================================
-
-// These have to be macros because _MM_SHUFFLE() requires compile-time constants.
-#define m128_ror(vec, i)       (((i) % 4) ? (_mm_shuffle_ps(vec, vec, _MM_SHUFFLE((uint32_t)(i + 3) % 4, (uint32_t)(i + 2) % 4, (uint32_t)(i + 1) % 4, (uint32_t)(i + 0) % 4))) : (vec))
-#define m128_splat(x, e)       _mm_shuffle_ps(x, x, _MM_SHUFFLE(e, e, e, e))
-#define m128_sld(vec, vec2, x) m128_ror(vec, ((x) / 4))
-
-__forceinline __m128 m128_mul_add(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_add_ps(c, _mm_mul_ps(a, b));
-}
-
-__forceinline __m128 m128_mul_sub(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_sub_ps(c, _mm_mul_ps(a, b));
-}
-
-__forceinline __m128 m128_merge_hi(__m128 a, __m128 b)
-{
-    return _mm_unpacklo_ps(a, b);
-}
-
-__forceinline __m128 m128_merge_lo(__m128 a, __m128 b)
-{
-    return _mm_unpackhi_ps(a, b);
-}
-
-__forceinline __m128 m128_select(__m128 a, __m128 b, __m128 mask)
-{
-    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
-}
-
-__forceinline __m128 m128_negatef(__m128 x)
-{
-    return _mm_sub_ps(_mm_setzero_ps(), x);
-}
-
-__forceinline __m128 m128_fabsf(__m128 x)
-{
-    return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
-    //return _mm_andnot_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-
-__forceinline __m128 m128_acosf(__m128 x)
-{
-    const __m128 xabs   = m128_fabsf(x);
-    const __m128 select = _mm_cmplt_ps(x, _mm_setzero_ps());
-    const __m128 t1     = _mm_sqrt_ps(_mm_sub_ps(_mm_set1_ps(1.0f), xabs));
-
-    /* Instruction counts can be reduced if the polynomial was
-     * computed entirely from nested (dependent) fma's. However,
-     * to reduce the number of pipeline stalls, the polygon is evaluated
-     * in two halves (hi and lo).
-     */
-    const __m128 xabs2 = _mm_mul_ps(xabs, xabs);
-    const __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
-
-    const __m128 hi = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0012624911f),
-        xabs, _mm_set1_ps( 0.0066700901f)),
-        xabs, _mm_set1_ps(-0.0170881256f)),
-        xabs, _mm_set1_ps( 0.0308918810f));
-
-    const __m128 lo = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0501743046f),
-        xabs, _mm_set1_ps( 0.0889789874f)),
-        xabs, _mm_set1_ps(-0.2145988016f)),
-        xabs, _mm_set1_ps( 1.5707963050f));
-
-    const __m128 result = m128_mul_add(hi, xabs4, lo);
-
-    // Adjust the result if x is negative.
-    return m128_select(
-        _mm_mul_ps(t1, result),                                     // Positive
-        m128_mul_sub(t1, result, _mm_set1_ps(3.1415926535898f)),    // Negative
-        select
-    );
-}
-
-__forceinline __m128 m128_cosf(__m128 x)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 cos = m128_select(cx, sx, cos_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);    
-}
-
-__forceinline __m128 m128_sinf(__m128 x)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 sin = m128_select(cx, sx, sin_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
-}
-
-__forceinline void m128_sinf_cosf(__m128 x, __m128* out_sin, __m128* out_cos)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 sin = m128_select(cx, sx, sin_mask);
-    __m128 cos = m128_select(cx, sx, cos_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    *out_sin = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
-    *out_cos = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);
-}
-
 // -------------------------------------------------------------
 // Constructors
 // -------------------------------------------------------------
@@ -284,26 +28,6 @@ __forceinline __m128 m128_from_vec3(vec3 v)
     return _mm_set_ps(0.0f, v.z, v.y, v.x);
 }
 
-/// Create a new vector 2D
-__forceinline vec2 vec2_new(float x, float y)
-{
-    vec2 result = { x, y };
-    return result;
-}
-
-/// Create a new vector 2D, with components have same value
-__forceinline vec2 vec2_new1(float s)
-{
-    vec2 result = { s, s };
-    return result;
-}
-
-/// Create a new vector 2D from a vector 3D
-__forceinline vec2 vec2_from_vec3(vec3 v)
-{
-    return v.xy;
-}
-
 /// Create a new vector 3D
 __forceinline vec3 vec3_new(float x, float y, float z)
 {
@@ -432,71 +156,6 @@ __forceinline mat4 mat4_load(const float ptr[])
 // Operators-like functions
 // -------------------------------------------------------------
 
-__forceinline vec2 vec2_neg(vec2 v)
-{
-    return vec2_new(-v.x, -v.y);
-}
-
-__forceinline vec2 vec2_add(vec2 a, vec2 b)
-{
-    return vec2_new(a.x + b.x, a.y + b.y);
-}
-
-__forceinline vec2 vec2_sub(vec2 a, vec2 b)
-{
-    return vec2_new(a.x - b.x, a.y - b.y);
-}
-
-__forceinline vec2 vec2_mul(vec2 a, vec2 b)
-{
-    return vec2_new(a.x * b.x, a.y * b.y);
-}
-
-__forceinline vec2 vec2_div(vec2 a, vec2 b)
-{
-    return vec2_new(a.x / b.x, a.y / b.y);
-}
-
-__forceinline vec2 vec2_add1(vec2 a, float b)
-{
-    return vec2_add(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_sub1(vec2 a, float b)
-{
-    return vec2_sub(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul1(vec2 a, float b)
-{
-    return vec2_mul(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_div1(vec2 a, float b)
-{
-    return vec2_div(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul_add(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_add(c, vec2_mul(a, b));
-}
-
-__forceinline vec2 vec2_mul_sub(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_sub(c, vec2_mul(a, b));
-}
-
-__forceinline bool vec2_equal(vec2 a, vec2 b)
-{
-    return a.x == b.x && a.y == b.y;
-}
-
-__forceinline bool vec2_not_equal(vec2 a, vec2 b)
-{
-    return a.x != b.x || a.y != b.y;
-}
-
 __forceinline vec3 vec3_neg(vec3 v)
 {
     return -v;
@@ -704,301 +363,6 @@ __forceinline bool mat4_not_equal(mat4 a, mat4 b)
 // Functions
 // -------------------------------------------------------------
 
-/// Computes sign of 'x'
-__forceinline ivec2 vec2_sign(vec2 v)
-{
-    ivec2 result;
-    result.x = float_sign(v.x);
-    result.y = float_sign(v.y);
-    return result;
-}
-
-/// Computes absolute value
-__forceinline vec2 vec2_abs(vec2 v)
-{
-    return vec2_new(fabsf(v.x), fabsf(v.y));
-}
-
-/// Computes cosine
-__forceinline vec2 vec2_cos(vec2 v)
-{
-    return vec2_new(cosf(v.x), cosf(v.y));
-}
-
-/// Computes sine
-__forceinline vec2 vec2_sin(vec2 v)
-{
-    return vec2_new(sinf(v.x), sinf(v.y));
-}
-
-/// Computes tangent
-__forceinline vec2 vec2_tan(vec2 v)
-{
-    return vec2_new(tanf(v.x), tanf(v.y));
-}
-
-/// Computes hyperbolic cosine
-__forceinline vec2 vec2_cosh(vec2 v)
-{
-    return vec2_new(coshf(v.x), coshf(v.y));
-}
-
-/// Computes hyperbolic sine
-__forceinline vec2 vec2_sinh(vec2 v)
-{
-    return vec2_new(sinhf(v.x), sinhf(v.y));
-}
-
-/// Computes hyperbolic tangent
-__forceinline vec2 vec2_tanh(vec2 v)
-{
-    return vec2_new(tanhf(v.x), tanhf(v.y));
-}
-
-/// Computes inverse cosine
-__forceinline vec2 vec2_acos(vec2 v)
-{
-    return vec2_new(acosf(v.x), acosf(v.y));
-}
-
-/// Computes inverse sine
-__forceinline vec2 vec2_asin(vec2 v)
-{
-    return vec2_new(asinf(v.x), asinf(v.y));
-}
-
-/// Computes inverse tangent
-__forceinline vec2 vec2_atan(vec2 v)
-{
-    return vec2_new(atanf(v.x), atanf(v.y));
-}
-
-/// Computes inverse tangent with 2 args
-__forceinline vec2 vec2_atan2(vec2 a, vec2 b)
-{
-    return vec2_new(atan2f(a.x, b.x), atan2f(a.y, b.y));
-}
-
-/// Computes Euler number raised to the power 'x'
-__forceinline vec2 vec2_exp(vec2 v)
-{
-    return vec2_new(expf(v.x), expf(v.y));
-}
-
-/// Computes 2 raised to the power 'x'
-__forceinline vec2 vec2_exp2(vec2 v)
-{
-    return vec2_new(exp2f(v.x), exp2f(v.y));
-}
-
-/// Computes the base Euler number logarithm
-__forceinline vec2 vec2_log(vec2 v)
-{
-    return vec2_new(logf(v.x), logf(v.y));
-}
-
-/// Computes the base 2 logarithm
-__forceinline vec2 vec2_log2(vec2 v)
-{
-    return vec2_new(log2f(v.x), log2f(v.y));
-}
-
-/// Computes the base 10 logarithm
-__forceinline vec2 vec2_log10(vec2 v)
-{
-    return vec2_new(log10f(v.x), log10f(v.y));
-}
-
-/// Computes the value of base raised to the power exponent
-__forceinline vec2 vec2_pow(vec2 a, vec2 b)
-{
-    return vec2_new(powf(a.x, b.x), powf(a.y, b.y));
-}
-
-/// Get the fractal part of floating point
-__forceinline vec2 vec2_frac(vec2 v)
-{
-    return vec2_new(float_frac(v.x), float_frac(v.y));
-}
-
-/// Computes the floating-point remainder of the division operation x/y
-__forceinline vec2 vec2_fmod(vec2 a, vec2 b)
-{
-    return vec2_new(fmodf(a.x, b.x), fmodf(a.y, b.y));
-}
-
-/// Computes the smallest integer value not less than 'x'
-__forceinline vec2 vec2_ceil(vec2 v)
-{
-    return vec2_new(ceilf(v.x), ceilf(v.y));
-}
-
-/// Computes the largest integer value not greater than 'x'
-__forceinline vec2 vec2_floor(vec2 v)
-{
-    return vec2_new(floorf(v.x), floorf(v.y));
-}
-
-/// Computes the nearest integer value
-__forceinline vec2 vec2_round(vec2 v)
-{
-    return vec2_new(roundf(v.x), roundf(v.y));
-}
-
-/// Computes the nearest integer not greater in magnitude than 'x'
-__forceinline vec2 vec2_trunc(vec2 v)
-{
-    return vec2_new(truncf(v.x), truncf(v.y));
-}
-
-/// Get the smaller value
-__forceinline vec2 vec2_min(vec2 a, vec2 b)
-{
-    return vec2_new(float_min(a.x, b.x), float_min(a.y, b.y));
-}
-
-/// Get the larger value
-__forceinline vec2 vec2_max(vec2 a, vec2 b)
-{
-    return vec2_new(float_max(a.x, b.x), float_max(a.y, b.y));
-}
-
-/// Clamps the 'x' value to the [min, max].
-__forceinline vec2 vec2_clamp(vec2 v, vec2 min, vec2 max)
-{
-    return vec2_new(float_clamp(v.x, min.x, max.x), float_clamp(v.y, min.y, max.y));
-}
-
-/// Clamps the specified value within the range of 0 to 1
-__forceinline vec2 vec2_saturate(vec2 v)
-{
-    return vec2_new(float_saturate(v.x), float_saturate(v.y));
-}
-
-/// Compares two values, returning 0 or 1 based on which value is greater.
-__forceinline vec2 vec2_step(vec2 a, vec2 b)
-{
-    return vec2_new(float_step(a.x, b.x), float_step(a.y, b.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t.x), float_lerp(a.y, b.y, t.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp1(vec2 a, vec2 b, float t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t), float_lerp(a.y, b.y, t));
-}
-
-/// Compute a smooth Hermite interpolation
-__forceinline vec2 vec2_smoothstep(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_smoothstep(a.x, b.x, t.x), float_smoothstep(a.y, b.y, t.y));
-}
-
-/// Computes square root of 'x'.
-__forceinline vec2 vec2_sqrt(vec2 v)
-{
-    return vec2_new(sqrtf(v.x), sqrtf(v.y));
-}
-
-/// Computes inverse square root of 'x'.
-__forceinline vec2 vec2_rsqrt(vec2 v)
-{
-    return vec2_new(float_rsqrt(v.x), float_rsqrt(v.y));
-}
-
-/// Compute dot product of two vectors
-__forceinline float vec2_dot(vec2 a, vec2 b)
-{
-    return a.x * b.x + a.y * b.y;
-}
-
-/// Compute squared length of vector
-__forceinline float vec2_lensqr(vec2 v)
-{
-    return vec2_dot(v, v);
-}
-
-/// Compute length of vector
-__forceinline float vec2_length(vec2 v)
-{
-    return sqrtf(vec2_lensqr(v));
-}
-
-/// Compute distance from 'a' to b
-__forceinline float vec2_distance(vec2 a, vec2 b)
-{
-    return vec2_length(vec2_sub(a, b));
-}
-
-/// Compute squared distance from 'a' to b
-__forceinline float vec2_distsqr(vec2 a, vec2 b)
-{
-    return vec2_lensqr(vec2_sub(a, b));
-}
-
-/// Compute normalized vector
-__forceinline vec2 vec2_normalize(vec2 v)
-{
-    const float lsqr = vec2_lensqr(v);
-    if (lsqr > 0.0f)
-    {
-        const float f = 1.0f / sqrtf(lsqr);
-        return vec2_new(v.x * f, v.y * f);
-    }
-    else
-    {
-        return v;
-    }
-}
-
-/// Compute reflection vector
-__forceinline vec2 vec2_reflect(vec2 v, vec2 n)
-{
-    return vec2_sub(v, vec2_mul1(n, 2.0f * vec2_dot(v, n)));
-}
-
-/// Compute refraction vector
-__forceinline vec2 vec2_refract(vec2 v, vec2 n, float eta)
-{
-    const float k = 1.0f - eta * eta * (1.0f - vec2_dot(v, n) * vec2_dot(v, n));
-    return k < 0.0f
-        ? vec2_new1(0.0f)
-        : vec2_sub(vec2_mul1(v, eta), vec2_mul1(v, (eta * vec2_dot(v, n) + sqrtf(k))));
-}
-
-/// Compute faceforward vector
-__forceinline vec2 vec2_faceforward(vec2 n, vec2 i, vec2 nref)
-{
-    return vec2_dot(i, nref) < 0.0f ? n : vec2_neg(n);
-}
-
-/// Compute angle vector
-__forceinline float vec2_angle(vec2 v)
-{
-    return atan2f(v.y, v.x);
-}
-
-/// Compute angle vector in degrees
-__forceinline float vec2_angle_deg(vec2 v)
-{
-    return float_rad2deg(vec2_angle(v));
-}
-
-/// Create new vector with angle and length
-__forceinline vec2 vec2_from_angle(float angle, float length) {
-    return vec2_new(cosf(angle) * length, sinf(angle) * length);
-}
-
-/// Create new vector with angle in degrees, and length
-__forceinline vec2 vec2_from_angle_deg(float angle, float length) {
-    return vec2_from_angle(float_rad2deg(angle), length);
-}
-
 /// Computes sign of 'x'
 __forceinline ivec3 vec3_sign(vec3 v)
 {
diff --git a/include/vectormath/vectormath_common.h b/include/vectormath/vectormath_common.h
new file mode 100644
index 0000000..0b6fd3f
--- /dev/null
+++ b/include/vectormath/vectormath_common.h
@@ -0,0 +1,489 @@
+#pragma once
+
+#include "scalarmath.h"
+#include "vectormath_types.h"
+
+// -------------------------------------------------------------
+// Constructors
+// -------------------------------------------------------------
+
+
+/// Create a new vector 2D
+__forceinline vec2 vec2_new(float x, float y)
+{
+    vec2 result;
+    result.x = x;
+    result.y = y;
+    return result;
+}
+
+
+/// Create a new vector 2D with 2 components have same value
+__forceinline vec2 vec2_new1(float s)
+{
+    vec2 result;
+    result.x = s;
+    result.y = s;
+    return result;
+}
+
+
+/// Create a new vector 2D from a vector 3D
+__forceinline vec2 vec2_from_vec3(vec3 v)
+{
+    return v.xy;
+}
+
+// -------------------------------------------------------------
+// Operators-like functions
+// -------------------------------------------------------------
+
+
+/// Compute negate value of v
+__forceinline vec2 vec2_neg(vec2 v)
+{
+    return vec2_new(-v.x, -v.y);
+}
+
+
+/// Add two vectors 
+__forceinline vec2 vec2_add(vec2 a, vec2 b)
+{
+    return vec2_new(a.x + b.x, a.y + b.y);
+}
+
+/// Subtract two vectors
+__forceinline vec2 vec2_sub(vec2 a, vec2 b)
+{
+    return vec2_new(a.x - b.x, a.y - b.y);
+}
+
+
+/// Multiply two vectors
+__forceinline vec2 vec2_mul(vec2 a, vec2 b)
+{
+    return vec2_new(a.x * b.x, a.y * b.y);
+}
+
+
+/// Divide two vectors
+__forceinline vec2 vec2_div(vec2 a, vec2 b)
+{
+    return vec2_new(a.x / b.x, a.y / b.y);
+}
+
+
+/// Add vector with scalar
+__forceinline vec2 vec2_add1(vec2 a, float b)
+{
+    return vec2_add(a, vec2_new1(b));
+}
+
+
+/// Subtract vector with scalar
+__forceinline vec2 vec2_sub1(vec2 a, float b)
+{
+    return vec2_sub(a, vec2_new1(b));
+}
+
+
+/// Multiply vector with scalar
+__forceinline vec2 vec2_mul1(vec2 a, float b)
+{
+    return vec2_mul(a, vec2_new1(b));
+}
+
+
+/// Divide vector with scalar
+__forceinline vec2 vec2_div1(vec2 a, float b)
+{
+    return vec2_div(a, vec2_new1(b));
+}
+
+
+/// Compute Mul-Add algorithm: 
+///     mul_add(a, b, c) = c - a * b 
+__forceinline vec2 vec2_mul_add(vec2 a, vec2 b, vec2 c)
+{
+    return vec2_add(c, vec2_mul(a, b));
+}
+
+
+/// Compute Mul-Sub algorithm:
+///     mul_sub(a, b, c) = c - a * b
+__forceinline vec2 vec2_mul_sub(vec2 a, vec2 b, vec2 c)
+{
+    return vec2_sub(c, vec2_mul(a, b));
+}
+
+
+/// Check if two vectors are equal
+__forceinline bool vec2_equal(vec2 a, vec2 b)
+{
+    return a.x == b.x && a.y == b.y;
+}
+
+
+/// Check if two vectors are not equal (faster than !vec2_equal)
+__forceinline bool vec2_not_equal(vec2 a, vec2 b)
+{
+    return a.x != b.x || a.y != b.y;
+}
+
+
+/// Check if two vectors are closely equal
+__forceinline bool vec2_isclose(vec2 a, vec2 b)
+{
+    return float_isclose(a.x, b.x)
+        && float_isclose(a.y, b.y);
+}
+
+
+// -------------------------------------------------------------
+// Functions
+// -------------------------------------------------------------
+
+
+/// Computes sign of 'x'
+__forceinline ivec2 vec2_sign(vec2 v)
+{
+    ivec2 result = { float_sign(v.x), float_sign(v.y) };
+    return result;
+}
+
+
+/// Computes absolute value
+__forceinline vec2 vec2_abs(vec2 v)
+{
+    return vec2_new(fabsf(v.x), fabsf(v.y));
+}
+
+
+/// Computes cosine
+__forceinline vec2 vec2_cos(vec2 v)
+{
+    return vec2_new(cosf(v.x), cosf(v.y));
+}
+
+
+/// Computes sine
+__forceinline vec2 vec2_sin(vec2 v)
+{
+    return vec2_new(sinf(v.x), sinf(v.y));
+}
+
+
+/// Computes tangent
+__forceinline vec2 vec2_tan(vec2 v)
+{
+    return vec2_new(tanf(v.x), tanf(v.y));
+}
+
+
+/// Computes hyperbolic cosine
+__forceinline vec2 vec2_cosh(vec2 v)
+{
+    return vec2_new(coshf(v.x), coshf(v.y));
+}
+
+
+/// Computes hyperbolic sine
+__forceinline vec2 vec2_sinh(vec2 v)
+{
+    return vec2_new(sinhf(v.x), sinhf(v.y));
+}
+
+
+/// Computes hyperbolic tangent
+__forceinline vec2 vec2_tanh(vec2 v)
+{
+    return vec2_new(tanhf(v.x), tanhf(v.y));
+}
+
+
+/// Computes inverse cosine
+__forceinline vec2 vec2_acos(vec2 v)
+{
+    return vec2_new(acosf(v.x), acosf(v.y));
+}
+
+
+/// Computes inverse sine
+__forceinline vec2 vec2_asin(vec2 v)
+{
+    return vec2_new(asinf(v.x), asinf(v.y));
+}
+
+
+/// Computes inverse tangent
+__forceinline vec2 vec2_atan(vec2 v)
+{
+    return vec2_new(atanf(v.x), atanf(v.y));
+}
+
+
+/// Computes inverse tangent with 2 args
+__forceinline vec2 vec2_atan2(vec2 a, vec2 b)
+{
+    return vec2_new(atan2f(a.x, b.x), atan2f(a.y, b.y));
+}
+
+
+/// Computes Euler number raised to the power 'x'
+__forceinline vec2 vec2_exp(vec2 v)
+{
+    return vec2_new(expf(v.x), expf(v.y));
+}
+
+
+/// Computes 2 raised to the power 'x'
+__forceinline vec2 vec2_exp2(vec2 v)
+{
+    return vec2_new(exp2f(v.x), exp2f(v.y));
+}
+
+
+/// Computes the base Euler number logarithm
+__forceinline vec2 vec2_log(vec2 v)
+{
+    return vec2_new(logf(v.x), logf(v.y));
+}
+
+
+/// Computes the base 2 logarithm
+__forceinline vec2 vec2_log2(vec2 v)
+{
+    return vec2_new(log2f(v.x), log2f(v.y));
+}
+
+
+/// Computes the base 10 logarithm
+__forceinline vec2 vec2_log10(vec2 v)
+{
+    return vec2_new(log10f(v.x), log10f(v.y));
+}
+
+
+/// Computes the value of base raised to the power exponent
+__forceinline vec2 vec2_pow(vec2 a, vec2 b)
+{
+    return vec2_new(powf(a.x, b.x), powf(a.y, b.y));
+}
+
+
+/// Get the fractal part of floating point
+__forceinline vec2 vec2_frac(vec2 v)
+{
+    return vec2_new(float_frac(v.x), float_frac(v.y));
+}
+
+
+/// Computes the floating-point remainder of the division operation x/y
+__forceinline vec2 vec2_fmod(vec2 a, vec2 b)
+{
+    return vec2_new(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+
+
+/// Computes the smallest integer value not less than 'x'
+__forceinline vec2 vec2_ceil(vec2 v)
+{
+    return vec2_new(ceilf(v.x), ceilf(v.y));
+}
+
+
+/// Computes the largest integer value not greater than 'x'
+__forceinline vec2 vec2_floor(vec2 v)
+{
+    return vec2_new(floorf(v.x), floorf(v.y));
+}
+
+
+/// Computes the nearest integer value
+__forceinline vec2 vec2_round(vec2 v)
+{
+    return vec2_new(roundf(v.x), roundf(v.y));
+}
+
+
+/// Computes the nearest integer not greater in magnitude than 'x'
+__forceinline vec2 vec2_trunc(vec2 v)
+{
+    return vec2_new(truncf(v.x), truncf(v.y));
+}
+
+
+/// Get the smaller value
+__forceinline vec2 vec2_min(vec2 a, vec2 b)
+{
+    return vec2_new(float_min(a.x, b.x), float_min(a.y, b.y));
+}
+
+
+/// Get the larger value
+__forceinline vec2 vec2_max(vec2 a, vec2 b)
+{
+    return vec2_new(float_max(a.x, b.x), float_max(a.y, b.y));
+}
+
+
+/// Clamps the 'x' value to the [min, max].
+__forceinline vec2 vec2_clamp(vec2 v, vec2 min, vec2 max)
+{
+    return vec2_new(float_clamp(v.x, min.x, max.x), float_clamp(v.y, min.y, max.y));
+}
+
+
+/// Clamps the specified value within the range of 0 to 1
+__forceinline vec2 vec2_saturate(vec2 v)
+{
+    return vec2_new(float_saturate(v.x), float_saturate(v.y));
+}
+
+
+/// Compares two values, returning 0 or 1 based on which value is greater.
+__forceinline vec2 vec2_step(vec2 a, vec2 b)
+{
+    return vec2_new(float_step(a.x, b.x), float_step(a.y, b.y));
+}
+
+
+/// Performs a linear interpolation.
+__forceinline vec2 vec2_lerp(vec2 a, vec2 b, vec2 t)
+{
+    return vec2_new(float_lerp(a.x, b.x, t.x), float_lerp(a.y, b.y, t.y));
+}
+
+
+/// Performs a linear interpolation.
+__forceinline vec2 vec2_lerp1(vec2 a, vec2 b, float t)
+{
+    return vec2_new(float_lerp(a.x, b.x, t), float_lerp(a.y, b.y, t));
+}
+
+
+/// Compute a smooth Hermite interpolation
+__forceinline vec2 vec2_smoothstep(vec2 a, vec2 b, vec2 t)
+{
+    return vec2_new(float_smoothstep(a.x, b.x, t.x), float_smoothstep(a.y, b.y, t.y));
+}
+
+
+/// Computes square root of 'x'.
+__forceinline vec2 vec2_sqrt(vec2 v)
+{
+    return vec2_new(sqrtf(v.x), sqrtf(v.y));
+}
+
+
+/// Computes inverse square root of 'x'.
+__forceinline vec2 vec2_rsqrt(vec2 v)
+{
+    return vec2_new(float_rsqrt(v.x), float_rsqrt(v.y));
+}
+
+
+/// Compute dot product of two vectors
+__forceinline float vec2_dot(vec2 a, vec2 b)
+{
+    return a.x * b.x + a.y * b.y;
+}
+
+
+/// Compute squared length of vector
+__forceinline float vec2_lensqr(vec2 v)
+{
+    return vec2_dot(v, v);
+}
+
+
+/// Compute length of vector
+__forceinline float vec2_length(vec2 v)
+{
+    return sqrtf(vec2_lensqr(v));
+}
+
+
+/// Compute distance from 'a' to b
+__forceinline float vec2_distance(vec2 a, vec2 b)
+{
+    return vec2_length(vec2_sub(a, b));
+}
+
+
+/// Compute squared distance from 'a' to b
+__forceinline float vec2_distsqr(vec2 a, vec2 b)
+{
+    return vec2_lensqr(vec2_sub(a, b));
+}
+
+
+/// Compute normalized vector
+__forceinline vec2 vec2_normalize(vec2 v)
+{
+    const float lsqr = vec2_lensqr(v);
+    if (lsqr > 0.0f)
+    {
+        const float f = 1.0f / sqrtf(lsqr);
+        return vec2_new(v.x * f, v.y * f);
+    }
+    else
+    {
+        return v;
+    }
+}
+
+
+/// Compute reflection vector
+__forceinline vec2 vec2_reflect(vec2 v, vec2 n)
+{
+    return vec2_sub(v, vec2_mul1(n, 2.0f * vec2_dot(v, n)));
+}
+
+
+/// Compute refraction vector
+__forceinline vec2 vec2_refract(vec2 v, vec2 n, float eta)
+{
+    const float k = 1.0f - eta * eta * (1.0f - vec2_dot(v, n) * vec2_dot(v, n));
+    return k < 0.0f
+        ? vec2_new1(0.0f)
+        : vec2_sub(vec2_mul1(v, eta), vec2_mul1(v, (eta * vec2_dot(v, n) + sqrtf(k))));
+}
+
+
+/// Compute faceforward vector
+__forceinline vec2 vec2_faceforward(vec2 n, vec2 i, vec2 nref)
+{
+    return vec2_dot(i, nref) < 0.0f ? n : vec2_neg(n);
+}
+
+
+/// Compute angle vector
+__forceinline float vec2_angle(vec2 v)
+{
+    return atan2f(v.y, v.x);
+}
+
+
+/// Compute angle vector in degrees
+__forceinline float vec2_angle_deg(vec2 v)
+{
+    return float_rad2deg(vec2_angle(v));
+}
+
+
+/// Create new vector with angle and length
+__forceinline vec2 vec2_from_angle(float angle, float length)
+{
+    return vec2_new(cosf(angle) * length, sinf(angle) * length);
+}
+
+
+/// Create new vector with angle in degrees, and length
+__forceinline vec2 vec2_from_angle_deg(float angle, float length)
+{
+    return vec2_from_angle(float_rad2deg(angle), length);
+}
+
+//! LEAVE AN EMPTY LINE HERE, REQUIRE BY GCC/G++
+
diff --git a/include/vectormath/vectormath_scalar.h b/include/vectormath/vectormath_scalar.h
index e6c804a..d45b1d8 100644
--- a/include/vectormath/vectormath_scalar.h
+++ b/include/vectormath/vectormath_scalar.h
@@ -2,6 +2,7 @@
 
 #include "scalarmath.h"
 #include "vectormath_types.h"
+#include "vectormath_common.h"
 
 // Should not use both scalar & m128 versions of vector_math together
 #if defined(VECTORMATH_FUNCTIONS_DEFINED)
@@ -15,30 +16,6 @@
 // Constructors
 // -------------------------------------------------------------
 
-/// Create a new vector 2D
-__forceinline vec2 vec2_new(float x, float y)
-{
-    vec2 result;
-    result.x = x;
-    result.y = y;
-    return result;
-}
-
-/// Create a new vector 2D with 2 components have same value
-__forceinline vec2 vec2_new1(float s)
-{
-    vec2 result;
-    result.x = s;
-    result.y = s;
-    return result;
-}
-
-/// Create a new vector 2D from a vector 3D
-__forceinline vec2 vec2_from_vec3(vec3 v)
-{
-    return v.xy;
-}
-
 /// Create a new vector 3D
 __forceinline vec3 vec3_new(float x, float y, float z)
 {
@@ -182,77 +159,6 @@ __forceinline mat4 mat4_load(const float ptr[])
 // Operators-like functions
 // -------------------------------------------------------------
 
-__forceinline vec2 vec2_neg(vec2 v)
-{
-    return vec2_new(-v.x, -v.y);
-}
-
-__forceinline vec2 vec2_add(vec2 a, vec2 b)
-{
-    return vec2_new(a.x + b.x, a.y + b.y);
-}
-
-__forceinline vec2 vec2_sub(vec2 a, vec2 b)
-{
-    return vec2_new(a.x - b.x, a.y - b.y);
-}
-
-__forceinline vec2 vec2_mul(vec2 a, vec2 b)
-{
-    return vec2_new(a.x * b.x, a.y * b.y);
-}
-
-__forceinline vec2 vec2_div(vec2 a, vec2 b)
-{
-    return vec2_new(a.x / b.x, a.y / b.y);
-}
-
-__forceinline vec2 vec2_add1(vec2 a, float b)
-{
-    return vec2_add(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_sub1(vec2 a, float b)
-{
-    return vec2_sub(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul1(vec2 a, float b)
-{
-    return vec2_mul(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_div1(vec2 a, float b)
-{
-    return vec2_div(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul_add(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_add(c, vec2_mul(a, b));
-}
-
-__forceinline vec2 vec2_mul_sub(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_sub(c, vec2_mul(a, b));
-}
-
-__forceinline bool vec2_equal(vec2 a, vec2 b)
-{
-    return a.x == b.x && a.y == b.y;
-}
-
-__forceinline bool vec2_not_equal(vec2 a, vec2 b)
-{
-    return a.x != b.x || a.y != b.y;
-}
-
-__forceinline bool vec2_isclose(vec2 a, vec2 b)
-{
-    return float_isclose(a.x, b.x)
-        && float_isclose(a.y, b.y);
-}
-
 __forceinline vec3 vec3_neg(vec3 v)
 {
     return vec3_new(-v.x, -v.y, -v.z);
@@ -442,301 +348,6 @@ __forceinline bool mat4_not_equal(mat4 a, mat4 b)
 // Functions
 // -------------------------------------------------------------
 
-/// Computes sign of 'x'
-__forceinline ivec2 vec2_sign(vec2 v)
-{
-    ivec2 result = { float_sign(v.x), float_sign(v.y) };
-    return result;
-}
-
-/// Computes absolute value
-__forceinline vec2 vec2_abs(vec2 v)
-{
-    return vec2_new(fabsf(v.x), fabsf(v.y));
-}
-
-/// Computes cosine
-__forceinline vec2 vec2_cos(vec2 v)
-{
-    return vec2_new(cosf(v.x), cosf(v.y));
-}
-
-/// Computes sine
-__forceinline vec2 vec2_sin(vec2 v)
-{
-    return vec2_new(sinf(v.x), sinf(v.y));
-}
-
-/// Computes tangent
-__forceinline vec2 vec2_tan(vec2 v)
-{
-    return vec2_new(tanf(v.x), tanf(v.y));
-}
-
-/// Computes hyperbolic cosine
-__forceinline vec2 vec2_cosh(vec2 v)
-{
-    return vec2_new(coshf(v.x), coshf(v.y));
-}
-
-/// Computes hyperbolic sine
-__forceinline vec2 vec2_sinh(vec2 v)
-{
-    return vec2_new(sinhf(v.x), sinhf(v.y));
-}
-
-/// Computes hyperbolic tangent
-__forceinline vec2 vec2_tanh(vec2 v)
-{
-    return vec2_new(tanhf(v.x), tanhf(v.y));
-}
-
-/// Computes inverse cosine
-__forceinline vec2 vec2_acos(vec2 v)
-{
-    return vec2_new(acosf(v.x), acosf(v.y));
-}
-
-/// Computes inverse sine
-__forceinline vec2 vec2_asin(vec2 v)
-{
-    return vec2_new(asinf(v.x), asinf(v.y));
-}
-
-/// Computes inverse tangent
-__forceinline vec2 vec2_atan(vec2 v)
-{
-    return vec2_new(atanf(v.x), atanf(v.y));
-}
-
-/// Computes inverse tangent with 2 args
-__forceinline vec2 vec2_atan2(vec2 a, vec2 b)
-{
-    return vec2_new(atan2f(a.x, b.x), atan2f(a.y, b.y));
-}
-
-/// Computes Euler number raised to the power 'x'
-__forceinline vec2 vec2_exp(vec2 v)
-{
-    return vec2_new(expf(v.x), expf(v.y));
-}
-
-/// Computes 2 raised to the power 'x'
-__forceinline vec2 vec2_exp2(vec2 v)
-{
-    return vec2_new(exp2f(v.x), exp2f(v.y));
-}
-
-/// Computes the base Euler number logarithm
-__forceinline vec2 vec2_log(vec2 v)
-{
-    return vec2_new(logf(v.x), logf(v.y));
-}
-
-/// Computes the base 2 logarithm
-__forceinline vec2 vec2_log2(vec2 v)
-{
-    return vec2_new(log2f(v.x), log2f(v.y));
-}
-
-/// Computes the base 10 logarithm
-__forceinline vec2 vec2_log10(vec2 v)
-{
-    return vec2_new(log10f(v.x), log10f(v.y));
-}
-
-/// Computes the value of base raised to the power exponent
-__forceinline vec2 vec2_pow(vec2 a, vec2 b)
-{
-    return vec2_new(powf(a.x, b.x), powf(a.y, b.y));
-}
-
-/// Get the fractal part of floating point
-__forceinline vec2 vec2_frac(vec2 v)
-{
-    return vec2_new(float_frac(v.x), float_frac(v.y));
-}
-
-/// Computes the floating-point remainder of the division operation x/y
-__forceinline vec2 vec2_fmod(vec2 a, vec2 b)
-{
-    return vec2_new(fmodf(a.x, b.x), fmodf(a.y, b.y));
-}
-
-/// Computes the smallest integer value not less than 'x'
-__forceinline vec2 vec2_ceil(vec2 v)
-{
-    return vec2_new(ceilf(v.x), ceilf(v.y));
-}
-
-/// Computes the largest integer value not greater than 'x'
-__forceinline vec2 vec2_floor(vec2 v)
-{
-    return vec2_new(floorf(v.x), floorf(v.y));
-}
-
-/// Computes the nearest integer value
-__forceinline vec2 vec2_round(vec2 v)
-{
-    return vec2_new(roundf(v.x), roundf(v.y));
-}
-
-/// Computes the nearest integer not greater in magnitude than 'x'
-__forceinline vec2 vec2_trunc(vec2 v)
-{
-    return vec2_new(truncf(v.x), truncf(v.y));
-}
-
-/// Get the smaller value
-__forceinline vec2 vec2_min(vec2 a, vec2 b)
-{
-    return vec2_new(float_min(a.x, b.x), float_min(a.y, b.y));
-}
-
-/// Get the larger value
-__forceinline vec2 vec2_max(vec2 a, vec2 b)
-{
-    return vec2_new(float_max(a.x, b.x), float_max(a.y, b.y));
-}
-
-/// Clamps the 'x' value to the [min, max].
-__forceinline vec2 vec2_clamp(vec2 v, vec2 min, vec2 max)
-{
-    return vec2_new(float_clamp(v.x, min.x, max.x), float_clamp(v.y, min.y, max.y));
-}
-
-/// Clamps the specified value within the range of 0 to 1
-__forceinline vec2 vec2_saturate(vec2 v)
-{
-    return vec2_new(float_saturate(v.x), float_saturate(v.y));
-}
-
-/// Compares two values, returning 0 or 1 based on which value is greater.
-__forceinline vec2 vec2_step(vec2 a, vec2 b)
-{
-    return vec2_new(float_step(a.x, b.x), float_step(a.y, b.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t.x), float_lerp(a.y, b.y, t.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp1(vec2 a, vec2 b, float t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t), float_lerp(a.y, b.y, t));
-}
-
-/// Compute a smooth Hermite interpolation
-__forceinline vec2 vec2_smoothstep(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_smoothstep(a.x, b.x, t.x), float_smoothstep(a.y, b.y, t.y));
-}
-
-/// Computes square root of 'x'.
-__forceinline vec2 vec2_sqrt(vec2 v)
-{
-    return vec2_new(sqrtf(v.x), sqrtf(v.y));
-}
-
-/// Computes inverse square root of 'x'.
-__forceinline vec2 vec2_rsqrt(vec2 v)
-{
-    return vec2_new(float_rsqrt(v.x), float_rsqrt(v.y));
-}
-
-/// Compute dot product of two vectors
-__forceinline float vec2_dot(vec2 a, vec2 b)
-{
-    return a.x * b.x + a.y * b.y;
-}
-
-/// Compute squared length of vector
-__forceinline float vec2_lensqr(vec2 v)
-{
-    return vec2_dot(v, v);
-}
-
-/// Compute length of vector
-__forceinline float vec2_length(vec2 v)
-{
-    return sqrtf(vec2_lensqr(v));
-}
-
-/// Compute distance from 'a' to b
-__forceinline float vec2_distance(vec2 a, vec2 b)
-{
-    return vec2_length(vec2_sub(a, b));
-}
-
-/// Compute squared distance from 'a' to b
-__forceinline float vec2_distsqr(vec2 a, vec2 b)
-{
-    return vec2_lensqr(vec2_sub(a, b));
-}
-
-/// Compute normalized vector
-__forceinline vec2 vec2_normalize(vec2 v)
-{
-    const float lsqr = vec2_lensqr(v);
-    if (lsqr > 0.0f)
-    {
-        const float f = 1.0f / sqrtf(lsqr);
-        return vec2_new(v.x * f, v.y * f);
-    }
-    else
-    {
-        return v;
-    }
-}
-
-/// Compute reflection vector
-__forceinline vec2 vec2_reflect(vec2 v, vec2 n)
-{
-    return vec2_sub(v, vec2_mul1(n, 2.0f * vec2_dot(v, n)));
-}
-
-/// Compute refraction vector
-__forceinline vec2 vec2_refract(vec2 v, vec2 n, float eta)
-{
-    const float k = 1.0f - eta * eta * (1.0f - vec2_dot(v, n) * vec2_dot(v, n));
-    return k < 0.0f
-        ? vec2_new1(0.0f)
-        : vec2_sub(vec2_mul1(v, eta), vec2_mul1(v, (eta * vec2_dot(v, n) + sqrtf(k))));
-}
-
-/// Compute faceforward vector
-__forceinline vec2 vec2_faceforward(vec2 n, vec2 i, vec2 nref)
-{
-    return vec2_dot(i, nref) < 0.0f ? n : vec2_neg(n);
-}
-
-/// Compute angle vector
-__forceinline float vec2_angle(vec2 v)
-{
-    return atan2f(v.y, v.x);
-}
-
-/// Compute angle vector in degrees
-__forceinline float vec2_angle_deg(vec2 v)
-{
-    return float_rad2deg(vec2_angle(v));
-}
-
-/// Create new vector with angle and length
-__forceinline vec2 vec2_from_angle(float angle, float length)
-{
-    return vec2_new(cosf(angle) * length, sinf(angle) * length);
-}
-
-/// Create new vector with angle in degrees, and length
-__forceinline vec2 vec2_from_angle_deg(float angle, float length)
-{
-    return vec2_from_angle(float_rad2deg(angle), length);
-}
-
 /// Computes sign of 'x'
 __forceinline ivec3 vec3_sign(vec3 v)
 {
diff --git a/include/vectormath/vectormath_simd.h b/include/vectormath/vectormath_simd.h
index 8ceaa13..fed9d54 100644
--- a/include/vectormath/vectormath_simd.h
+++ b/include/vectormath/vectormath_simd.h
@@ -2,6 +2,8 @@
 
 #include "scalarmath.h"
 #include "vectormath_types.h"
+#include "vectormath_common.h"
+#include "vectormath_simd_utils.h"
 
 // Make sure m128 is enable
 #if !VECTORMATH_SIMD_ENABLE
@@ -16,288 +18,10 @@
 // Helper for extensions
 #define VECTORMATH_FUNCTIONS_DEFINED
 
-// -------------------------------------------------------------
-// Helper constants
-// -------------------------------------------------------------
-
-// Small epsilon value
-constexpr float SIMD_SLERP_TOL  = 0.999f;
-
-// Common constants used to evaluate simdSinf/cosf4/tanf4
-constexpr float SIMD_SINCOS_CC0 = -0.0013602249f;
-constexpr float SIMD_SINCOS_CC1 =  0.0416566950f;
-constexpr float SIMD_SINCOS_CC2 = -0.4999990225f;
-constexpr float SIMD_SINCOS_SC0 = -0.0001950727f;
-constexpr float SIMD_SINCOS_SC1 =  0.0083320758f;
-constexpr float SIMD_SINCOS_SC2 = -0.1666665247f;
-constexpr float SIMD_SINCOS_KC1 =  1.57079625129f;
-constexpr float SIMD_SINCOS_KC2 =  7.54978995489e-8f;
-
-// Shorthand functions to get the unit vectors as __m128
-__forceinline __m128 m128_unit_1000() { return _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f); }
-__forceinline __m128 m128_unit_0100() { return _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f); }
-__forceinline __m128 m128_unit_0010() { return _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); }
-__forceinline __m128 m128_unit_0001() { return _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); }
-
-// ========================================================
-// Internal helper types and functions
-// ========================================================
-
-// These have to be macros because _MM_SHUFFLE() requires compile-time constants.
-#define m128_ror(vec, i)       (((i) % 4) ? (_mm_shuffle_ps(vec, vec, _MM_SHUFFLE((uint32_t)(i + 3) % 4, (uint32_t)(i + 2) % 4, (uint32_t)(i + 1) % 4, (uint32_t)(i + 0) % 4))) : (vec))
-#define m128_splat(x, e)       _mm_shuffle_ps(x, x, _MM_SHUFFLE(e, e, e, e))
-#define m128_sld(vec, vec2, x) m128_ror(vec, ((x) / 4))
-
-__forceinline __m128 m128_mul_add(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_add_ps(c, _mm_mul_ps(a, b));
-}
-
-__forceinline __m128 m128_mul_sub(__m128 a, __m128 b, __m128 c)
-{
-    return _mm_sub_ps(c, _mm_mul_ps(a, b));
-}
-
-__forceinline __m128 m128_merge_hi(__m128 a, __m128 b)
-{
-    return _mm_unpacklo_ps(a, b);
-}
-
-__forceinline __m128 m128_merge_lo(__m128 a, __m128 b)
-{
-    return _mm_unpackhi_ps(a, b);
-}
-
-__forceinline __m128 m128_select(__m128 a, __m128 b, __m128 mask)
-{
-    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
-}
-
-__forceinline __m128 m128_negatef(__m128 x)
-{
-    return _mm_sub_ps(_mm_setzero_ps(), x);
-}
-
-__forceinline __m128 m128_fabsf(__m128 x)
-{
-    return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
-    //return _mm_andnot_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
-}
-
-__forceinline __m128 m128_acosf(__m128 x)
-{
-    const __m128 xabs   = m128_fabsf(x);
-    const __m128 select = _mm_cmplt_ps(x, _mm_setzero_ps());
-    const __m128 t1     = _mm_sqrt_ps(_mm_sub_ps(_mm_set1_ps(1.0f), xabs));
-
-    /* Instruction counts can be reduced if the polynomial was
-     * computed entirely from nested (dependent) fma's. However,
-     * to reduce the number of pipeline stalls, the polygon is evaluated
-     * in two halves (hi and lo).
-     */
-    const __m128 xabs2 = _mm_mul_ps(xabs, xabs);
-    const __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
-
-    const __m128 hi = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0012624911f),
-        xabs, _mm_set1_ps( 0.0066700901f)),
-        xabs, _mm_set1_ps(-0.0170881256f)),
-        xabs, _mm_set1_ps( 0.0308918810f));
-
-    const __m128 lo = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0501743046f),
-        xabs, _mm_set1_ps( 0.0889789874f)),
-        xabs, _mm_set1_ps(-0.2145988016f)),
-        xabs, _mm_set1_ps( 1.5707963050f));
-
-    const __m128 result = m128_mul_add(hi, xabs4, lo);
-
-    // Adjust the result if x is negative.
-    return m128_select(
-        _mm_mul_ps(t1, result),                                     // Positive
-        m128_mul_sub(t1, result, _mm_set1_ps(3.1415926535898f)),    // Negative
-        select
-    );
-}
-
-__forceinline __m128 m128_cosf(__m128 x)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 cos = m128_select(cx, sx, cos_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);    
-}
-
-__forceinline __m128 m128_sinf(__m128 x)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 sin = m128_select(cx, sx, sin_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
-}
-
-__forceinline void m128_sinf_cosf(__m128 x, __m128* out_sin, __m128* out_cos)
-{
-    // Range reduction
-    // Find the quadrant the angle falls in
-    // Algo:
-    //      xl = angle * TwoOverPi; 
-    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
-    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
-
-    // Compute the offset based on the quadrant that the angle falls in.
-    // Add 1 to the offset for the cosine.
-    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
-    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
-
-    // Remainder in range [-pi/4..pi/4]
-    const __m128 qf = _mm_cvtepi32_ps(q);
-    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
-
-    // Compute x^2 and x^3
-    const __m128 xl2 = _mm_mul_ps(xl, xl);
-    const __m128 xl3 = _mm_mul_ps(xl2, xl);
-
-    // Compute both the sin and cos of the angles
-    // using a polynomial expression:
-    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
-    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
-    const __m128 cx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
-            xl2, _mm_set1_ps(1.0f));
-
-    const __m128 sx =
-        m128_mul_add(
-            m128_mul_add(
-                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
-            xl3, xl);
-
-    // Use the cosine when the offset is odd and the sin
-    // when the offset is even
-    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
-
-    __m128 sin = m128_select(cx, sx, sin_mask);
-    __m128 cos = m128_select(cx, sx, cos_mask);
-
-    // Flip the sign of the result when (offset mod 4) = 1 or 2
-    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
-
-    // Write the result
-    *out_sin = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
-    *out_cos = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);
-}
-
 // -------------------------------------------------------------
 // Constructors
 // -------------------------------------------------------------
 
-/// Create a new vector 2D
-__forceinline vec2 vec2_new(float x, float y)
-{
-    vec2 result = { x, y };
-    return result;
-}
-
-/// Create a new vector 2D, with components have same value
-__forceinline vec2 vec2_new1(float s)
-{
-    vec2 result = { s, s };
-    return result;
-}
-
-/// Create a new vector 2D from a vector 3D
-__forceinline vec2 vec2_from_vec3(vec3 v)
-{
-    return v.xy;
-}
-
 /// Create a new vector 3D
 __forceinline vec3 vec3_new(float x, float y, float z)
 {
@@ -440,76 +164,6 @@ __forceinline mat4 mat4_load(const float ptr[])
 // Operators-like functions
 // -------------------------------------------------------------
 
-__forceinline vec2 vec2_neg(vec2 v)
-{
-    return vec2_new(-v.x, -v.y);
-}
-
-__forceinline vec2 vec2_add(vec2 a, vec2 b)
-{
-    return vec2_new(a.x + b.x, a.y + b.y);
-}
-
-__forceinline vec2 vec2_sub(vec2 a, vec2 b)
-{
-    return vec2_new(a.x - b.x, a.y - b.y);
-}
-
-__forceinline vec2 vec2_mul(vec2 a, vec2 b)
-{
-    return vec2_new(a.x * b.x, a.y * b.y);
-}
-
-__forceinline vec2 vec2_div(vec2 a, vec2 b)
-{
-    return vec2_new(a.x / b.x, a.y / b.y);
-}
-
-__forceinline vec2 vec2_add1(vec2 a, float b)
-{
-    return vec2_add(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_sub1(vec2 a, float b)
-{
-    return vec2_sub(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul1(vec2 a, float b)
-{
-    return vec2_mul(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_div1(vec2 a, float b)
-{
-    return vec2_div(a, vec2_new1(b));
-}
-
-__forceinline vec2 vec2_mul_add(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_add(c, vec2_mul(a, b));
-}
-
-__forceinline vec2 vec2_mul_sub(vec2 a, vec2 b, vec2 c)
-{
-    return vec2_sub(c, vec2_mul(a, b));
-}
-
-__forceinline bool vec2_equal(vec2 a, vec2 b)
-{
-    return a.x == b.x && a.y == b.y;
-}
-
-__forceinline bool vec2_not_equal(vec2 a, vec2 b)
-{
-    return a.x != b.x || a.y != b.y;
-}
-
-__forceinline bool vec2_isclose(vec2 a, vec2 b)
-{
-    return float_isclose(a.x, b.x) && float_isclose(a.y, b.y);
-}
-
 __forceinline vec3 vec3_neg(vec3 v)
 {
     return vec3_from_m128(_mm_sub_ps(_mm_setzero_ps(), v.m128));
@@ -696,303 +350,6 @@ __forceinline bool mat4_not_equal(mat4 a, mat4 b)
 // Functions
 // -------------------------------------------------------------
 
-/// Computes sign of 'x'
-__forceinline ivec2 vec2_sign(vec2 v)
-{
-    ivec2 result = { float_sign(v.x), float_sign(v.y) };
-    return result;
-}
-
-/// Computes absolute value
-__forceinline vec2 vec2_abs(vec2 v)
-{
-    return vec2_new(fabsf(v.x), fabsf(v.y));
-}
-
-/// Computes cosine
-__forceinline vec2 vec2_cos(vec2 v)
-{
-    return vec2_new(cosf(v.x), cosf(v.y));
-}
-
-/// Computes sine
-__forceinline vec2 vec2_sin(vec2 v)
-{
-    return vec2_new(sinf(v.x), sinf(v.y));
-}
-
-/// Computes tangent
-__forceinline vec2 vec2_tan(vec2 v)
-{
-    return vec2_new(tanf(v.x), tanf(v.y));
-}
-
-/// Computes hyperbolic cosine
-__forceinline vec2 vec2_cosh(vec2 v)
-{
-    return vec2_new(coshf(v.x), coshf(v.y));
-}
-
-/// Computes hyperbolic sine
-__forceinline vec2 vec2_sinh(vec2 v)
-{
-    return vec2_new(sinhf(v.x), sinhf(v.y));
-}
-
-/// Computes hyperbolic tangent
-__forceinline vec2 vec2_tanh(vec2 v)
-{
-    return vec2_new(tanhf(v.x), tanhf(v.y));
-}
-
-/// Computes inverse cosine
-__forceinline vec2 vec2_acos(vec2 v)
-{
-    return vec2_new(acosf(v.x), acosf(v.y));
-}
-
-/// Computes inverse sine
-__forceinline vec2 vec2_asin(vec2 v)
-{
-    return vec2_new(asinf(v.x), asinf(v.y));
-}
-
-/// Computes inverse tangent
-__forceinline vec2 vec2_atan(vec2 v)
-{
-    return vec2_new(atanf(v.x), atanf(v.y));
-}
-
-/// Computes inverse tangent with 2 args
-__forceinline vec2 vec2_atan2(vec2 a, vec2 b)
-{
-    return vec2_new(atan2f(a.x, b.x), atan2f(a.y, b.y));
-}
-
-/// Computes Euler number raised to the power 'x'
-__forceinline vec2 vec2_exp(vec2 v)
-{
-    return vec2_new(expf(v.x), expf(v.y));
-}
-
-/// Computes 2 raised to the power 'x'
-__forceinline vec2 vec2_exp2(vec2 v)
-{
-    return vec2_new(exp2f(v.x), exp2f(v.y));
-}
-
-/// Computes the base Euler number logarithm
-__forceinline vec2 vec2_log(vec2 v)
-{
-    return vec2_new(logf(v.x), logf(v.y));
-}
-
-/// Computes the base 2 logarithm
-__forceinline vec2 vec2_log2(vec2 v)
-{
-    return vec2_new(log2f(v.x), log2f(v.y));
-}
-
-/// Computes the base 10 logarithm
-__forceinline vec2 vec2_log10(vec2 v)
-{
-    return vec2_new(log10f(v.x), log10f(v.y));
-}
-
-/// Computes the value of base raised to the power exponent
-__forceinline vec2 vec2_pow(vec2 a, vec2 b)
-{
-    return vec2_new(powf(a.x, b.x), powf(a.y, b.y));
-}
-
-/// Get the fractal part of floating point
-__forceinline vec2 vec2_frac(vec2 v)
-{
-    return vec2_new(float_frac(v.x), float_frac(v.y));
-}
-
-/// Computes the floating-point remainder of the division operation x/y
-__forceinline vec2 vec2_fmod(vec2 a, vec2 b)
-{
-    return vec2_new(fmodf(a.x, b.x), fmodf(a.y, b.y));
-}
-
-/// Computes the smallest integer value not less than 'x'
-__forceinline vec2 vec2_ceil(vec2 v)
-{
-    return vec2_new(ceilf(v.x), ceilf(v.y));
-}
-
-/// Computes the largest integer value not greater than 'x'
-__forceinline vec2 vec2_floor(vec2 v)
-{
-    return vec2_new(floorf(v.x), floorf(v.y));
-}
-
-/// Computes the nearest integer value
-__forceinline vec2 vec2_round(vec2 v)
-{
-    return vec2_new(roundf(v.x), roundf(v.y));
-}
-
-/// Computes the nearest integer not greater in magnitude than 'x'
-__forceinline vec2 vec2_trunc(vec2 v)
-{
-    return vec2_new(truncf(v.x), truncf(v.y));
-}
-
-/// Get the smaller value
-__forceinline vec2 vec2_min(vec2 a, vec2 b)
-{
-    return vec2_new(float_min(a.x, b.x), float_min(a.y, b.y));
-}
-
-/// Get the larger value
-__forceinline vec2 vec2_max(vec2 a, vec2 b)
-{
-    return vec2_new(float_max(a.x, b.x), float_max(a.y, b.y));
-}
-
-/// Clamps the 'x' value to the [min, max].
-__forceinline vec2 vec2_clamp(vec2 v, vec2 min, vec2 max)
-{
-    return vec2_new(float_clamp(v.x, min.x, max.x), float_clamp(v.y, min.y, max.y));
-}
-
-/// Clamps the specified value within the range of 0 to 1
-__forceinline vec2 vec2_saturate(vec2 v)
-{
-    return vec2_new(float_saturate(v.x), float_saturate(v.y));
-}
-
-/// Compares two values, returning 0 or 1 based on which value is greater.
-__forceinline vec2 vec2_step(vec2 a, vec2 b)
-{
-    return vec2_new(float_step(a.x, b.x), float_step(a.y, b.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t.x), float_lerp(a.y, b.y, t.y));
-}
-
-/// Performs a linear interpolation.
-__forceinline vec2 vec2_lerp1(vec2 a, vec2 b, float t)
-{
-    return vec2_new(float_lerp(a.x, b.x, t), float_lerp(a.y, b.y, t));
-}
-
-/// Compute a smooth Hermite interpolation
-__forceinline vec2 vec2_smoothstep(vec2 a, vec2 b, vec2 t)
-{
-    return vec2_new(float_smoothstep(a.x, b.x, t.x), float_smoothstep(a.y, b.y, t.y));
-}
-
-/// Computes square root of 'x'.
-__forceinline vec2 vec2_sqrt(vec2 v)
-{
-    return vec2_new(sqrtf(v.x), sqrtf(v.y));
-}
-
-/// Computes inverse square root of 'x'.
-__forceinline vec2 vec2_rsqrt(vec2 v)
-{
-    return vec2_new(float_rsqrt(v.x), float_rsqrt(v.y));
-}
-
-/// Compute dot product of two vectors
-__forceinline float vec2_dot(vec2 a, vec2 b)
-{
-    return a.x * b.x + a.y * b.y;
-}
-
-/// Compute squared length of vector
-__forceinline float vec2_lensqr(vec2 v)
-{
-    return vec2_dot(v, v);
-}
-
-/// Compute length of vector
-__forceinline float vec2_length(vec2 v)
-{  
-    return float32_sqrt(vec2_lensqr(v));
-}
-
-/// Compute distance from 'a' to b
-__forceinline float vec2_distance(vec2 a, vec2 b)
-{
-    return vec2_length(vec2_sub(a, b));
-}
-
-/// Compute squared distance from 'a' to b
-__forceinline float vec2_distsqr(vec2 a, vec2 b)
-{
-    return vec2_lensqr(vec2_sub(a, b));
-}
-
-/// Compute normalized vector
-__forceinline vec2 vec2_normalize(vec2 v)
-{
-#ifndef VECTORMATH_USE_EXACT_PRECISION
-    return vec2_mul1(v, float32_fast_rsqrt(vec2_lensqr(v)));
-#else
-    const float lsqr = vec2_lensqr(v);
-    if (lsqr > 0.0f)
-    {
-        const float f = 1.0f / sqrtf(lsqr);
-        return vec2_new(v.x * f, v.y * f);
-    }
-    else
-    {
-        return v;
-    }
-#endif
-}
-
-/// Compute reflection vector
-__forceinline vec2 vec2_reflect(vec2 v, vec2 n)
-{
-    return vec2_sub(v, vec2_mul1(n, 2.0f * vec2_dot(v, n)));
-}
-
-/// Compute refraction vector
-__forceinline vec2 vec2_refract(vec2 v, vec2 n, float eta)
-{
-    const float k = 1.0f - eta * eta * (1.0f - vec2_dot(v, n) * vec2_dot(v, n));
-    return k < 0.0f
-        ? vec2_new1(0.0f)
-        : vec2_sub(vec2_mul1(v, eta), vec2_mul1(v, (eta * vec2_dot(v, n) + sqrtf(k))));
-}
-
-/// Compute faceforward vector
-__forceinline vec2 vec2_faceforward(vec2 n, vec2 i, vec2 nref)
-{
-    return vec2_dot(i, nref) < 0.0f ? n : vec2_neg(n);
-}
-
-/// Compute angle vector
-__forceinline float vec2_angle(vec2 v)
-{
-    return atan2f(v.y, v.x);
-}
-
-/// Compute angle vector in degrees
-__forceinline float vec2_angle_deg(vec2 v)
-{
-    return float_rad2deg(vec2_angle(v));
-}
-
-/// Create new vector with angle and length
-__forceinline vec2 vec2_from_angle(float angle, float length) {
-    return vec2_new(cosf(angle) * length, sinf(angle) * length);
-}
-
-/// Create new vector with angle in degrees, and length
-__forceinline vec2 vec2_from_angle_deg(float angle, float length) {
-    return vec2_from_angle(float_rad2deg(angle), length);
-}
-
 /// Computes sign of 'x'
 __forceinline ivec3 vec3_sign(vec3 v)
 {
diff --git a/include/vectormath/vectormath_simd_utils.h b/include/vectormath/vectormath_simd_utils.h
new file mode 100644
index 0000000..2cc85c9
--- /dev/null
+++ b/include/vectormath/vectormath_simd_utils.h
@@ -0,0 +1,286 @@
+#pragma once
+
+#include "vectormath_types.h"
+
+// Make sure m128 is enable
+#if !VECTORMATH_SIMD_ENABLE && !VECTORMATH_USE_CLANG_EXT
+#error The vectormath support for simd is not enable, please use vector_math_scalar.h instead
+#endif
+
+// -------------------------------------------------------------
+// Helper constants
+// -------------------------------------------------------------
+
+// Small epsilon value
+constexpr float SIMD_SLERP_TOL  = 0.999f;
+
+// Common constants used to evaluate simdSinf/cosf4/tanf4
+constexpr float SIMD_SINCOS_CC0 = -0.0013602249f;
+constexpr float SIMD_SINCOS_CC1 =  0.0416566950f;
+constexpr float SIMD_SINCOS_CC2 = -0.4999990225f;
+constexpr float SIMD_SINCOS_SC0 = -0.0001950727f;
+constexpr float SIMD_SINCOS_SC1 =  0.0083320758f;
+constexpr float SIMD_SINCOS_SC2 = -0.1666665247f;
+constexpr float SIMD_SINCOS_KC1 =  1.57079625129f;
+constexpr float SIMD_SINCOS_KC2 =  7.54978995489e-8f;
+
+
+// Shorthand functions to get the unit vectors as __m128
+__forceinline __m128 m128_unit_1000(void) { return _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f); }
+__forceinline __m128 m128_unit_0100(void) { return _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f); }
+__forceinline __m128 m128_unit_0010(void) { return _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); }
+__forceinline __m128 m128_unit_0001(void) { return _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); }
+
+
+// ========================================================
+// Internal helper types and functions
+// ========================================================
+
+
+// These have to be macros because _MM_SHUFFLE() requires compile-time constants.
+#define m128_ror(vec, i)       (((i) % 4) ? (_mm_shuffle_ps(vec, vec, _MM_SHUFFLE((uint32_t)(i + 3) % 4, (uint32_t)(i + 2) % 4, (uint32_t)(i + 1) % 4, (uint32_t)(i + 0) % 4))) : (vec))
+#define m128_splat(x, e)       _mm_shuffle_ps(x, x, _MM_SHUFFLE(e, e, e, e))
+#define m128_sld(vec, vec2, x) m128_ror(vec, ((x) / 4))
+
+
+__forceinline __m128 m128_mul_add(__m128 a, __m128 b, __m128 c)
+{
+    return _mm_add_ps(c, _mm_mul_ps(a, b));
+}
+
+
+__forceinline __m128 m128_mul_sub(__m128 a, __m128 b, __m128 c)
+{
+    return _mm_sub_ps(c, _mm_mul_ps(a, b));
+}
+
+
+__forceinline __m128 m128_merge_hi(__m128 a, __m128 b)
+{
+    return _mm_unpacklo_ps(a, b);
+}
+
+
+__forceinline __m128 m128_merge_lo(__m128 a, __m128 b)
+{
+    return _mm_unpackhi_ps(a, b);
+}
+
+
+__forceinline __m128 m128_select(__m128 a, __m128 b, __m128 mask)
+{
+    return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+}
+
+
+__forceinline __m128 m128_negatef(__m128 x)
+{
+    return _mm_sub_ps(_mm_setzero_ps(), x);
+}
+
+
+__forceinline __m128 m128_fabsf(__m128 x)
+{
+    return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
+    //return _mm_andnot_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
+}
+
+
+__forceinline __m128 m128_acosf(__m128 x)
+{
+    const __m128 xabs   = m128_fabsf(x);
+    const __m128 select = _mm_cmplt_ps(x, _mm_setzero_ps());
+    const __m128 t1     = _mm_sqrt_ps(_mm_sub_ps(_mm_set1_ps(1.0f), xabs));
+
+    /* Instruction counts can be reduced if the polynomial was
+     * computed entirely from nested (dependent) fma's. However,
+     * to reduce the number of pipeline stalls, the polygon is evaluated
+     * in two halves (hi and lo).
+     */
+    const __m128 xabs2 = _mm_mul_ps(xabs, xabs);
+    const __m128 xabs4 = _mm_mul_ps(xabs2, xabs2);
+
+    const __m128 hi = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0012624911f),
+        xabs, _mm_set1_ps( 0.0066700901f)),
+        xabs, _mm_set1_ps(-0.0170881256f)),
+        xabs, _mm_set1_ps( 0.0308918810f));
+
+    const __m128 lo = m128_mul_add(m128_mul_add(m128_mul_add(_mm_set1_ps(-0.0501743046f),
+        xabs, _mm_set1_ps( 0.0889789874f)),
+        xabs, _mm_set1_ps(-0.2145988016f)),
+        xabs, _mm_set1_ps( 1.5707963050f));
+
+    const __m128 result = m128_mul_add(hi, xabs4, lo);
+
+    // Adjust the result if x is negative.
+    return m128_select(
+        _mm_mul_ps(t1, result),                                     // Positive
+        m128_mul_sub(t1, result, _mm_set1_ps(3.1415926535898f)),    // Negative
+        select
+    );
+}
+
+
+/// Compute cosf for SIMD 128
+__forceinline __m128 m128_cosf(__m128 x)
+{
+    // Range reduction
+    // Find the quadrant the angle falls in
+    // Algo:
+    //      xl = angle * TwoOverPi; 
+    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
+    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
+
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine.
+    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
+    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
+
+    // Remainder in range [-pi/4..pi/4]
+    const __m128 qf = _mm_cvtepi32_ps(q);
+    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
+
+    // Compute x^2 and x^3
+    const __m128 xl2 = _mm_mul_ps(xl, xl);
+    const __m128 xl3 = _mm_mul_ps(xl2, xl);
+
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
+    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    const __m128 cx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
+            xl2, _mm_set1_ps(1.0f));
+
+    const __m128 sx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
+            xl3, xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
+
+    __m128 cos = m128_select(cx, sx, cos_mask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
+
+    // Write the result
+    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);    
+}
+
+
+/// Compute sinf for SIMD 128
+__forceinline __m128 m128_sinf(__m128 x)
+{
+    // Range reduction
+    // Find the quadrant the angle falls in
+    // Algo:
+    //      xl = angle * TwoOverPi; 
+    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
+    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
+
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine.
+    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
+
+    // Remainder in range [-pi/4..pi/4]
+    const __m128 qf = _mm_cvtepi32_ps(q);
+    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
+
+    // Compute x^2 and x^3
+    const __m128 xl2 = _mm_mul_ps(xl, xl);
+    const __m128 xl3 = _mm_mul_ps(xl2, xl);
+
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
+    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    const __m128 cx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
+            xl2, _mm_set1_ps(1.0f));
+
+    const __m128 sx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
+            xl3, xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
+
+    __m128 sin = m128_select(cx, sx, sin_mask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
+
+    // Write the result
+    return m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
+}
+
+
+/// Compute cosf and sinf for SIMD 128 in one function call
+__forceinline void m128_sinf_cosf(__m128 x, __m128* out_sin, __m128* out_cos)
+{
+    // Range reduction
+    // Find the quadrant the angle falls in
+    // Algo:
+    //      xl = angle * TwoOverPi; 
+    //      q = (int32_t)(ceil(abs(xl)) * sign(xl))
+    const __m128i q = _mm_cvtps_epi32(_mm_mul_ps(x, _mm_set1_ps(0.63661977236f)));
+
+    // Compute the offset based on the quadrant that the angle falls in.
+    // Add 1 to the offset for the cosine.
+    const __m128i offset_sin = _mm_and_si128(q, _mm_set1_epi32(0x3));
+    const __m128i offset_cos = _mm_add_epi32(_mm_set1_epi32(1), offset_sin);
+
+    // Remainder in range [-pi/4..pi/4]
+    const __m128 qf = _mm_cvtepi32_ps(q);
+    const __m128 xl = m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC2), m128_mul_sub(qf, _mm_set1_ps(SIMD_SINCOS_KC1), x));
+
+    // Compute x^2 and x^3
+    const __m128 xl2 = _mm_mul_ps(xl, xl);
+    const __m128 xl3 = _mm_mul_ps(xl2, xl);
+
+    // Compute both the sin and cos of the angles
+    // using a polynomial expression:
+    //      cx = 1.0f + xl2 * ((C0 * xl2 + C1) * xl2 + C2)
+    //      sx =   xl + xl3 * ((S0 * xl2 + S1) * xl2 + S2)
+    const __m128 cx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_CC0), xl2, _mm_set1_ps(SIMD_SINCOS_CC1)), xl2, _mm_set1_ps(SIMD_SINCOS_CC2)),
+            xl2, _mm_set1_ps(1.0f));
+
+    const __m128 sx =
+        m128_mul_add(
+            m128_mul_add(
+                m128_mul_add(_mm_set1_ps(SIMD_SINCOS_SC0), xl2, _mm_set1_ps(SIMD_SINCOS_SC1)), xl2, _mm_set1_ps(SIMD_SINCOS_SC2)),
+            xl3, xl);
+
+    // Use the cosine when the offset is odd and the sin
+    // when the offset is even
+    __m128 sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
+    __m128 cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x1)), _mm_setzero_si128()));
+
+    __m128 sin = m128_select(cx, sx, sin_mask);
+    __m128 cos = m128_select(cx, sx, cos_mask);
+
+    // Flip the sign of the result when (offset mod 4) = 1 or 2
+    sin_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_sin, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
+    cos_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_and_si128(offset_cos, _mm_set1_epi32(0x2)), _mm_setzero_si128()));
+
+    // Write the result
+    *out_sin = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), sin), sin, sin_mask);
+    *out_cos = m128_select(_mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000)), cos), cos, cos_mask);
+}
+
+//! EOF
+
diff --git a/include/vectormath/vectormath_types.h b/include/vectormath/vectormath_types.h
index 7998482..5563a5b 100644
--- a/include/vectormath/vectormath_types.h
+++ b/include/vectormath/vectormath_types.h
@@ -399,7 +399,7 @@ typedef union VECTORMATH_VECTORTYPE(mat4, 16)
 #endif
 
 static_assert(sizeof(vec2)  ==  8, "sizeof(vec2) must be 8 bytes");
-static_assert(sizeof(vec3)  == 16, "sizeof(vec3) must be 12 bytes");
+static_assert(sizeof(vec3)  == 16, "sizeof(vec3) must be 16 bytes");
 static_assert(sizeof(vec4)  == 16, "sizeof(vec4) must be 16 bytes");
 
 static_assert(sizeof(ivec2) ==  8, "sizeof(ivec2) must be 8 bytes");
diff --git a/premake5.lua b/premake5.lua
index a089c90..7fa5c1b 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -101,11 +101,11 @@ end
 
 -- examples
 newoption {
-    trigger = "examples",
+    trigger = "with-examples",
     description = "Generate examples projects"
 }
 
-if _OPTIONS["examples"] then
+if _OPTIONS["with-examples"] then
     local function vectormathexample(name)
         vectormathproject(name)
         do
diff --git a/scripts/vectormath.premake5.lua b/scripts/vectormath.premake5.lua
index 520a613..c8803da 100644
--- a/scripts/vectormath.premake5.lua
+++ b/scripts/vectormath.premake5.lua
@@ -44,12 +44,16 @@ return {
             path.join(subincludedir, "stdmath_extensions.h"),
     
             path.join(subincludedir, "vectormath_types.h"),
+            path.join(subincludedir, "vectormath_common.h"),
     
             path.join(subincludedir, "vectormath_scalar.h"),
+            path.join(subincludedir, "vectormath_clang.h"),
             path.join(subincludedir, "vectormath_simd.h"),
+            path.join(subincludedir, "vectormath_simd_utils.h"),
             
             path.join(subincludedir, "vectormath_operators.h"),
             path.join(subincludedir, "vectormath_swizzles.h"),
+            path.join(subincludedir, "vectormath_generics.h"),
     
             path.join(includedir, "vectormath.h"),
         }
diff --git a/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle b/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle
index 545bcb3..ce8065c 100644
--- a/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle
+++ b/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle
@@ -22,7 +22,7 @@ android {
             }
 
             ndk {
-                abiFilters 'armeabi-v7a', 'arm64-v8a'
+                abiFilters 'armeabi-v7a', 'arm64-v8a', 'x86', 'x86_64'
             }
         }
     }