From 4a4a129e6138b9fe056b9b2b65fe05acc00efc63 Mon Sep 17 00:00:00 2001 From: "maihd.dev" Date: Wed, 27 Mar 2024 23:41:38 +0700 Subject: [PATCH] android: support neon intrinsics --- include/vectormath/sse_to_neon.h | 171 +++++++++++++++++- include/vectormath/vectormath_types.h | 27 ++- .../app/build.gradle | 5 + .../app/src/main/cpp/CMakeLists.txt | 1 + 4 files changed, 181 insertions(+), 23 deletions(-) diff --git a/include/vectormath/sse_to_neon.h b/include/vectormath/sse_to_neon.h index b35732a..c58c41c 100644 --- a/include/vectormath/sse_to_neon.h +++ b/include/vectormath/sse_to_neon.h @@ -1,11 +1,12 @@ #pragma once // Require CPU support neon -#if !defined(VECTORMATH_SUPPORT_NEON) || !VECTORMATH_SUPPORT_NEON +#if !defined(VECTORMATH_NEON_SUPPORT) || !VECTORMATH_NEON_SUPPORT #error Your platform is not supporting Neon instructions set #endif #include +#include // ------------------------------------------------------------- // SSE2NEON @@ -35,6 +36,148 @@ #define __constrange(a,b) \ const +// Use __forceinline for shorthand functions +#if !defined(_MSC_VER) && !defined(__forceinline) +# if defined(__GNUC__) +# define __forceinline static __attribute__((always_inline)) +# elif defined(__cplusplus) +# define __forceinline static inline +# else +# define __forceinline inline +# endif +#endif + +typedef float32x2_t __m64; +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef uint32x4_t __m128u; + +// ****************************************** +// type-safe casting between types +// ****************************************** + +#define vreinterpretq_m128_f16(x) \ + vreinterpretq_f32_f16(x) + +#define vreinterpretq_m128_f32(x) \ + (x) + +#define vreinterpretq_m128_f64(x) \ + vreinterpretq_f32_f64(x) + + +#define vreinterpretq_m128_u8(x) \ + vreinterpretq_f32_u8(x) + +#define vreinterpretq_m128_u16(x) \ + vreinterpretq_f32_u16(x) + +#define vreinterpretq_m128_u32(x) \ + vreinterpretq_f32_u32(x) + +#define vreinterpretq_m128_u64(x) \ + vreinterpretq_f32_u64(x) + + +#define vreinterpretq_m128_s8(x) \ + vreinterpretq_f32_s8(x) + +#define vreinterpretq_m128_s16(x) \ + vreinterpretq_f32_s16(x) + +#define vreinterpretq_m128_s32(x) \ + vreinterpretq_f32_s32(x) + +#define vreinterpretq_m128_s64(x) \ + vreinterpretq_f32_s64(x) + + +#define vreinterpretq_f16_m128(x) \ + vreinterpretq_f16_f32(x) + +#define vreinterpretq_f32_m128(x) \ + (x) + +#define vreinterpretq_f64_m128(x) \ + vreinterpretq_f64_f32(x) + + +#define vreinterpretq_u8_m128(x) \ + vreinterpretq_u8_f32(x) + +#define vreinterpretq_u16_m128(x) \ + vreinterpretq_u16_f32(x) + +#define vreinterpretq_u32_m128(x) \ + vreinterpretq_u32_f32(x) + +#define vreinterpretq_u64_m128(x) \ + vreinterpretq_u64_f32(x) + + +#define vreinterpretq_s8_m128(x) \ + vreinterpretq_s8_f32(x) + +#define vreinterpretq_s16_m128(x) \ + vreinterpretq_s16_f32(x) + +#define vreinterpretq_s32_m128(x) \ + vreinterpretq_s32_f32(x) + +#define vreinterpretq_s64_m128(x) \ + vreinterpretq_s64_f32(x) + + +#define vreinterpretq_m128i_s8(x) \ + vreinterpretq_s32_s8(x) + +#define vreinterpretq_m128i_s16(x) \ + vreinterpretq_s32_s16(x) + +#define vreinterpretq_m128i_s32(x) \ + (x) + +#define vreinterpretq_m128i_s64(x) \ + vreinterpretq_s32_s64(x) + + +#define vreinterpretq_m128i_u8(x) \ + vreinterpretq_s32_u8(x) + +#define vreinterpretq_m128i_u16(x) \ + vreinterpretq_s32_u16(x) + +#define vreinterpretq_m128i_u32(x) \ + vreinterpretq_s32_u32(x) + +#define vreinterpretq_m128i_u64(x) \ + vreinterpretq_s32_u64(x) + + +#define vreinterpretq_s8_m128i(x) \ + vreinterpretq_s8_s32(x) + +#define vreinterpretq_s16_m128i(x) \ + vreinterpretq_s16_s32(x) + +#define vreinterpretq_s32_m128i(x) \ + (x) + +#define vreinterpretq_s64_m128i(x) \ + vreinterpretq_s64_s32(x) + + +#define vreinterpretq_u8_m128i(x) \ + vreinterpretq_u8_s32(x) + +#define vreinterpretq_u16_m128i(x) \ + vreinterpretq_u16_s32(x) + +#define vreinterpretq_u32_m128i(x) \ + vreinterpretq_u32_s32(x) + +#define vreinterpretq_u64_m128i(x) \ + vreinterpretq_u64_s32(x) /// union intended to allow direct access to an __m128 variable using the names that the MSVC /// compiler provides. This union should really only be used when trying to access the members @@ -85,13 +228,6 @@ __forceinline __m128 _mm_setzero_ps(void) return vreinterpretq_m128_f32(vdupq_n_f32(0)); } -/// Sets the four single-precision, floating-point values to w. -/// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -__forceinline __m128 _mm_set1_ps(float _w) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); -} - /// Copy single-precision (32-bit) floating-point element a to the lower element of dst, and zero the upper 3 elements. /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss&expand=4901,4895,4901 __forceinline __m128 _mm_set_ss(float a) @@ -100,6 +236,18 @@ __forceinline __m128 _mm_set_ss(float a) return vreinterpretq_m128_f32(vld1q_f32(data)); } +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +__forceinline __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +__forceinline __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + /// Sets the four single-precision, floating-point values to the four inputs. /// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx __forceinline __m128 _mm_set_ps(float w, float z, float y, float x) @@ -108,6 +256,13 @@ __forceinline __m128 _mm_set_ps(float w, float z, float y, float x) return vreinterpretq_m128_f32(vld1q_f32(data)); } +// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +__forceinline __m128 _mm_setr_ps(float w, float z , float y , float x ) +{ + float __attribute__ ((aligned (16))) data[4] = { w, z, y, x }; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + /// Sets the 4 signed 32-bit integer values in reverse order /// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx __forceinline __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) diff --git a/include/vectormath/vectormath_types.h b/include/vectormath/vectormath_types.h index 9852adc..7998482 100644 --- a/include/vectormath/vectormath_types.h +++ b/include/vectormath/vectormath_types.h @@ -63,21 +63,22 @@ #endif // Detect neon support & enable -#define VECTORMATH_NEON_SUPPORT 0 // No neon support now -//#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(VECTORMATH_SIMD_ENABLE) && VECTORMATH_SIMD_ENABLE) -//# if defined(__ARM_ARCH_7A__) && defined(__ANDROID__) -//# define VECTORMATH_NEON_SUPPORT 0 // NO SUPPORT for Android 32bit -//# else -//# define VECTORMATH_NEON_SUPPORT 1 -//# endif -//#else -//# define VECTORMATH_NEON_SUPPORT 0 -//#endif +//#define VECTORMATH_NEON_SUPPORT 0 // No neon support now +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) +# if defined(__ARM_ARCH_7A__) && defined(__ANDROID__) +# define VECTORMATH_NEON_SUPPORT 0 // NO SUPPORT for Android 32bit +# else +# define VECTORMATH_NEON_SUPPORT 1 +# endif +#else +# define VECTORMATH_NEON_SUPPORT 0 +#endif // Detect SSE support & enable #define VECTORMATH_SSE_SUPPORT 0 #if defined(__SSSE3__) +# undef VECTORMATH_SSE_SUPPORT # define VECTORMATH_SSE_SUPPORT 1 #endif @@ -117,11 +118,7 @@ # include typedef __m128i __m128u; #elif VECTORMATH_NEON_SUPPORT -# include - typedef float32x2_t __m64; - typedef float32x4_t __m128; - typedef int32x4_t __m128i; - typedef uint32x4_t __m128u; +# include "sse_to_neon.h" #elif !VECTORMATH_ENABLE_CLANG_EXT typedef struct __m64 { float data[2]; } __m64; typedef struct __m128 { float data[4]; } __m128; diff --git a/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle b/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle index 5712b0f..545bcb3 100644 --- a/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle +++ b/unit_tests/projects/vectormath_unit_tests_android/app/build.gradle @@ -18,6 +18,11 @@ android { cmake { cppFlags '-std=c++17' arguments "-DANDROID_TOOLCHAIN=clang" + arguments "-DANDROID_ARM_NEON=ON" + } + + ndk { + abiFilters 'armeabi-v7a', 'arm64-v8a' } } } diff --git a/unit_tests/projects/vectormath_unit_tests_android/app/src/main/cpp/CMakeLists.txt b/unit_tests/projects/vectormath_unit_tests_android/app/src/main/cpp/CMakeLists.txt index 9b5bc18..a8d099c 100644 --- a/unit_tests/projects/vectormath_unit_tests_android/app/src/main/cpp/CMakeLists.txt +++ b/unit_tests/projects/vectormath_unit_tests_android/app/src/main/cpp/CMakeLists.txt @@ -78,4 +78,5 @@ target_compile_options( -DVECTORMATH_USE_EXACT_PRECISION -DVECTORMATH_USE_CLANG_EXT + -DVECTORMATH_SIMD_ENABLE=1 ) \ No newline at end of file