diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h
index c97084904..7b40edea9 100644
--- a/include/Algo-Direct-Common.h
+++ b/include/Algo-Direct-Common.h
@@ -190,7 +190,7 @@ struct DirectInfo
                 xi = xws;
             }
             else {
-                myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+                myassert((Gap==1), "if Gap>1 then X workspace must be provided");
                 xi = x;
             }
 
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index d5fa58d12..7f52fce14 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -52,6 +52,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 private:
     typedef AlgoScalarBase<T, A> base_t;
 
+#ifdef USE_SSE2
     FORCE_INLINE
         //NO_INLINE
         void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
@@ -135,6 +136,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         pr[0] = u.ui32[0];
         pr[1] = u.ui32[2];
     }
+#endif // USE_SSE2
 
 #ifdef USE_AVX
 
diff --git a/include/Portable.h b/include/Portable.h
index 1710b0502..178f8baa9 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -4,10 +4,40 @@
 #include <stdexcept>
 #include <sstream>
 
+#if defined(__aarch64__)
+#ifdef __CUDACC__
+#undef USE_NEON // Doesn't work with nvcc, undefined symbols
+#else
+#include <arm_neon.h>
+#undef USE_NEON // Not yet implemented
+#endif
+#undef USE_AVX // x86_64 only
+#undef USE_AVX2 // x86_64 only
+#undef USE_SSE2 // x86_64 only
+#undef USE_SSE41 // x86_64 only
+#undef USE_SSE42 // x86_64 only
+#undef USE_FMA // x86_64 only
+#ifdef USE_NEON
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+typedef float64x2_t __m128d;
+#else
+typedef struct {float a; float b; float c; float d;} __m128;
+typedef struct {int a; int b; int c; int d;} __m128i;
+typedef struct {double a; double b;} __m128d;
+#endif
+#else
+#undef USE_NEON // ARM64 only
 #ifdef __FMA__
 #define USE_FMA
 #endif
+#if !defined(__SSE2__) && !defined(_MSC_VER)
+#error Compiler must support SSE2
+#endif
+#define USE_SSE2
 
+#if defined(__aarch64__)
+#else
 #ifdef __AVX2__
 #define USE_AVX2
 #endif
@@ -24,7 +54,8 @@
 #ifdef __SSE4_2__
 #define USE_SSE42
 #endif
-
+#endif
+#endif
 
 #ifndef _MSC_VER
 #include <stdint.h>
diff --git a/include/SIMD.h b/include/SIMD.h
index d559e9f55..19e8bffb3 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -2,6 +2,46 @@
 
 #include "Portable.h"
 
+#ifdef USE_SSE2
+#include <emmintrin.h>
+#if defined(USE_AVX) || defined(USE_AVX2)
+#include <immintrin.h>
+#else
+#ifdef USE_SSE41
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+namespace BinSearch {
+namespace Details {
+
+template <InstrSet I, typename T> // NOTE(review): InstrSet is needed here, yet "Type.h" is only included further down — confirm Portable.h makes it visible
+struct FTOITraits{}; // primary template is intentionally empty; vec_t is supplied only by explicit specializations
+
+template <InstrSet I, class T>
+struct FVec; // forward declaration only
+
+template <InstrSet I, class T>
+struct IVec; // forward declaration only
+
+template <InstrSet I, class T>
+struct FVec1; // forward declaration only
+
+template <> struct InstrFloatTraits<Scalar, float> // NOTE(review): specializes a primary template not declared in this header — confirm it is visible at this point
+{
+    typedef __m128  vec_t; // NOTE(review): was 'float' before this change; confirm Scalar users expect __m128
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef __m128d vec_t; // NOTE(review): was 'double' before this change; confirm Scalar users expect __m128d
+};
+
+} // namespace Details
+} // namespace BinSearch
+
+#if !defined(__aarch64__)
 #ifdef USE_SSE42
 #ifndef _MSC_VER
 #include <popcntintrin.h>
@@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32)
 } // namespace
 #endif
 
-#if defined(USE_AVX) || defined(USE_AVX2)
-#include <immintrin.h>
-#else
-#include <emmintrin.h>
-#ifdef USE_SSE41
-#include <smmintrin.h>
-#endif
-#endif
-
 #include "Type.h"
 
 namespace BinSearch {
 namespace Details {
 
-template <InstrSet I, class T>
-struct FVec;
-
-template <InstrSet I, class T>
-struct IVec;
-
-template <InstrSet I, class T>
-struct FVec1;
-
 template <> struct InstrIntTraits<SSE>
 {
     typedef __m128i vec_t;
@@ -64,18 +86,8 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
-template <> struct InstrFloatTraits<Scalar, float>
-{
-    typedef float  vec_t;
-};
-
-template <> struct InstrFloatTraits<Scalar, double>
-{
-    typedef double vec_t;
-};
-
-template <InstrSet I, typename T>
-struct FTOITraits
+template <>
+struct FTOITraits<SSE, float>
 {
     typedef IVec<SSE, float> vec_t;
 };
@@ -295,9 +307,11 @@ FORCE_INLINE FVec<SSE,float> operator-   (const FVec<SSE,float>& a,  const FVec<
 FORCE_INLINE FVec<SSE,float> operator*   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_mul_ps( a, b ); }
 FORCE_INLINE FVec<SSE,float> operator/   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_div_ps( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi        (const FVec<SSE,float>& a)                             { return _mm_cvttps_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,float> operator<=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator>=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator<   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, float> mulSub(const FVec<SSE, float>& a, const FVec<SSE, float>& b, const FVec<SSE, float>& c) { return _mm_fmsub_ps(a, b, c); }
 #endif
@@ -349,9 +363,11 @@ FORCE_INLINE FVec<SSE,double> operator-   (const FVec<SSE,double>& a, const FVec
 FORCE_INLINE FVec<SSE,double> operator*   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_mul_pd( a, b ); }
 FORCE_INLINE FVec<SSE,double> operator/   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_div_pd( a, b ); }
 FORCE_INLINE IVec<SSE,float>  ftoi        (const FVec<SSE,double>& a)                               { return _mm_cvttpd_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,double> operator<=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
 FORCE_INLINE IVec<SSE,double> operator<   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
 FORCE_INLINE IVec<SSE,double> operator>=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, double> mulSub(const FVec<SSE, double>& a, const FVec<SSE, double>& b, const FVec<SSE, double>& c ) { return _mm_fmsub_pd(a, b, c); }
 #endif
@@ -570,3 +586,4 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 
 } // namepsace Details
 } // namespace BinSearch
+#endif // !defined(__aarch64__)
diff --git a/include/Type.h b/include/Type.h
index 720bfb86f..16bf3e3ae 100644
--- a/include/Type.h
+++ b/include/Type.h
@@ -10,7 +10,7 @@ using std::size_t;
 
 namespace BinSearch {
 
-enum InstrSet { Scalar, SSE, AVX };
+enum InstrSet { Scalar, SSE, AVX, Neon }; // instruction-set tags used as template arguments; NOTE(review): confirm Neon-specific trait specializations exist before selecting Neon
 
 #define ALGOENUM(x, b) x,
 enum Algos