From 8407059c370ac2dc2e3af8079b29de8e8609c46e Mon Sep 17 00:00:00 2001 From: Christoper Taylor Date: Wed, 30 Oct 2024 22:45:01 -0400 Subject: [PATCH 1/2] initial import --- src/external/marray/include/vector.hpp | 4 + .../marray/include/vector_generic.hpp | 2157 +++++++++++++++++ src/external/tci/tci/task_set.c | 4 + 3 files changed, 2165 insertions(+) create mode 100644 src/external/marray/include/vector_generic.hpp diff --git a/src/external/marray/include/vector.hpp b/src/external/marray/include/vector.hpp index 1aad93b06..e5b93a655 100644 --- a/src/external/marray/include/vector.hpp +++ b/src/external/marray/include/vector.hpp @@ -34,6 +34,10 @@ struct vector_traits #include "vector_sse41.hpp" +#else + +#include "vector_generic.hpp" + #endif #if __GNUC__ >= 6 diff --git a/src/external/marray/include/vector_generic.hpp b/src/external/marray/include/vector_generic.hpp new file mode 100644 index 000000000..c89b23bfa --- /dev/null +++ b/src/external/marray/include/vector_generic.hpp @@ -0,0 +1,2157 @@ +#ifndef _MARRAY_VECTOR_GENERIC_HPP_ +#define _MARRAY_VECTOR_GENERIC_HPP_ + +#include "vector.hpp" + +#include +#include + +namespace MArray +{ + +template +struct vector_base { +}; + +template<> +struct vector_base { + inline static float result_value[4]; +}; + +template<> +struct vector_base { + inline static double result_value[2]; +}; + +template<> +struct vector_base> { + inline static float result_value[4]; +}; + +template<> +struct vector_base { + using value_type = uint8_t; + inline static value_type result_value[16]; +}; + +template<> +struct vector_base { + using value_type = uint16_t; + inline static value_type result_value[8]; +}; + +template<> +struct vector_base { + using value_type = uint32_t; + inline static value_type result_value[4]; +}; + +template<> +struct vector_base { + using value_type = uint64_t; + inline static value_type result_value[2]; +}; + +template <> +struct vector_traits +{ + constexpr static unsigned vector_width = 4; + constexpr 
static size_t alignment = 64; + + using vector_type = float*; + using value_type = float; + + template static + detail::enable_if_t::value, float*> + convert(float* v) + { + return v; + } + + template static + detail::enable_if_t::value, double*> + convert(float* v) + { + return (double*)v; + } + + template static + detail::enable_if_t>::value, float*> + convert(float* v) + { + return v; + } + + template static + detail::enable_if_t::value, int8_t*> + convert(float* v) + { + //ivector_type i32 = _mm_cvtps_epi32(v); + //ivector_type i16 = _mm_packs_epi32(i32, i32); + //return _mm_packs_epi16(i16, i16); + return (int8_t*)v; + } + + template static + detail::enable_if_t::value, uint8_t*> + convert(float* v) + { + return (uint8_t*)v; + } + + template static + detail::enable_if_t::value, int16_t*> + convert(float* v) + { + return (int16_t*)v; + } + + template static + detail::enable_if_t::value, uint16_t*> + convert(float* v) + { + return (uint16_t*)v; + } + + template static + detail::enable_if_t::value, int32_t*> + convert(float* v) + { + return (int32_t*)v; + } + + template static + detail::enable_if_t::value, uint32_t*> + convert(float* v) + { + return (uint32_t*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T*> + convert(float* v) + { + return (T*)v; + } + + template static + detail::enable_if_t + load(const float* ptr) + { + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + vector_base::result_value[2] = ptr[2]; + vector_base::result_value[3] = ptr[3]; + return vector_base::result_value; + } + + template static + detail::enable_if_t + load(const float* ptr) + { + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + vector_base::result_value[2] = ptr[2]; + vector_base::result_value[3] = ptr[3]; + return vector_base::result_value; + } + + template static + detail::enable_if_t + load(const float* ptr) + { + vector_base::result_value[0] = ptr[0]; + 
vector_base::result_value[1] = ptr[1]; + vector_base::result_value[2] = ptr[2]; + vector_base::result_value[3] = ptr[3]; + return vector_base::result_value; + } + + static float* load1(const float* ptr) + { + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + vector_base::result_value[2] = ptr[2]; + vector_base::result_value[3] = ptr[3]; + return vector_base::result_value; + } + + static float* set1(float val) + { + vector_base::result_value[0] = val; + vector_base::result_value[1] = val; + vector_base::result_value[2] = val; + vector_base::result_value[3] = val; + return vector_base::result_value; + } + + template static + detail::enable_if_t + store(float* v, float* ptr) + { + //_mm_storeu_ps(ptr, v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + + template static + detail::enable_if_t + store(float * v, float* ptr) + { + //_mm_store_ps(ptr, v); + std::copy(v, v+4, ptr); + } + + template static + detail::enable_if_t + store(double * v, float* ptr) + { + //_mm_store_sd((double*)ptr, _mm_castps_pd(v)); + std::copy(v, v+1, (double*)ptr); + } + + static float* add(float* a, float* b) + { + //return _mm_add_ps(a, b); + vector_base::result_value[0] = a[0] + b[0]; + vector_base::result_value[1] = a[1] + b[1]; + vector_base::result_value[2] = a[2] + b[2]; + vector_base::result_value[3] = a[3] + b[3]; + return vector_base::result_value; + } + + static float* sub(float* a, float* b) + { + //return _mm_sub_ps(a, b); + vector_base::result_value[0] = a[0] - b[0]; + vector_base::result_value[1] = a[1] - b[1]; + vector_base::result_value[2] = a[2] - b[2]; + vector_base::result_value[3] = a[3] - b[3]; + return vector_base::result_value; + } + + static float* mul(float* a, float* b) + { + //return _mm_mul_ps(a, b); + vector_base::result_value[0] = a[0] * b[0]; + vector_base::result_value[1] = a[1] * b[1]; + vector_base::result_value[2] = a[2] * b[2]; + vector_base::result_value[3] = a[3] * b[3]; + return 
vector_base::result_value; + } + + static float* div(float* a, float* b) + { + //return _mm_div_ps(a, b); + vector_base::result_value[0] = a[0] / b[0]; + vector_base::result_value[1] = a[1] / b[1]; + vector_base::result_value[2] = a[2] / b[2]; + vector_base::result_value[3] = a[3] / b[3]; + return vector_base::result_value; + } + + static float* pow(float* a, float* b) + { + //return _mm_setr_ps(std::pow((float)a[0], (float)b[0]), + // std::pow((float)a[1], (float)b[1]), + // std::pow((float)a[2], (float)b[2]), + // std::pow((float)a[3], (float)b[3])); + vector_base::result_value[0] = std::pow(a[0] , b[0]); + vector_base::result_value[1] = std::pow(a[1] , b[1]); + vector_base::result_value[2] = std::pow(a[2] , b[2]); + vector_base::result_value[3] = std::pow(a[3] , b[3]); + return vector_base::result_value; + } + + static float* negate(float* a) + { + //return _mm_xor_ps(a, _mm_set1_ps(-0.0f)); + vector_base::result_value[0] = -a[0]; + vector_base::result_value[1] = -a[1]; + vector_base::result_value[2] = -a[2]; + vector_base::result_value[3] = -a[3]; + return vector_base::result_value; + } + + static float* exp(float* a) + { + //return _mm_setr_ps(std::exp((float)a[0]), + // std::exp((float)a[1]), + // std::exp((float)a[2]), + // std::exp((float)a[3])); + vector_base::result_value[0] = std::exp(a[0]); + vector_base::result_value[1] = std::exp(a[1]); + vector_base::result_value[2] = std::exp(a[2]); + vector_base::result_value[3] = std::exp(a[3]); + return vector_base::result_value; + } + + static float* sqrt(float* a) + { + //return _mm_sqrt_ps(a); + vector_base::result_value[0] = std::sqrt(a[0]); + vector_base::result_value[1] = std::sqrt(a[1]); + vector_base::result_value[2] = std::sqrt(a[2]); + vector_base::result_value[3] = std::sqrt(a[3]); + return vector_base::result_value; + } +}; + +template <> +struct vector_traits +{ + constexpr static unsigned vector_width = 2; + constexpr static size_t alignment = 64; + using vector_type = double*; + using value_type = 
double; + + template static + detail::enable_if_t::value, float*> + convert(double* v) + { + //return _mm_cvtpd_ps(v); + return (float*)v; + } + + template static + detail::enable_if_t::value, double*> + convert(double* v) + { + return v; + } + + template static + detail::enable_if_t>::value, float*> + convert(double* v) + { + //return _mm_unpacklo_ps(_mm_cvtpd_ps(v), _mm_setzero_ps()); + return (float*)v; + } + + template static + detail::enable_if_t::value, int8_t*> + convert(double* v) + { + //ivector_type i32 = _mm_cvtpd_epi32(v); + //ivector_type i16 = _mm_packs_epi32(i32, i32); + //return _mm_packs_epi16(i16, i16); + return (int8_t*)v; + } + + template static + detail::enable_if_t::value, uint8_t*> + convert(double* v) + { + //ivector_type i32 = _mm_cvtpd_epi32(v); + //ivector_type i16 = _mm_packus_epi32(i32, i32); + //return _mm_packus_epi16(i16, i16); + return (uint8_t*)v; + } + + template static + detail::enable_if_t::value, int16_t*> + convert(double* v) + { + //ivector_type i32 = _mm_cvtpd_epi32(v); + //return _mm_packs_epi32(i32, i32); + return (int16_t*)v; + } + + template static + detail::enable_if_t::value, uint16_t*> + convert(double* v) + { + //ivector_type i32 = _mm_cvtpd_epi32(v); + //return _mm_packus_epi32(i32, i32); + return (uint16_t*)v; + } + + template static + detail::enable_if_t::value, int32_t*> + convert(double* v) + { + //return _mm_cvtpd_epi32(v); + return (int32_t*)v; + } + + template static + detail::enable_if_t::value, uint32_t*> + convert(double* v) + { + //return _mm_setr_epi32((uint32_t)v[0], (uint32_t)v[1], + // (uint32_t)v[0], (uint32_t)v[1]); + return (uint32_t*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T*> + convert(double* v) + { + //return _mm_set_epi64x((T)v[1], (T)v[0]); + return (T*)v; + } + + template static + detail::enable_if_t + load(const double* ptr) + { + //return _mm_loadu_pd(ptr); + return const_cast(ptr); + } + + template static + detail::enable_if_t + load(const double* 
ptr) + { + //return _mm_load_pd(ptr); + return const_cast(ptr); + } + + static double * load1(const double* ptr) + { + //return _mm_load1_pd(ptr); + return const_cast(ptr); + } + + static double* set1(double val) + { + //return _mm_set1_pd(val); + vector_base::result_value[0] = val; + vector_base::result_value[1] = val; + return vector_base::result_value; + } + + template static + detail::enable_if_t + store(double* v, double* ptr) + { + //_mm_storeu_pd(ptr, v); + std::copy(v, v+2, ptr); + } + + template static + detail::enable_if_t + store(double* v, double* ptr) + { + //_mm_store_pd(ptr, v); + std::copy(v, v+2, ptr); + } + + static double* add(double* a, double* b) + { + //return _mm_add_pd(a, b); + vector_base::result_value[0] = a[0] + b[0]; + vector_base::result_value[1] = a[1] + b[1]; + return vector_base::result_value; + } + + static double* sub(double* a, double* b) + { + //return _mm_sub_pd(a, b); + vector_base::result_value[0] = a[0] - b[0]; + vector_base::result_value[1] = a[1] - b[1]; + return vector_base::result_value; + } + + static double* mul(double* a, double* b) + { + //return _mm_mul_pd(a, b); + vector_base::result_value[0] = a[0] * b[0]; + vector_base::result_value[1] = a[1] * b[1]; + return vector_base::result_value; + } + + static double* div(double* a, double* b) + { + //return _mm_div_pd(a, b); + vector_base::result_value[0] = a[0] / b[0]; + vector_base::result_value[1] = a[1] / b[1]; + return vector_base::result_value; + } + + static double* pow(double* a, double* b) + { + //return _mm_setr_pd(std::pow((double)a[0], (double)b[0]), + // std::pow((double)a[1], (double)b[1])); + vector_base::result_value[0] = std::pow(a[0] , b[0]); + vector_base::result_value[1] = std::pow(a[1] , b[1]); + return vector_base::result_value; + } + + static double* negate(double* a) + { + //return _mm_xor_pd(a, _mm_set1_pd(-0.0)); + vector_base::result_value[0] = -a[0]; + vector_base::result_value[1] = -a[1]; + return vector_base::result_value; + } + + static 
double* exp(double* a) + { + //return _mm_setr_pd(std::exp((double)a[0]), + // std::exp((double)a[1])); + vector_base::result_value[0] = std::exp(a[0]); + vector_base::result_value[1] = std::exp(a[1]); + return vector_base::result_value; + } + + static double* sqrt(double* a) + { + //return _mm_sqrt_pd(a); + vector_base::result_value[0] = std::sqrt(a[0]); + vector_base::result_value[1] = std::sqrt(a[1]); + return vector_base::result_value; + } +}; + +template <> +struct vector_traits> +{ + constexpr static unsigned vector_width = 2; + constexpr static size_t alignment = 64; + using vector_type = float*; + using value_type = float; + + template static + detail::enable_if_t::value, float* > + convert(float* v) + { + //return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,0,2,0)); + return v; + } + + template static + detail::enable_if_t::value, double*> + convert(float* v) + { + //return _mm_cvtps_pd(convert(v)); + return (double*)v; + } + + template static + detail::enable_if_t>::value, float* > + convert(float* v) + { + return v; + } + + template static + detail::enable_if_t::value, int8_t* > + convert(float* v) + { + //ivector_type i32 = _mm_cvtps_epi32(convert(v)); + //ivector_type i16 = _mm_packs_epi32(i32, i32); + //return _mm_packs_epi16(i16, i16); + return (int8_t*)v; + } + + template static + detail::enable_if_t::value, uint8_t* > + convert(float* v) + { + //ivector_type i32 = _mm_cvtps_epi32(convert(v)); + //ivector_type i16 = _mm_packus_epi32(i32, i32); + //return _mm_packus_epi16(i16, i16); + return (uint8_t*)v; + } + + template static + detail::enable_if_t::value, int16_t* > + convert(float* v) + { + //ivector_type i32 = _mm_cvtps_epi32(convert(v)); + //return _mm_packs_epi32(i32, i32); + return (int16_t*)v; + } + + template static + detail::enable_if_t::value, uint16_t* > + convert(float* v) + { + //ivector_type i32 = _mm_cvtps_epi32(convert(v)); + //return _mm_packus_epi32(i32, i32); + return (uint16_t*)v; + } + + template static + detail::enable_if_t::value, 
int32_t* > + convert(float* v) + { + //return _mm_cvtps_epi32(convert(v)); the cast below must match the declared int32_t* return type + return (int32_t*)v; + } + + template static + detail::enable_if_t::value, uint32_t* > + convert(float* v) + { + //return _mm_setr_epi32((uint32_t)v[0], (uint32_t)v[2], + // (uint32_t)v[0], (uint32_t)v[2]); + return (uint32_t*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, int64_t* > + convert(float* v) + { + //return _mm_set_epi64x((T)v[2], (T)v[0]); + return (int64_t*)v; + } + + template static + detail::enable_if_t + load(const std::complex* ptr) + { + //return _mm_loadu_ps((float*)ptr); + vector_base::result_value[0] = ptr->real(); + vector_base::result_value[1] = ptr->imag(); + return vector_base::result_value; + } + + template static + detail::enable_if_t + load(const std::complex* ptr) + { + //return _mm_load_ps((float*)ptr); + vector_base::result_value[0] = ptr->real(); + vector_base::result_value[1] = ptr->imag(); + return vector_base::result_value; + } + + static float* load1(const std::complex* ptr) + { + //return _mm_castpd_ps(_mm_load1_pd((double*)ptr)); + vector_base::result_value[0] = ptr->real(); + vector_base::result_value[1] = ptr->imag(); + return vector_base::result_value; + } + + static float* set1(std::complex val) + { + //return _mm_castpd_ps(_mm_set1_pd(*(double*)&val)); + vector_base::result_value[0] = val.real(); + vector_base::result_value[1] = val.imag(); + return vector_base::result_value; + } + + template static + detail::enable_if_t + store(float* v, std::complex* ptr) + { + //_mm_storeu_ps((float*)ptr, v); + (*ptr) = {v[0], v[1]}; + } + + template static + detail::enable_if_t + store(float* v, std::complex* ptr) + { + //_mm_store_ps((float*)ptr, v); + (*ptr) = {v[0], v[1]}; + } + + static float* add(float* a, float* b) + { + //return _mm_add_ps(a, b); + vector_base::result_value[0] = a[0] + b[0]; + vector_base::result_value[1] = a[1] + b[1]; + vector_base::result_value[2] = a[2] + b[2]; + vector_base::result_value[3] = a[3] 
+ b[3]; + return vector_base::result_value; + } + + static float* sub(float* a, float* b) + { + //return _mm_sub_ps(a, b); + vector_base::result_value[0] = a[0] - b[0]; + vector_base::result_value[1] = a[1] - b[1]; + vector_base::result_value[2] = a[2] - b[2]; + vector_base::result_value[3] = a[3] - b[3]; + return vector_base::result_value; + } + + static float* mul(float* a, float* b) + { +/* + float* ashuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,3,0,1)); + float* breal = _mm_moveldup_ps(b); + float* bimag = _mm_movehdup_ps(b); + float* tmp1 = _mm_mul_ps( a, breal); // tmp1 = (ar0*br0, ai0*br0, ar1*br1, ai1*br1) + float* tmp2 = _mm_mul_ps(ashuf, bimag); // tmp2 = (ai0*bi0, ar0*bi0, ai1*bi1, ar1*bi1) + return _mm_addsub_ps(tmp1, tmp2); +*/ + //(a + ib) (c + id) = (ac - bd) + i(ad + bc). Each real part is staged in a local because a or b may be the shared result_value buffer. + const float re0 = a[0]*b[0] - a[1]*b[1]; + vector_base::result_value[1] = a[0]*b[1] + a[1]*b[0]; + vector_base::result_value[0] = re0; + const float re1 = a[2]*b[2] - a[3]*b[3]; + vector_base::result_value[3] = a[2]*b[3] + a[3]*b[2]; + vector_base::result_value[2] = re1; + + return vector_base::result_value; + } + + static float* div(float* a, float* b) + { +/* + float* bsqr = _mm_mul_ps(b, b); + bsqr = _mm_hadd_ps(bsqr, bsqr); + bsqr = _mm_shuffle_ps(bsqr, bsqr, _MM_SHUFFLE(3,1,2,0)); // bsqr = (|b0|^2, |b0|^2, |b1|^2, |b1|^2) + + float* ashuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,3,0,1)); + float* breal = _mm_moveldup_ps(b); + float* bimag = _mm_movehdup_ps(b); + float* tmp1 = _mm_mul_ps( a, breal); // tmp1 = ( ar0*br0, ai0*br0, ar1*br1, ai1*br1) + float* tmp2 = _mm_mul_ps(ashuf, bimag); + tmp2 = _mm_xor_ps(tmp2, _mm_set1_ps(-0.0f)); // tmp2 = (-ai0*bi0, -ar0*bi0, -ai1*bi1, -ar1*bi1) + float* abconj = _mm_addsub_ps(tmp1, tmp2); + + return _mm_div_ps(abconj, bsqr); +*/ + const float denom_a = b[0] * b[0] + b[1] * b[1]; // |b0|^2, from a/b = a*conj(b)/|b|^2 + const float re0 = (a[0] * b[0] + a[1] * b[1]) / denom_a; // staged: a or b may be the shared result_value buffer + vector_base::result_value[1] = (a[1] * b[0] - a[0] * b[1]) / denom_a; + vector_base::result_value[0] = re0; + const float denom_b = b[2] * b[2] + b[3] * b[3]; // |b1|^2 + 
const float re1 = (a[2] * b[2] + a[3] * b[3]) / denom_b; + vector_base::result_value[3] = (a[3] * b[2] - a[2] * b[3]) / denom_b; + vector_base::result_value[2] = re1; + return vector_base::result_value; + } + + static float* pow(float* a, float* b) + { + std::complex a0((float)a[0], (float)a[1]); + std::complex a1((float)a[2], (float)a[3]); + std::complex b0((float)b[0], (float)b[1]); + std::complex b1((float)b[2], (float)b[3]); + std::complex c0 = std::pow(a0, b0); + std::complex c1 = std::pow(a1, b1); + vector_base::result_value[0] = c0.real(); + vector_base::result_value[1] = c0.imag(); + vector_base::result_value[2] = c1.real(); + vector_base::result_value[3] = c1.imag(); + return vector_base::result_value; + } + + static float* negate(float* a) + { + vector_base::result_value[0] = 0.-a[0]; + vector_base::result_value[1] = 0.-a[1]; + vector_base::result_value[2] = 0.-a[2]; + vector_base::result_value[3] = 0.-a[3]; + return vector_base::result_value; + } + + static float* exp(float* a) + { + std::complex a0((float)a[0], (float)a[1]); + std::complex a1((float)a[2], (float)a[3]); + std::complex b0 = std::exp(a0); + std::complex b1 = std::exp(a1); + vector_base::result_value[0] = b0.real(); + vector_base::result_value[1] = b0.imag(); + vector_base::result_value[2] = b1.real(); + vector_base::result_value[3] = b1.imag(); + return vector_base::result_value; + } + + static float* sqrt(float* a) + { + std::complex a0((float)a[0], (float)a[1]); + std::complex a1((float)a[2], (float)a[3]); + std::complex b0 = std::sqrt(a0); + std::complex b1 = std::sqrt(a1); + vector_base::result_value[0] = b0.real(); + vector_base::result_value[1] = b0.imag(); + vector_base::result_value[2] = b1.real(); + vector_base::result_value[3] = b1.imag(); + return vector_base::result_value; + } +}; + +template +struct vector_traits::value || + std::is_same::value>> +{ + constexpr static unsigned vector_width = 32; + constexpr static size_t alignment = 64; + + using vector_type = + std::conditional::value, +
int8_t*, + uint8_t* + >; + + using value_type = typename vector_base::value_type; + + template static + detail::enable_if_t::value, float* > + convert(vector_type v) + { + //return _mm_cvtepi32_ps(convert(v)); + return (float*)v; + } + + template static + detail::enable_if_t::value, double* > + convert(vector_type v) + { + //return _mm_cvtepi32_pd(convert(v)); + return (double*)v; + } + + template static + detail::enable_if_t>::value, float* > + convert(vector_type v) + { + //return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); + return (float*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T * > + convert(vector_type v) + { +/* + return std::is_signed::value ? (int32_t*)v + : (uint32_t*)v; +*/ + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T * > + convert(vector_type v) + { + return (T*)v; + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { + return const_cast(ptr); + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { + return const_cast(ptr); + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { + return const_cast(ptr); + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { + return const_cast(ptr); + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { + return const_cast(ptr); + } + + static vector_type load1(const vector_type ptr) + { + return const_cast(ptr); + } + + static vector_type set1(value_type val) + { + vector_base::result_value[0] = val; + vector_base::result_value[1] = val; + vector_base::result_value[2] = val; + vector_base::result_value[3] = val; + vector_base::result_value[4] = 
val; + vector_base::result_value[5] = val; + vector_base::result_value[6] = val; + vector_base::result_value[7] = val; + vector_base::result_value[8] = val; + vector_base::result_value[9] = val; + vector_base::result_value[10] = val; + vector_base::result_value[11] = val; + vector_base::result_value[12] = val; + vector_base::result_value[13] = val; + vector_base::result_value[14] = val; + vector_base::result_value[15] = val; + return vector_base::result_value; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + ptr[4] = v[4]; + ptr[5] = v[5]; + ptr[6] = v[6]; + ptr[7] = v[7]; + ptr[8] = v[8]; + ptr[9] = v[9]; + ptr[10] = v[10]; + ptr[11] = v[11]; + ptr[12] = v[12]; + ptr[13] = v[13]; + ptr[14] = v[14]; + ptr[15] = v[15]; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { + ((int64_t*)ptr)[0] = v[7] << 56 | v[6] << 48 | v[5] << 40 | v[4] << 32 | v[3] << 24 | v[2] << 16 | v[1] << 8 | v[0]; + ((int64_t*)ptr)[1] = v[15] << 56 | v[14] << 48 | v[13] << 40 | v[12] << 32 | v[11] << 24 | v[10] << 16 | v[9] << 8 | v[8]; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { + (*((value_type*)ptr)) = v[7] << 56 | v[6] << 48 | v[5] << 40 | v[4] << 32 | v[3] << 24 | v[2] << 16 | v[1] << 8 | v[0]; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { + //*(int32_t*)ptr = _mm_extract_epi32(v, 0); + // dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + (*((value_type*)ptr)) = v[3] << 24 | v[2] << 16 | v[1] << 8 | v[0]; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { + //*(int16_t*)ptr = _mm_extract_epi16(v, 0); + // (a[127:0] >> (imm8[2:0] * 16))[15:0] + // + (*((value_type*)ptr)) = (v[1] << 8) | v[0]; + } + + static vector_type add(vector_type a, vector_type b) + { + //return _mm_add_epi8(a, b); + 
vector_base::result_value[0] = a[0] + b[0]; + vector_base::result_value[1] = a[1] + b[1]; + vector_base::result_value[2] = a[2] + b[2]; + vector_base::result_value[3] = a[3] + b[3]; + vector_base::result_value[4] = a[4] + b[4]; + vector_base::result_value[5] = a[5] + b[5]; + vector_base::result_value[6] = a[6] + b[6]; + vector_base::result_value[7] = a[7] + b[7]; + vector_base::result_value[8] = a[8] + b[8]; + vector_base::result_value[9] = a[9] + b[9]; + vector_base::result_value[10] = a[10] + b[10]; + vector_base::result_value[11] = a[11] + b[11]; + vector_base::result_value[12] = a[12] + b[12]; + vector_base::result_value[13] = a[13] + b[13]; + vector_base::result_value[14] = a[14] + b[14]; + vector_base::result_value[15] = a[15] + b[15]; + return vector_base::result_value; + } + + static vector_type sub(vector_type a, vector_type b) + { + //return _mm_sub_epi8(a, b); + vector_base::result_value[0] = a[0] - b[0]; + vector_base::result_value[1] = a[1] - b[1]; + vector_base::result_value[2] = a[2] - b[2]; + vector_base::result_value[3] = a[3] - b[3]; + vector_base::result_value[4] = a[4] - b[4]; + vector_base::result_value[5] = a[5] - b[5]; + vector_base::result_value[6] = a[6] - b[6]; + vector_base::result_value[7] = a[7] - b[7]; + vector_base::result_value[8] = a[8] - b[8]; + vector_base::result_value[9] = a[9] - b[9]; + vector_base::result_value[10] = a[10] - b[10]; + vector_base::result_value[11] = a[11] - b[11]; + vector_base::result_value[12] = a[12] - b[12]; + vector_base::result_value[13] = a[13] - b[13]; + vector_base::result_value[14] = a[14] - b[14]; + vector_base::result_value[15] = a[15] - b[15]; + return vector_base::result_value; + } + + static vector_type mul(vector_type a, vector_type b) + { + //ivector_type lo = _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi16(0xff)); + //ivector_type hi = _mm_mullo_epi16(_mm_srli_epi16(a, 8),_mm_srli_epi16(b, 8)); + //return _mm_or_si128(_mm_slli_epi16(hi, 8), lo); + vector_base::result_value[0] = a[0] * 
b[0]; + vector_base::result_value[1] = a[1] * b[1]; + vector_base::result_value[2] = a[2] * b[2]; + vector_base::result_value[3] = a[3] * b[3]; + vector_base::result_value[4] = a[4] * b[4]; + vector_base::result_value[5] = a[5] * b[5]; + vector_base::result_value[6] = a[6] * b[6]; + vector_base::result_value[7] = a[7] * b[7]; + vector_base::result_value[8] = a[8] * b[8]; + vector_base::result_value[9] = a[9] * b[9]; + vector_base::result_value[10] = a[10] * b[10]; + vector_base::result_value[11] = a[11] * b[11]; + vector_base::result_value[12] = a[12] * b[12]; + vector_base::result_value[13] = a[13] * b[13]; + vector_base::result_value[14] = a[14] * b[14]; + vector_base::result_value[15] = a[15] * b[15]; + return vector_base::result_value; + } + + static vector_type div(vector_type a, vector_type b) + { +/* + return _mm_setr_epi8((U)_mm_extract_epi8(a, 0) / + (U)_mm_extract_epi8(b, 0), + (U)_mm_extract_epi8(a, 1) / + (U)_mm_extract_epi8(b, 1), + (U)_mm_extract_epi8(a, 2) / + (U)_mm_extract_epi8(b, 2), + (U)_mm_extract_epi8(a, 3) / + (U)_mm_extract_epi8(b, 3), + (U)_mm_extract_epi8(a, 4) / + (U)_mm_extract_epi8(b, 4), + (U)_mm_extract_epi8(a, 5) / + (U)_mm_extract_epi8(b, 5), + (U)_mm_extract_epi8(a, 6) / + (U)_mm_extract_epi8(b, 6), + (U)_mm_extract_epi8(a, 7) / + (U)_mm_extract_epi8(b, 7), + (U)_mm_extract_epi8(a, 8) / + (U)_mm_extract_epi8(b, 8), + (U)_mm_extract_epi8(a, 9) / + (U)_mm_extract_epi8(b, 9), + (U)_mm_extract_epi8(a,10) / + (U)_mm_extract_epi8(b,10), + (U)_mm_extract_epi8(a,11) / + (U)_mm_extract_epi8(b,11), + (U)_mm_extract_epi8(a,12) / + (U)_mm_extract_epi8(b,12), + (U)_mm_extract_epi8(a,13) / + (U)_mm_extract_epi8(b,13), + (U)_mm_extract_epi8(a,14) / + (U)_mm_extract_epi8(b,14), + (U)_mm_extract_epi8(a,15) / + (U)_mm_extract_epi8(b,15)); +*/ + vector_base::result_value[0] = a[0] / b[0]; + vector_base::result_value[1] = a[1] / b[1]; + vector_base::result_value[2] = a[2] / b[2]; + vector_base::result_value[3] = a[3] / b[3]; + 
vector_base::result_value[4] = a[4] / b[4]; + vector_base::result_value[5] = a[5] / b[5]; + vector_base::result_value[6] = a[6] / b[6]; + vector_base::result_value[7] = a[7] / b[7]; + vector_base::result_value[8] = a[8] / b[8]; + vector_base::result_value[9] = a[9] / b[9]; + vector_base::result_value[10] = a[10] / b[10]; + vector_base::result_value[11] = a[11] / b[11]; + vector_base::result_value[12] = a[12] / b[12]; + vector_base::result_value[13] = a[13] / b[13]; + vector_base::result_value[14] = a[14] / b[14]; + vector_base::result_value[15] = a[15] / b[15]; + return vector_base::result_value; + } + + static vector_type pow(vector_type a, vector_type b) + { +/* + return _mm_setr_epi8((U)std::pow((U)_mm_extract_epi8(a, 0), + (U)_mm_extract_epi8(b, 0)), + (U)std::pow((U)_mm_extract_epi8(a, 1), + (U)_mm_extract_epi8(b, 1)), + (U)std::pow((U)_mm_extract_epi8(a, 2), + (U)_mm_extract_epi8(b, 2)), + (U)std::pow((U)_mm_extract_epi8(a, 3), + (U)_mm_extract_epi8(b, 3)), + (U)std::pow((U)_mm_extract_epi8(a, 4), + (U)_mm_extract_epi8(b, 4)), + (U)std::pow((U)_mm_extract_epi8(a, 5), + (U)_mm_extract_epi8(b, 5)), + (U)std::pow((U)_mm_extract_epi8(a, 6), + (U)_mm_extract_epi8(b, 6)), + (U)std::pow((U)_mm_extract_epi8(a, 7), + (U)_mm_extract_epi8(b, 7)), + (U)std::pow((U)_mm_extract_epi8(a, 8), + (U)_mm_extract_epi8(b, 8)), + (U)std::pow((U)_mm_extract_epi8(a, 9), + (U)_mm_extract_epi8(b, 9)), + (U)std::pow((U)_mm_extract_epi8(a,10), + (U)_mm_extract_epi8(b,10)), + (U)std::pow((U)_mm_extract_epi8(a,11), + (U)_mm_extract_epi8(b,11)), + (U)std::pow((U)_mm_extract_epi8(a,12), + (U)_mm_extract_epi8(b,12)), + (U)std::pow((U)_mm_extract_epi8(a,13), + (U)_mm_extract_epi8(b,13)), + (U)std::pow((U)_mm_extract_epi8(a,14), + (U)_mm_extract_epi8(b,14)), + (U)std::pow((U)_mm_extract_epi8(a,15), + (U)_mm_extract_epi8(b,15))); +*/ + vector_base::result_value[0] = (U)std::pow(a[0], b[0]); + vector_base::result_value[1] = (U)std::pow(a[1], b[1]); + vector_base::result_value[2] = 
(U)std::pow(a[2], b[2]); + vector_base::result_value[3] = (U)std::pow(a[3], b[3]); + vector_base::result_value[4] = (U)std::pow(a[4], b[4]); + vector_base::result_value[5] = (U)std::pow(a[5], b[5]); + vector_base::result_value[6] = (U)std::pow(a[6], b[6]); + vector_base::result_value[7] = (U)std::pow(a[7], b[7]); + vector_base::result_value[8] = (U)std::pow(a[8], b[8]); + vector_base::result_value[9] = (U)std::pow(a[9], b[9]); + vector_base::result_value[10] = (U)std::pow(a[10], b[10]); + vector_base::result_value[11] = (U)std::pow(a[11], b[11]); + vector_base::result_value[12] = (U)std::pow(a[12], b[12]); + vector_base::result_value[13] = (U)std::pow(a[13], b[13]); + vector_base::result_value[14] = (U)std::pow(a[14], b[14]); + vector_base::result_value[15] = (U)std::pow(a[15], b[15]); + + return vector_base::result_value; + } + + static vector_type negate(vector_type a) + { + //return _mm_sub_epi8(_mm_setzero_si128(), a); + vector_base::result_value[0] = -(a[0]); + vector_base::result_value[1] = -(a[1]); + vector_base::result_value[2] = -(a[2]); + vector_base::result_value[3] = -(a[3]); + vector_base::result_value[4] = -(a[4]); + vector_base::result_value[5] = -(a[5]); + vector_base::result_value[6] = -(a[6]); + vector_base::result_value[7] = -(a[7]); + vector_base::result_value[8] = -(a[8]); + vector_base::result_value[9] = -(a[9]); + vector_base::result_value[10] = -(a[10]); + vector_base::result_value[11] = -(a[11]); + vector_base::result_value[12] = -(a[12]); + vector_base::result_value[13] = -(a[13]); + vector_base::result_value[14] = -(a[14]); + vector_base::result_value[15] = -(a[15]); + + return vector_base::result_value; + } + + static vector_type exp(vector_type a) + { +/* + return _mm_setr_epi8((U)std::exp((U)_mm_extract_epi8(a, 0)), + (U)std::exp((U)_mm_extract_epi8(a, 1)), + (U)std::exp((U)_mm_extract_epi8(a, 2)), + (U)std::exp((U)_mm_extract_epi8(a, 3)), + (U)std::exp((U)_mm_extract_epi8(a, 4)), + (U)std::exp((U)_mm_extract_epi8(a, 5)), + 
(U)std::exp((U)_mm_extract_epi8(a, 6)), + (U)std::exp((U)_mm_extract_epi8(a, 7)), + (U)std::exp((U)_mm_extract_epi8(a, 8)), + (U)std::exp((U)_mm_extract_epi8(a, 9)), + (U)std::exp((U)_mm_extract_epi8(a,10)), + (U)std::exp((U)_mm_extract_epi8(a,11)), + (U)std::exp((U)_mm_extract_epi8(a,12)), + (U)std::exp((U)_mm_extract_epi8(a,13)), + (U)std::exp((U)_mm_extract_epi8(a,14)), + (U)std::exp((U)_mm_extract_epi8(a,15))); +*/ + vector_base::result_value[0] = (U)std::exp(a[0]); + vector_base::result_value[1] = (U)std::exp(a[1]); + vector_base::result_value[2] = (U)std::exp(a[2]); + vector_base::result_value[3] = (U)std::exp(a[3]); + vector_base::result_value[4] = (U)std::exp(a[4]); + vector_base::result_value[5] = (U)std::exp(a[5]); + vector_base::result_value[6] = (U)std::exp(a[6]); + vector_base::result_value[7] = (U)std::exp(a[7]); + vector_base::result_value[8] = (U)std::exp(a[8]); + vector_base::result_value[9] = (U)std::exp(a[9]); + vector_base::result_value[10] = (U)std::exp(a[10]); + vector_base::result_value[11] = (U)std::exp(a[11]); + vector_base::result_value[12] = (U)std::exp(a[12]); + vector_base::result_value[13] = (U)std::exp(a[13]); + vector_base::result_value[14] = (U)std::exp(a[14]); + vector_base::result_value[15] = (U)std::exp(a[15]); + + return vector_base::result_value; + } + + static U* sqrt(U* a) + { +/* + return _mm_setr_epi8((U)std::sqrt((U)_mm_extract_epi8(a, 0)), + (U)std::sqrt((U)_mm_extract_epi8(a, 1)), + (U)std::sqrt((U)_mm_extract_epi8(a, 2)), + (U)std::sqrt((U)_mm_extract_epi8(a, 3)), + (U)std::sqrt((U)_mm_extract_epi8(a, 4)), + (U)std::sqrt((U)_mm_extract_epi8(a, 5)), + (U)std::sqrt((U)_mm_extract_epi8(a, 6)), + (U)std::sqrt((U)_mm_extract_epi8(a, 7)), + (U)std::sqrt((U)_mm_extract_epi8(a, 8)), + (U)std::sqrt((U)_mm_extract_epi8(a, 9)), + (U)std::sqrt((U)_mm_extract_epi8(a,10)), + (U)std::sqrt((U)_mm_extract_epi8(a,11)), + (U)std::sqrt((U)_mm_extract_epi8(a,12)), + (U)std::sqrt((U)_mm_extract_epi8(a,13)), + 
(U)std::sqrt((U)_mm_extract_epi8(a,14)), + (U)std::sqrt((U)_mm_extract_epi8(a,15))); +*/ + vector_base::result_value[0] = (U)std::sqrt(a[0]); + vector_base::result_value[1] = (U)std::sqrt(a[1]); + vector_base::result_value[2] = (U)std::sqrt(a[2]); + vector_base::result_value[3] = (U)std::sqrt(a[3]); + vector_base::result_value[4] = (U)std::sqrt(a[4]); + vector_base::result_value[5] = (U)std::sqrt(a[5]); + vector_base::result_value[6] = (U)std::sqrt(a[6]); + vector_base::result_value[7] = (U)std::sqrt(a[7]); + vector_base::result_value[8] = (U)std::sqrt(a[8]); + vector_base::result_value[9] = (U)std::sqrt(a[9]); + vector_base::result_value[10] = (U)std::sqrt(a[10]); + vector_base::result_value[11] = (U)std::sqrt(a[11]); + vector_base::result_value[12] = (U)std::sqrt(a[12]); + vector_base::result_value[13] = (U)std::sqrt(a[13]); + vector_base::result_value[14] = (U)std::sqrt(a[14]); + vector_base::result_value[15] = (U)std::sqrt(a[15]); + + return vector_base::result_value; + } +}; + + +template +struct vector_traits::value || + std::is_same::value>> +{ + constexpr static unsigned vector_width = 16; + constexpr static size_t alignment = 64; + + using vector_type = + std::conditional::value, + int16_t*, + uint16_t* + >; + + using value_type = typename vector_base::value_type; + + template static + detail::enable_if_t::value, float*> + convert(vector_type v) + { +// return _mm_cvtepi32_ps(convert(v)); + return (T*)v; + } + + template static + detail::enable_if_t::value, double*> + convert(vector_type v) + { +// return _mm_cvtepi32_pd(convert(v)); + return (T*)v; + } + + template static + detail::enable_if_t>::value, float*> + convert(vector_type v) + { +// return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { +/* + return std::is_signed::value ? 
_mm_packs_epi16(v, v)
+                                        : _mm_packus_epi16(v, v);
+*/
+        return (T*)v;
+    }
+
+    // Same element width: the pointer is returned retyped as T*.
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int16_t>::value ||
+                        std::is_same<T,uint16_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    // NOTE(review): pointer reinterpretation only; the SSE4.1 reference
+    // widens lanes with _mm_cvtepi16_epi32 / _mm_cvtepu16_epi32.
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int32_t>::value ||
+                        std::is_same<T,uint32_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    // NOTE(review): pointer reinterpretation only; the SSE4.1 reference
+    // widens lanes with _mm_cvtepi16_epi64 / _mm_cvtepu16_epi64.
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int64_t>::value ||
+                        std::is_same<T,uint64_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    // Full-width unaligned load: copies 8 elements from ptr into the static
+    // result buffer and returns that buffer.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 8 && !Aligned, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_loadu_si128((ivector_type*)ptr);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = ptr[lane];
+        return vector_base<U>::result_value;
+    }
+
+    // Full-width aligned load: identical to the unaligned form in this
+    // scalar implementation.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 8 && Aligned, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_load_si128((ivector_type*)ptr);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = ptr[lane];
+        return vector_base<U>::result_value;
+    }
+
+    // Width-4 load: reads only ptr[0..3] and repeats them in the upper half,
+    // as the reference broadcast of one 64-bit group does. The previous body
+    // read ptr[4..7], which lie past the 4 elements being loaded.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_set1_epi64x(*(int64_t*)ptr);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = ptr[lane % 4];
+        return vector_base<U>::result_value;
+    }
+
+    // Width-2 load: reads only ptr[0..1] and repeats the pair across all 8
+    // lanes, as the reference broadcast of one 32-bit group does.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 2, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_set1_epi32(*(int32_t*)ptr);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = ptr[lane % 2];
+        return vector_base<U>::result_value;
+    }
+
+    // Broadcast load: every lane receives the single scalar *ptr. The
+    // previous body copied ptr[0..7], reading 7 elements past the scalar.
+    static vector_type load1(const vector_type ptr)
+    {
+//        return _mm_set1_epi16(*ptr);
+        return set1(*ptr);
+    }
+
+    // Broadcast a value into all 8 lanes of the static result buffer.
+    static vector_type set1(value_type val)
+    {
+//        return _mm_set1_epi16(val);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = val;
+        // This return was missing; flowing off the end of a non-void
+        // function is undefined behaviour.
+        return vector_base<U>::result_value;
+    }
+
+    // Full-width unaligned store of the 8 lanes of v to ptr.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 8 && !Aligned>
+    store(vector_type v, vector_type ptr)
+    {
+//        _mm_storeu_si128((ivector_type*)ptr, v);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    // Full-width aligned store; identical to the unaligned form here.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 8 && Aligned>
+    store(vector_type v, vector_type ptr)
+    {
+//        _mm_store_si128((ivector_type*)ptr, v);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    
template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4>
+    store(vector_type v, vector_type ptr)
+    {
+//        _mm_storel_epi64((ivector_type*)ptr, v);
+        // Width-4 store: only the low 4 lanes (64 bits) belong to the
+        // destination. Writing all 8 lanes, as before, overruns it.
+        for (unsigned lane = 0; lane < 4; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 2>
+    store(vector_type v, vector_type ptr)
+    {
+//        *(int32_t*)ptr = _mm_extract_epi32(v, 0);
+        // Width-2 store: only the low 2 lanes (32 bits) are written.
+        for (unsigned lane = 0; lane < 2; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    // Element-wise a + b over 8 lanes; the result lives in the static
+    // vector_base<U>::result_value buffer.
+    static vector_type add(vector_type a, vector_type b)
+    {
+//        return _mm_add_epi16(a, b);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = (U)(a[lane]) + (U)(b[lane]);
+        return vector_base<U>::result_value;
+    }
+
+    // Element-wise a - b over 8 lanes.
+    static vector_type sub(vector_type a, vector_type b)
+    {
+//        return _mm_sub_epi16(a, b);
+        for (unsigned lane = 0; lane < 8; ++lane)
+            vector_base<U>::result_value[lane] = (U)(a[lane]) - (U)(b[lane]);
+        return vector_base<U>::result_value;
+    }
+
+    // Element-wise a * b over 8 lanes.
+    // NOTE(review): for U = uint16_t both operands are promoted to int, so
+    // large products can overflow a signed int - confirm and multiply in
+    // unsigned if wrap-around is intended.
+    static vector_type mul(vector_type a, vector_type b)
+    {
+//        return _mm_mullo_epi16(a, b);
+        vector_base<U>::result_value[0] = (U)(a[0]) * (U)(b[0]);
+        vector_base<U>::result_value[1] = (U)(a[1]) * (U)(b[1]);
+        vector_base<U>::result_value[2] = (U)(a[2]) * (U)(b[2]);
+        vector_base<U>::result_value[3] = 
(U)(a[3]) * (U)(b[3]); + vector_base::result_value[4] = (U)(a[4]) * (U)(b[4]); + vector_base::result_value[5] = (U)(a[5]) * (U)(b[5]); + vector_base::result_value[6] = (U)(a[6]) * (U)(b[6]); + vector_base::result_value[7] = (U)(a[7]) * (U)(b[7]); + return vector_base::result_value; + } + + static vector_type div(vector_type a, vector_type b) + { +/* + return _mm_setr_epi16((U)_mm_extract_epi16(a, 0) / + (U)_mm_extract_epi16(b, 0), + (U)_mm_extract_epi16(a, 1) / + (U)_mm_extract_epi16(b, 1), + (U)_mm_extract_epi16(a, 2) / + (U)_mm_extract_epi16(b, 2), + (U)_mm_extract_epi16(a, 3) / + (U)_mm_extract_epi16(b, 3), + (U)_mm_extract_epi16(a, 4) / + (U)_mm_extract_epi16(b, 4), + (U)_mm_extract_epi16(a, 5) / + (U)_mm_extract_epi16(b, 5), + (U)_mm_extract_epi16(a, 6) / + (U)_mm_extract_epi16(b, 6), + (U)_mm_extract_epi16(a, 7) / + (U)_mm_extract_epi16(b, 7)); +*/ + vector_base::result_value[0] = (U)(a[0]) / (U)(b[0]); + vector_base::result_value[1] = (U)(a[1]) / (U)(b[1]); + vector_base::result_value[2] = (U)(a[2]) / (U)(b[2]); + vector_base::result_value[3] = (U)(a[3]) / (U)(b[3]); + vector_base::result_value[4] = (U)(a[4]) / (U)(b[4]); + vector_base::result_value[5] = (U)(a[5]) / (U)(b[5]); + vector_base::result_value[6] = (U)(a[6]) / (U)(b[6]); + vector_base::result_value[7] = (U)(a[7]) / (U)(b[7]); + return vector_base::result_value; + } + + static vector_type pow(vector_type a, vector_type b) + { +/* + return _mm_setr_epi16((U)std::pow((U)_mm_extract_epi16(a, 0), + (U)_mm_extract_epi16(b, 0)), + (U)std::pow((U)_mm_extract_epi16(a, 1), + (U)_mm_extract_epi16(b, 1)), + (U)std::pow((U)_mm_extract_epi16(a, 2), + (U)_mm_extract_epi16(b, 2)), + (U)std::pow((U)_mm_extract_epi16(a, 3), + (U)_mm_extract_epi16(b, 3)), + (U)std::pow((U)_mm_extract_epi16(a, 4), + (U)_mm_extract_epi16(b, 4)), + (U)std::pow((U)_mm_extract_epi16(a, 5), + (U)_mm_extract_epi16(b, 5)), + (U)std::pow((U)_mm_extract_epi16(a, 6), + (U)_mm_extract_epi16(b, 6)), + (U)std::pow((U)_mm_extract_epi16(a, 7), + 
(U)_mm_extract_epi16(b, 7))); + +*/ + vector_base::result_value[0] = (U)std::pow(a[0], b[0]); + vector_base::result_value[1] = (U)std::pow(a[1], b[1]); + vector_base::result_value[2] = (U)std::pow(a[2], b[2]); + vector_base::result_value[3] = (U)std::pow(a[3], b[3]); + vector_base::result_value[4] = (U)std::pow(a[4], b[4]); + vector_base::result_value[5] = (U)std::pow(a[5], b[5]); + vector_base::result_value[6] = (U)std::pow(a[6], b[6]); + vector_base::result_value[7] = (U)std::pow(a[7], b[7]); + return vector_base::result_value; + } + + static vector_type negate(vector_type a) + { +// return _mm_sub_epi16(_mm_setzero_si128(), a); + vector_base::result_value[0] = (U)(0) - (U)(a[0]); + vector_base::result_value[1] = (U)(0) - (U)(a[1]); + vector_base::result_value[2] = (U)(0) - (U)(a[2]); + vector_base::result_value[3] = (U)(0) - (U)(a[3]); + vector_base::result_value[4] = (U)(0) - (U)(a[4]); + vector_base::result_value[5] = (U)(0) - (U)(a[5]); + vector_base::result_value[6] = (U)(0) - (U)(a[6]); + vector_base::result_value[7] = (U)(0) - (U)(a[7]); + return vector_base::result_value; + } + + static vector_type exp(vector_type a) + { +/* + return _mm_setr_epi16((U)std::exp((U)_mm_extract_epi16(a, 0)), + (U)std::exp((U)_mm_extract_epi16(a, 1)), + (U)std::exp((U)_mm_extract_epi16(a, 2)), + (U)std::exp((U)_mm_extract_epi16(a, 3)), + (U)std::exp((U)_mm_extract_epi16(a, 4)), + (U)std::exp((U)_mm_extract_epi16(a, 5)), + (U)std::exp((U)_mm_extract_epi16(a, 6)), + (U)std::exp((U)_mm_extract_epi16(a, 7))); +*/ + vector_base::result_value[0] = (U)std::exp(a[0]); + vector_base::result_value[1] = (U)std::exp(a[1]); + vector_base::result_value[2] = (U)std::exp(a[2]); + vector_base::result_value[3] = (U)std::exp(a[3]); + vector_base::result_value[4] = (U)std::exp(a[4]); + vector_base::result_value[5] = (U)std::exp(a[5]); + vector_base::result_value[6] = (U)std::exp(a[6]); + vector_base::result_value[7] = (U)std::exp(a[7]); + return vector_base::result_value; + } + + static 
vector_type sqrt(vector_type a) + { +/* + return _mm_setr_epi16((U)std::sqrt((U)_mm_extract_epi16(a, 0)), + (U)std::sqrt((U)_mm_extract_epi16(a, 1)), + (U)std::sqrt((U)_mm_extract_epi16(a, 2)), + (U)std::sqrt((U)_mm_extract_epi16(a, 3)), + (U)std::sqrt((U)_mm_extract_epi16(a, 4)), + (U)std::sqrt((U)_mm_extract_epi16(a, 5)), + (U)std::sqrt((U)_mm_extract_epi16(a, 6)), + (U)std::sqrt((U)_mm_extract_epi16(a, 7))); +*/ + vector_base::result_value[0] = (U)std::sqrt(a[0]); + vector_base::result_value[1] = (U)std::sqrt(a[1]); + vector_base::result_value[2] = (U)std::sqrt(a[2]); + vector_base::result_value[3] = (U)std::sqrt(a[3]); + vector_base::result_value[4] = (U)std::sqrt(a[4]); + vector_base::result_value[5] = (U)std::sqrt(a[5]); + vector_base::result_value[6] = (U)std::sqrt(a[6]); + vector_base::result_value[7] = (U)std::sqrt(a[7]); + return vector_base::result_value; + } +}; + + +template +struct vector_traits::value || + std::is_same::value>> +{ + constexpr static unsigned vector_width = 8; + constexpr static size_t alignment = 64; + + using vector_type = + std::conditional::value, + int32_t*, + uint32_t* + >; + + using value_type = typename vector_base::value_type; + + template static + detail::enable_if_t::value, float*> + convert(vector_type v) + { +// return _mm_cvtepi32_ps(v); + return (T*)v; + } + + template static + detail::enable_if_t::value, double*> + convert(vector_type v) + { + //return _mm_cvtepi32_pd(v); + return (T*)v; + } + + template static + detail::enable_if_t>::value, float*> + convert(vector_type v) + { + //return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, U* > + convert(vector_type v) + { +/* + U* i16 = std::is_signed::value ? _mm_packs_epi32(v, v) + : _mm_packus_epi32(v, v); + return std::is_signed::value ? 
_mm_packs_epi16(i16, i16)
+                                        : _mm_packus_epi16(i16, i16);
+*/
+        return (T*)v;
+    }
+
+    // The three overloads below were declared to return T (an integer) while
+    // returning a T*; the return type is now T*. The redundant
+    // `is_signed ? (T*)v : (T*)v` selections collapse to a single cast.
+    // NOTE(review): still pointer reinterpretation only - the SSE4.1
+    // reference packs (_mm_packs_epi32 / _mm_packus_epi32) or widens
+    // (_mm_cvtepi32_epi64 / _mm_cvtepu32_epi64) the lanes.
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int16_t>::value ||
+                        std::is_same<T,uint16_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int32_t>::value ||
+                        std::is_same<T,uint32_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    template <typename T> static
+    detail::enable_if_t<std::is_same<T,int64_t>::value ||
+                        std::is_same<T,uint64_t>::value, T* >
+    convert(vector_type v)
+    {
+        return (T*)v;
+    }
+
+    // Full-width loads return the source pointer itself, so the "vector"
+    // aliases the caller's memory.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4 && !Aligned, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_loadu_si128((ivector_type*)ptr);
+        return ptr;
+    }
+
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4 && Aligned, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_load_si128((ivector_type*)ptr);
+        return ptr;
+    }
+
+    // Width-2 load: reads only ptr[0..1] and repeats the pair in the upper
+    // half, as the reference broadcast of one 64-bit group does. Returning
+    // ptr, as before, let later operations read ptr[2..3] past the pair.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 2, vector_type>
+    load(const vector_type ptr)
+    {
+//        return _mm_set1_epi64x(*(int64_t*)ptr);
+        for (unsigned lane = 0; lane < 4; ++lane)
+            vector_base<U>::result_value[lane] = ptr[lane % 2];
+        return vector_base<U>::result_value;
+    }
+
+    // Broadcast load: every lane receives the single scalar *ptr. Returning
+    // ptr, as before, let later operations read ptr[1..3] past the scalar.
+    static vector_type load1(const vector_type ptr)
+    {
+//        return _mm_set1_epi32(*ptr);
+        return set1(*ptr);
+    }
+
+    // Broadcast a value into all 4 lanes of the static result buffer.
+    static vector_type set1(value_type val)
+    {
+//        return _mm_set1_epi32(val);
+        for (unsigned lane = 0; lane < 4; ++lane)
+            vector_base<U>::result_value[lane] = val;
+        return vector_base<U>::result_value;
+    }
+
+    // Full-width unaligned store of the 4 lanes of v to ptr.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4 && !Aligned>
+    store(vector_type v, vector_type ptr)
+    {
+//        _mm_storeu_si128((ivector_type*)ptr, v);
+        for (unsigned lane = 0; lane < 4; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    // Full-width aligned store; identical to the unaligned form here.
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 4 && Aligned>
+    store(vector_type v, vector_type ptr)
+    {
+//        _mm_store_si128((ivector_type*)ptr, v);
+        for (unsigned lane = 0; lane < 4; ++lane)
+            ptr[lane] = v[lane];
+    }
+
+    template <unsigned Width, bool Aligned> static
+    detail::enable_if_t<Width == 2>
+    store(vector_type v, vector_type 
ptr) + { +// _mm_storel_epi64((ivector_type*)ptr, v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + + static vector_type add(vector_type a, vector_type b) + { +// return _mm_add_epi32(a, b); + vector_base::result_value[0] = a[0] + b[0]; + vector_base::result_value[1] = a[1] + b[1]; + vector_base::result_value[2] = a[2] + b[2]; + vector_base::result_value[3] = a[3] + b[3]; + return vector_base::result_value; + } + + static vector_type sub(vector_type a, vector_type b) + { +// return _mm_sub_epi32(a, b); + vector_base::result_value[0] = a[0] - b[0]; + vector_base::result_value[1] = a[1] - b[1]; + vector_base::result_value[2] = a[2] - b[2]; + vector_base::result_value[3] = a[3] - b[3]; + return vector_base::result_value; + } + + static vector_type mul(vector_type a, vector_type b) + { +// return _mm_mullo_epi32(a, b); + vector_base::result_value[0] = a[0] * b[0]; + vector_base::result_value[1] = a[1] * b[1]; + vector_base::result_value[2] = a[2] * b[2]; + vector_base::result_value[3] = a[3] * b[3]; + return vector_base::result_value; + } + + static vector_type div(vector_type a, vector_type b) + { +/* + return _mm_setr_epi32((U)_mm_extract_epi32(a, 0) / + (U)_mm_extract_epi32(b, 0), + (U)_mm_extract_epi32(a, 1) / + (U)_mm_extract_epi32(b, 1), + (U)_mm_extract_epi32(a, 2) / + (U)_mm_extract_epi32(b, 2), + (U)_mm_extract_epi32(a, 3) / + (U)_mm_extract_epi32(b, 3)); +*/ + vector_base::result_value[0] = a[0] / b[0]; + vector_base::result_value[1] = a[1] / b[1]; + vector_base::result_value[2] = a[2] / b[2]; + vector_base::result_value[3] = a[3] / b[3]; + return vector_base::result_value; + } + + static vector_type pow(vector_type a, vector_type b) + { +/* + return _mm_setr_epi32((U)std::pow((U)_mm_extract_epi32(a, 0), + (U)_mm_extract_epi32(b, 0)), + (U)std::pow((U)_mm_extract_epi32(a, 1), + (U)_mm_extract_epi32(b, 1)), + (U)std::pow((U)_mm_extract_epi32(a, 2), + (U)_mm_extract_epi32(b, 2)), + (U)std::pow((U)_mm_extract_epi32(a, 3), + 
(U)_mm_extract_epi32(b, 3))); +*/ + vector_base::result_value[0] = std::pow(a[0], b[0]); + vector_base::result_value[1] = std::pow(a[1], b[1]); + vector_base::result_value[2] = std::pow(a[2], b[2]); + vector_base::result_value[3] = std::pow(a[3], b[3]); + return vector_base::result_value; + } + + static vector_type negate(vector_type a) + { +// return _mm_sub_epi32(_mm_setzero_si128(), a); + vector_base::result_value[0] = 0 - a[0]; + vector_base::result_value[1] = 0 - a[1]; + vector_base::result_value[2] = 0 - a[2]; + vector_base::result_value[3] = 0 - a[3]; + return vector_base::result_value; + } + + static vector_type exp(vector_type a) + { +/* + return _mm_setr_epi32((U)std::exp((U)_mm_extract_epi32(a, 0)), + (U)std::exp((U)_mm_extract_epi32(a, 1)), + (U)std::exp((U)_mm_extract_epi32(a, 2)), + (U)std::exp((U)_mm_extract_epi32(a, 3))); +*/ + vector_base::result_value[0] = std::exp(a[0]); + vector_base::result_value[1] = std::exp(a[1]); + vector_base::result_value[2] = std::exp(a[2]); + vector_base::result_value[3] = std::exp(a[3]); + return vector_base::result_value; + } + + static vector_type sqrt(vector_type a) + { +/* + return _mm_setr_epi32((U)std::sqrt((U)_mm_extract_epi32(a, 0)), + (U)std::sqrt((U)_mm_extract_epi32(a, 1)), + (U)std::sqrt((U)_mm_extract_epi32(a, 2)), + (U)std::sqrt((U)_mm_extract_epi32(a, 3))); +*/ + vector_base::result_value[0] = std::sqrt(a[0]); + vector_base::result_value[1] = std::sqrt(a[1]); + vector_base::result_value[2] = std::sqrt(a[2]); + vector_base::result_value[3] = std::sqrt(a[3]); + return vector_base::result_value; + } +}; + + +template +struct vector_traits::value || + std::is_same::value>> +{ + constexpr static unsigned vector_width = 4; + constexpr static size_t alignment = 64; + + using vector_type = + std::conditional::value, + int64_t*, + uint64_t* + >; + + using value_type = typename vector_base::value_type; + + template static + detail::enable_if_t::value, float* > + convert(vector_type v) + { +/* + float a = 
(U)_mm_extract_epi64(v, 0); + float b = (U)_mm_extract_epi64(v, 1); + return _mm_setr_ps(a, b, a, b); +*/ + return (float*)v; + } + + template static + detail::enable_if_t::value, double* > + convert(vector_type v) + { +/* + double a = (U)_mm_extract_epi64(v, 0); + double b = (U)_mm_extract_epi64(v, 1); + return _mm_setr_pd(a, b); +*/ + return (double*)v; + } + + template static + detail::enable_if_t>::value, float* > + convert(vector_type v) + { +// return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); + return (float*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { +/* + T a = (U)_mm_extract_epi64(v, 0); + T b = (U)_mm_extract_epi64(v, 1); + return _mm_setr_epi8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); +*/ + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { +/* + T a = (U)_mm_extract_epi64(v, 0); + T b = (U)_mm_extract_epi64(v, 1); + return _mm_setr_epi16(a, b, a, b, a, b, a, b); +*/ + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T* > + convert(vector_type v) + { +/* + T a = (U)_mm_extract_epi64(v, 0); + T b = (U)_mm_extract_epi64(v, 1); + return _mm_setr_epi32(a, b, a, b); +*/ + return (T*)v; + } + + template static + detail::enable_if_t::value || + std::is_same::value, T*> + convert(vector_type v) + { + return (T*)v; + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { +// return _mm_loadu_si128((ivector_type*)ptr); + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + return vector_base::result_value; + } + + template static + detail::enable_if_t + load(const vector_type ptr) + { +// return _mm_load_si128((ivector_type*)ptr); + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + return vector_base::result_value; + } + + static vector_type load1(const vector_type ptr) + { +// 
return _mm_set1_epi64x(*ptr); + vector_base::result_value[0] = ptr[0]; + vector_base::result_value[1] = ptr[1]; + return vector_base::result_value; + } + + static vector_type set1(value_type val) + { +// return _mm_set1_epi64x(val); + vector_base::result_value[0] = val; + vector_base::result_value[1] = val; + return vector_base::result_value; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { +// _mm_storeu_si128((ivector_type*)ptr, v); + ptr[0] = v[0]; + ptr[1] = v[1]; + } + + template static + detail::enable_if_t + store(vector_type v, vector_type ptr) + { +// _mm_store_si128((ivector_type*)ptr, v); + ptr[0] = v[0]; + ptr[1] = v[1]; + } + + static vector_type add( vector_type a, vector_type b ) + { +// return _mm_add_epi64(a, b); + vector_base::result_value[0] = ((U)a[0] + (U)b[0]); + vector_base::result_value[1] = ((U)a[1] + (U)b[1]); + return vector_base::result_value; + } + + static vector_type sub( vector_type a, vector_type b ) + { +// return _mm_sub_epi64(a, b); + + vector_base::result_value[0] = ((U)a[0] - (U)b[0]); + vector_base::result_value[1] = ((U)a[1] - (U)b[1]); + return vector_base::result_value; + } + + static vector_type mul( vector_type a, vector_type b ) + { +/* + return _mm_set_epi64x((U)_mm_extract_epi64(a, 1) * + (U)_mm_extract_epi64(b, 1), + (U)_mm_extract_epi64(a, 0) * + (U)_mm_extract_epi64(b, 0)); +*/ + vector_base::result_value[0] = ((U)a[0] * (U)b[0]); + vector_base::result_value[1] = ((U)a[1] * (U)b[1]); + return vector_base::result_value; + } + + static vector_type div( vector_type a, vector_type b ) + { +/* + return _mm_set_epi64x((U)_mm_extract_epi64(a, 1) / + (U)_mm_extract_epi64(b, 1), + (U)_mm_extract_epi64(a, 0) / + (U)_mm_extract_epi64(b, 0)); +*/ + vector_base::result_value[0] = ((U)a[0] / (U)b[0]); + vector_base::result_value[1] = ((U)a[1] / (U)b[1]); + return vector_base::result_value; + } + + static vector_type pow( vector_type a, vector_type b) + { +/* + return 
_mm_set_epi64x((U)std::pow((U)_mm_extract_epi64(a, 1), + (U)_mm_extract_epi64(b, 1)), + (U)std::pow((U)_mm_extract_epi64(a, 0), + (U)_mm_extract_epi64(b, 0))); +*/ + vector_base::result_value[0] = (U)std::pow((U)a[0], (U)b[0]); + vector_base::result_value[1] = (U)std::pow((U)a[1], (U)b[1]); + return vector_base::result_value; + } + + static vector_type negate( vector_type a ) + { + //return _mm_sub_epi64(_mm_setzero_si128(), a); + return ((U)0)-a; + } + + static vector_type exp(vector_type a) + { + //return _mm_set_epi64x((U)std::exp((U)_mm_extract_epi64(a, 1)), + // (U)std::exp((U)_mm_extract_epi64(a, 0))); + vector_base::result_value[0] = (U)std::exp(a[0]); + vector_base::result_value[1] = (U)std::exp(a[1]); + return vector_base::result_value; + } + + static vector_type sqrt(vector_type a) + { + //return _mm_set_epi64x((U)std::sqrt((U)_mm_extract_epi64(a, 1)), + // (U)std::sqrt((U)_mm_extract_epi64(a, 0))); + + vector_base::result_value[0] = (U)std::sqrt(a[0]); + vector_base::result_value[1] = (U)std::sqrt(a[1]); + return vector_base::result_value; + } +}; + +} + +#endif diff --git a/src/external/tci/tci/task_set.c b/src/external/tci/tci/task_set.c index 361ca8266..556f490fc 100644 --- a/src/external/tci/tci/task_set.c +++ b/src/external/tci/tci/task_set.c @@ -2,7 +2,11 @@ #include "task_set.h" #ifdef __cplusplus +#include + extern "C" { +#else +#include #endif #if TCI_USE_OPENMP_THREADS || TCI_USE_PTHREADS_THREADS || TCI_USE_WINDOWS_THREADS From 5dc2a4aa1ab17291e49e1be2e580ea56046449a8 Mon Sep 17 00:00:00 2001 From: "ct.clmsn" Date: Wed, 30 Oct 2024 23:00:37 -0400 Subject: [PATCH 2/2] rm'd comments from vector_generic; fixed several include and compiler issues revolving around simd compilation definitions --- src/external/marray/include/expression.hpp | 4 + .../marray/include/vector_generic.hpp | 424 ------------------ src/util/cpuid.cxx | 8 +- src/util/cpuid.hpp | 4 +- 4 files changed, 13 insertions(+), 427 deletions(-) diff --git 
a/src/external/marray/include/expression.hpp b/src/external/marray/include/expression.hpp index 2448c613e..170038ee7 100644 --- a/src/external/marray/include/expression.hpp +++ b/src/external/marray/include/expression.hpp @@ -1,8 +1,12 @@ #ifndef _MARRAY_EXPRESSION_HPP_ #define _MARRAY_EXPRESSION_HPP_ +#if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE4_1__) + #include +#endif + #include "utility.hpp" #include "vector.hpp" diff --git a/src/external/marray/include/vector_generic.hpp b/src/external/marray/include/vector_generic.hpp index c89b23bfa..2dcbfbb27 100644 --- a/src/external/marray/include/vector_generic.hpp +++ b/src/external/marray/include/vector_generic.hpp @@ -86,9 +86,6 @@ struct vector_traits detail::enable_if_t::value, int8_t*> convert(float* v) { - //ivector_type i32 = _mm_cvtps_epi32(v); - //ivector_type i16 = _mm_packs_epi32(i32, i32); - //return _mm_packs_epi16(i16, i16); return (int8_t*)v; } @@ -190,7 +187,6 @@ struct vector_traits detail::enable_if_t store(float* v, float* ptr) { - //_mm_storeu_ps(ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -201,7 +197,6 @@ struct vector_traits detail::enable_if_t store(float * v, float* ptr) { - //_mm_store_ps(ptr, v); std::copy(v, v+4, ptr); } @@ -209,13 +204,11 @@ struct vector_traits detail::enable_if_t store(double * v, float* ptr) { - //_mm_store_sd((double*)ptr, _mm_castps_pd(v)); std::copy(v, v+1, (double*)ptr); } static float* add(float* a, float* b) { - //return _mm_add_ps(a, b); vector_base::result_value[0] = a[0] + b[0]; vector_base::result_value[1] = a[1] + b[1]; vector_base::result_value[2] = a[2] + b[2]; @@ -225,7 +218,6 @@ struct vector_traits static float* sub(float* a, float* b) { - //return _mm_sub_ps(a, b); vector_base::result_value[0] = a[0] - b[0]; vector_base::result_value[1] = a[1] - b[1]; vector_base::result_value[2] = a[2] - b[2]; @@ -235,7 +227,6 @@ struct vector_traits static float* mul(float* a, float* b) { - //return _mm_mul_ps(a, b); 
vector_base::result_value[0] = a[0] * b[0]; vector_base::result_value[1] = a[1] * b[1]; vector_base::result_value[2] = a[2] * b[2]; @@ -245,7 +236,6 @@ struct vector_traits static float* div(float* a, float* b) { - //return _mm_div_ps(a, b); vector_base::result_value[0] = a[0] / b[0]; vector_base::result_value[1] = a[1] / b[1]; vector_base::result_value[2] = a[2] / b[2]; @@ -255,10 +245,6 @@ struct vector_traits static float* pow(float* a, float* b) { - //return _mm_setr_ps(std::pow((float)a[0], (float)b[0]), - // std::pow((float)a[1], (float)b[1]), - // std::pow((float)a[2], (float)b[2]), - // std::pow((float)a[3], (float)b[3])); vector_base::result_value[0] = std::pow(a[0] , b[0]); vector_base::result_value[1] = std::pow(a[1] , b[1]); vector_base::result_value[2] = std::pow(a[2] , b[2]); @@ -268,7 +254,6 @@ struct vector_traits static float* negate(float* a) { - //return _mm_xor_ps(a, _mm_set1_ps(-0.0f)); vector_base::result_value[0] = -a[0]; vector_base::result_value[1] = -a[1]; vector_base::result_value[2] = -a[2]; @@ -278,10 +263,6 @@ struct vector_traits static float* exp(float* a) { - //return _mm_setr_ps(std::exp((float)a[0]), - // std::exp((float)a[1]), - // std::exp((float)a[2]), - // std::exp((float)a[3])); vector_base::result_value[0] = std::exp(a[0]); vector_base::result_value[1] = std::exp(a[1]); vector_base::result_value[2] = std::exp(a[2]); @@ -291,7 +272,6 @@ struct vector_traits static float* sqrt(float* a) { - //return _mm_sqrt_ps(a); vector_base::result_value[0] = std::sqrt(a[0]); vector_base::result_value[1] = std::sqrt(a[1]); vector_base::result_value[2] = std::sqrt(a[2]); @@ -312,7 +292,6 @@ struct vector_traits detail::enable_if_t::value, float*> convert(double* v) { - //return _mm_cvtpd_ps(v); return (float*)v; } @@ -327,7 +306,6 @@ struct vector_traits detail::enable_if_t>::value, float*> convert(double* v) { - //return _mm_unpacklo_ps(_mm_cvtpd_ps(v), _mm_setzero_ps()); return (float*)v; } @@ -335,9 +313,6 @@ struct vector_traits 
detail::enable_if_t::value, int8_t*> convert(double* v) { - //ivector_type i32 = _mm_cvtpd_epi32(v); - //ivector_type i16 = _mm_packs_epi32(i32, i32); - //return _mm_packs_epi16(i16, i16); return (int8_t*)v; } @@ -345,9 +320,6 @@ struct vector_traits detail::enable_if_t::value, uint8_t*> convert(double* v) { - //ivector_type i32 = _mm_cvtpd_epi32(v); - //ivector_type i16 = _mm_packus_epi32(i32, i32); - //return _mm_packus_epi16(i16, i16); return (uint8_t*)v; } @@ -355,8 +327,6 @@ struct vector_traits detail::enable_if_t::value, int16_t*> convert(double* v) { - //ivector_type i32 = _mm_cvtpd_epi32(v); - //return _mm_packs_epi32(i32, i32); return (int16_t*)v; } @@ -364,8 +334,6 @@ struct vector_traits detail::enable_if_t::value, uint16_t*> convert(double* v) { - //ivector_type i32 = _mm_cvtpd_epi32(v); - //return _mm_packus_epi32(i32, i32); return (uint16_t*)v; } @@ -373,7 +341,6 @@ struct vector_traits detail::enable_if_t::value, int32_t*> convert(double* v) { - //return _mm_cvtpd_epi32(v); return (int32_t*)v; } @@ -381,8 +348,6 @@ struct vector_traits detail::enable_if_t::value, uint32_t*> convert(double* v) { - //return _mm_setr_epi32((uint32_t)v[0], (uint32_t)v[1], - // (uint32_t)v[0], (uint32_t)v[1]); return (uint32_t*)v; } @@ -391,7 +356,6 @@ struct vector_traits std::is_same::value, T*> convert(double* v) { - //return _mm_set_epi64x((T)v[1], (T)v[0]); return (T*)v; } @@ -399,7 +363,6 @@ struct vector_traits detail::enable_if_t load(const double* ptr) { - //return _mm_loadu_pd(ptr); return const_cast(ptr); } @@ -407,19 +370,16 @@ struct vector_traits detail::enable_if_t load(const double* ptr) { - //return _mm_load_pd(ptr); return const_cast(ptr); } static double * load1(const double* ptr) { - //return _mm_load1_pd(ptr); return const_cast(ptr); } static double* set1(double val) { - //return _mm_set1_pd(val); vector_base::result_value[0] = val; vector_base::result_value[1] = val; return vector_base::result_value; @@ -429,7 +389,6 @@ struct vector_traits 
detail::enable_if_t store(double* v, double* ptr) { - //_mm_storeu_pd(ptr, v); std::copy(v, v+2, ptr); } @@ -437,13 +396,11 @@ struct vector_traits detail::enable_if_t store(double* v, double* ptr) { - //_mm_store_pd(ptr, v); std::copy(v, v+2, ptr); } static double* add(double* a, double* b) { - //return _mm_add_pd(a, b); vector_base::result_value[0] = a[0] + b[0]; vector_base::result_value[1] = a[1] + b[1]; return vector_base::result_value; @@ -451,7 +408,6 @@ struct vector_traits static double* sub(double* a, double* b) { - //return _mm_sub_pd(a, b); vector_base::result_value[0] = a[0] - b[0]; vector_base::result_value[1] = a[1] - b[1]; return vector_base::result_value; @@ -459,7 +415,6 @@ struct vector_traits static double* mul(double* a, double* b) { - //return _mm_mul_pd(a, b); vector_base::result_value[0] = a[0] * b[0]; vector_base::result_value[1] = a[1] * b[1]; return vector_base::result_value; @@ -467,7 +422,6 @@ struct vector_traits static double* div(double* a, double* b) { - //return _mm_div_pd(a, b); vector_base::result_value[0] = a[0] / b[0]; vector_base::result_value[1] = a[1] / b[1]; return vector_base::result_value; @@ -475,8 +429,6 @@ struct vector_traits static double* pow(double* a, double* b) { - //return _mm_setr_pd(std::pow((double)a[0], (double)b[0]), - // std::pow((double)a[1], (double)b[1])); vector_base::result_value[0] = std::pow(a[0] , b[0]); vector_base::result_value[1] = std::pow(a[1] , b[1]); return vector_base::result_value; @@ -484,7 +436,6 @@ struct vector_traits static double* negate(double* a) { - //return _mm_xor_pd(a, _mm_set1_pd(-0.0)); vector_base::result_value[0] = -a[0]; vector_base::result_value[1] = -a[1]; return vector_base::result_value; @@ -492,8 +443,6 @@ struct vector_traits static double* exp(double* a) { - //return _mm_setr_pd(std::exp((double)a[0]), - // std::exp((double)a[1])); vector_base::result_value[0] = std::exp(a[0]); vector_base::result_value[1] = std::exp(a[1]); return vector_base::result_value; @@ 
-501,7 +450,6 @@ struct vector_traits static double* sqrt(double* a) { - //return _mm_sqrt_pd(a); vector_base::result_value[0] = std::sqrt(a[0]); vector_base::result_value[1] = std::sqrt(a[1]); return vector_base::result_value; @@ -520,7 +468,6 @@ struct vector_traits> detail::enable_if_t::value, float* > convert(float* v) { - //return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,0,2,0)); return v; } @@ -528,7 +475,6 @@ struct vector_traits> detail::enable_if_t::value, double*> convert(float* v) { - //return _mm_cvtps_pd(convert(v)); return (double*)v; } @@ -543,9 +489,6 @@ struct vector_traits> detail::enable_if_t::value, int8_t* > convert(float* v) { - //ivector_type i32 = _mm_cvtps_epi32(convert(v)); - //ivector_type i16 = _mm_packs_epi32(i32, i32); - //return _mm_packs_epi16(i16, i16); return (int8_t*)v; } @@ -553,9 +496,6 @@ struct vector_traits> detail::enable_if_t::value, uint8_t* > convert(float* v) { - //ivector_type i32 = _mm_cvtps_epi32(convert(v)); - //ivector_type i16 = _mm_packus_epi32(i32, i32); - //return _mm_packus_epi16(i16, i16); return (uint8_t*)v; } @@ -563,8 +503,6 @@ struct vector_traits> detail::enable_if_t::value, int16_t* > convert(float* v) { - //ivector_type i32 = _mm_cvtps_epi32(convert(v)); - //return _mm_packs_epi32(i32, i32); return (int16_t*)v; } @@ -572,8 +510,6 @@ struct vector_traits> detail::enable_if_t::value, uint16_t* > convert(float* v) { - //ivector_type i32 = _mm_cvtps_epi32(convert(v)); - //return _mm_packus_epi32(i32, i32); return (uint16_t*)v; } @@ -581,7 +517,6 @@ struct vector_traits> detail::enable_if_t::value, int32_t* > convert(float* v) { - //return _mm_cvtps_epi32(convert(v)); return (uint32_t*)v; } @@ -589,8 +524,6 @@ struct vector_traits> detail::enable_if_t::value, uint32_t* > convert(float* v) { - //return _mm_setr_epi32((uint32_t)v[0], (uint32_t)v[2], - // (uint32_t)v[0], (uint32_t)v[2]); return (uint32_t*)v; } @@ -599,7 +532,6 @@ struct vector_traits> std::is_same::value, int64_t* > convert(float* v) { - //return 
_mm_set_epi64x((T)v[2], (T)v[0]); return (int64_t*)v; } @@ -607,7 +539,6 @@ struct vector_traits> detail::enable_if_t load(const std::complex* ptr) { - //return _mm_loadu_ps((float*)ptr); vector_base::result_value[0] = ptr->real(); vector_base::result_value[1] = ptr->imag(); return vector_base::result_value; @@ -617,7 +548,6 @@ struct vector_traits> detail::enable_if_t load(const std::complex* ptr) { - //return _mm_load_ps((float*)ptr); vector_base::result_value[0] = ptr->real(); vector_base::result_value[1] = ptr->imag(); return vector_base::result_value; @@ -625,7 +555,6 @@ struct vector_traits> static float* load1(const std::complex* ptr) { - //return _mm_castpd_ps(_mm_load1_pd((double*)ptr)); vector_base::result_value[0] = ptr->real(); vector_base::result_value[1] = ptr->imag(); return vector_base::result_value; @@ -633,7 +562,6 @@ struct vector_traits> static float* set1(std::complex val) { - //return _mm_castpd_ps(_mm_set1_pd(*(double*)&val)); vector_base::result_value[0] = val.real(); vector_base::result_value[1] = val.imag(); return vector_base::result_value; @@ -643,7 +571,6 @@ struct vector_traits> detail::enable_if_t store(float* v, std::complex* ptr) { - //_mm_storeu_ps((float*)ptr, v); (*ptr) = {v[0], v[1]}; } @@ -651,13 +578,11 @@ struct vector_traits> detail::enable_if_t store(float* v, std::complex* ptr) { - //_mm_store_ps((float*)ptr, v); (*ptr) = {v[0], v[1]}; } static float* add(float* a, float* b) { - //return _mm_add_ps(a, b); vector_base::result_value[0] = a[0] + b[0]; vector_base::result_value[1] = a[1] + b[1]; vector_base::result_value[2] = a[2] + b[2]; @@ -667,7 +592,6 @@ struct vector_traits> static float* sub(float* a, float* b) { - //return _mm_sub_ps(a, b); vector_base::result_value[0] = a[0] - b[0]; vector_base::result_value[1] = a[1] - b[1]; vector_base::result_value[2] = a[2] - b[2]; @@ -677,16 +601,6 @@ struct vector_traits> static float* mul(float* a, float* b) { -/* - float* ashuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,3,0,1)); - 
float* breal = _mm_moveldup_ps(b); - float* bimag = _mm_movehdup_ps(b); - float* tmp1 = _mm_mul_ps( a, breal); // tmp1 = (ar0*br0, ai0*br0, ar1*br1, ai1*br1) - float* tmp2 = _mm_mul_ps(ashuf, bimag); // tmp2 = (ai0*bi0, ar0*bi0, ai1*bi1, ar1*bi1) - return _mm_addsub_ps(tmp1, tmp2); -*/ - //(a + ib) (c + id) = (ac - bd) + i(ad + bc). - vector_base::result_value[0] = a[0]*b[0] + a[1]*b[1]; vector_base::result_value[1] = a[0]*b[1] + a[1]*b[0]; @@ -698,21 +612,6 @@ struct vector_traits> static float* div(float* a, float* b) { -/* - float* bsqr = _mm_mul_ps(b, b); - bsqr = _mm_hadd_ps(bsqr, bsqr); - bsqr = _mm_shuffle_ps(bsqr, bsqr, _MM_SHUFFLE(3,1,2,0)); // bsqr = (|b0|^2, |b0|^2, |b1|^2, |b1|^2) - - float* ashuf = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,3,0,1)); - float* breal = _mm_moveldup_ps(b); - float* bimag = _mm_movehdup_ps(b); - float* tmp1 = _mm_mul_ps( a, breal); // tmp1 = ( ar0*br0, ai0*br0, ar1*br1, ai1*br1) - float* tmp2 = _mm_mul_ps(ashuf, bimag); - tmp2 = _mm_xor_ps(tmp2, _mm_set1_ps(-0.0f)); // tmp2 = (-ai0*bi0, -ar0*bi0, -ai1*bi1, -ar1*bi1) - float* abconj = _mm_addsub_ps(tmp1, tmp2); - - return _mm_div_ps(abconj, bsqr); -*/ const float denom_a = b[0] * b[0] + a[1] * a[1]; vector_base::result_value[0] = (a[0] + b[0]) + (a[1] * b[1]) / denom_a; vector_base::result_value[1] = (a[1] * b[0]) - (a[0] * b[1]) / denom_a; @@ -794,7 +693,6 @@ struct vector_traits::value || detail::enable_if_t::value, float* > convert(vector_type v) { - //return _mm_cvtepi32_ps(convert(v)); return (float*)v; } @@ -802,7 +700,6 @@ struct vector_traits::value || detail::enable_if_t::value, double* > convert(vector_type v) { - //return _mm_cvtepi32_pd(convert(v)); return (double*)v; } @@ -810,7 +707,6 @@ struct vector_traits::value || detail::enable_if_t>::value, float* > convert(vector_type v) { - //return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); return (float*)v; } @@ -835,10 +731,6 @@ struct vector_traits::value || std::is_same::value, T * > convert(vector_type v) { -/* - 
return std::is_signed::value ? (int32_t*)v - : (uint32_t*)v; -*/ return (T*)v; } @@ -952,8 +844,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { - //*(int32_t*)ptr = _mm_extract_epi32(v, 0); - // dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] (*((value_type*)ptr)) = v[3] << 24 | v[2] << 16 | v[1] << 8 | v[0]; } @@ -961,15 +851,11 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { - //*(int16_t*)ptr = _mm_extract_epi16(v, 0); - // (a[127:0] >> (imm8[2:0] * 16))[15:0] - // (*((value_type*)ptr)) = (v[1] << 8) | v[0]; } static vector_type add(vector_type a, vector_type b) { - //return _mm_add_epi8(a, b); vector_base::result_value[0] = a[0] + b[0]; vector_base::result_value[1] = a[1] + b[1]; vector_base::result_value[2] = a[2] + b[2]; @@ -991,7 +877,6 @@ struct vector_traits::value || static vector_type sub(vector_type a, vector_type b) { - //return _mm_sub_epi8(a, b); vector_base::result_value[0] = a[0] - b[0]; vector_base::result_value[1] = a[1] - b[1]; vector_base::result_value[2] = a[2] - b[2]; @@ -1013,9 +898,6 @@ struct vector_traits::value || static vector_type mul(vector_type a, vector_type b) { - //ivector_type lo = _mm_and_si128(_mm_mullo_epi16(a, b), _mm_set1_epi16(0xff)); - //ivector_type hi = _mm_mullo_epi16(_mm_srli_epi16(a, 8),_mm_srli_epi16(b, 8)); - //return _mm_or_si128(_mm_slli_epi16(hi, 8), lo); vector_base::result_value[0] = a[0] * b[0]; vector_base::result_value[1] = a[1] * b[1]; vector_base::result_value[2] = a[2] * b[2]; @@ -1037,40 +919,6 @@ struct vector_traits::value || static vector_type div(vector_type a, vector_type b) { -/* - return _mm_setr_epi8((U)_mm_extract_epi8(a, 0) / - (U)_mm_extract_epi8(b, 0), - (U)_mm_extract_epi8(a, 1) / - (U)_mm_extract_epi8(b, 1), - (U)_mm_extract_epi8(a, 2) / - (U)_mm_extract_epi8(b, 2), - (U)_mm_extract_epi8(a, 3) / - (U)_mm_extract_epi8(b, 3), - (U)_mm_extract_epi8(a, 4) / - (U)_mm_extract_epi8(b, 4), - 
(U)_mm_extract_epi8(a, 5) / - (U)_mm_extract_epi8(b, 5), - (U)_mm_extract_epi8(a, 6) / - (U)_mm_extract_epi8(b, 6), - (U)_mm_extract_epi8(a, 7) / - (U)_mm_extract_epi8(b, 7), - (U)_mm_extract_epi8(a, 8) / - (U)_mm_extract_epi8(b, 8), - (U)_mm_extract_epi8(a, 9) / - (U)_mm_extract_epi8(b, 9), - (U)_mm_extract_epi8(a,10) / - (U)_mm_extract_epi8(b,10), - (U)_mm_extract_epi8(a,11) / - (U)_mm_extract_epi8(b,11), - (U)_mm_extract_epi8(a,12) / - (U)_mm_extract_epi8(b,12), - (U)_mm_extract_epi8(a,13) / - (U)_mm_extract_epi8(b,13), - (U)_mm_extract_epi8(a,14) / - (U)_mm_extract_epi8(b,14), - (U)_mm_extract_epi8(a,15) / - (U)_mm_extract_epi8(b,15)); -*/ vector_base::result_value[0] = a[0] / b[0]; vector_base::result_value[1] = a[1] / b[1]; vector_base::result_value[2] = a[2] / b[2]; @@ -1092,40 +940,6 @@ struct vector_traits::value || static vector_type pow(vector_type a, vector_type b) { -/* - return _mm_setr_epi8((U)std::pow((U)_mm_extract_epi8(a, 0), - (U)_mm_extract_epi8(b, 0)), - (U)std::pow((U)_mm_extract_epi8(a, 1), - (U)_mm_extract_epi8(b, 1)), - (U)std::pow((U)_mm_extract_epi8(a, 2), - (U)_mm_extract_epi8(b, 2)), - (U)std::pow((U)_mm_extract_epi8(a, 3), - (U)_mm_extract_epi8(b, 3)), - (U)std::pow((U)_mm_extract_epi8(a, 4), - (U)_mm_extract_epi8(b, 4)), - (U)std::pow((U)_mm_extract_epi8(a, 5), - (U)_mm_extract_epi8(b, 5)), - (U)std::pow((U)_mm_extract_epi8(a, 6), - (U)_mm_extract_epi8(b, 6)), - (U)std::pow((U)_mm_extract_epi8(a, 7), - (U)_mm_extract_epi8(b, 7)), - (U)std::pow((U)_mm_extract_epi8(a, 8), - (U)_mm_extract_epi8(b, 8)), - (U)std::pow((U)_mm_extract_epi8(a, 9), - (U)_mm_extract_epi8(b, 9)), - (U)std::pow((U)_mm_extract_epi8(a,10), - (U)_mm_extract_epi8(b,10)), - (U)std::pow((U)_mm_extract_epi8(a,11), - (U)_mm_extract_epi8(b,11)), - (U)std::pow((U)_mm_extract_epi8(a,12), - (U)_mm_extract_epi8(b,12)), - (U)std::pow((U)_mm_extract_epi8(a,13), - (U)_mm_extract_epi8(b,13)), - (U)std::pow((U)_mm_extract_epi8(a,14), - (U)_mm_extract_epi8(b,14)), - 
(U)std::pow((U)_mm_extract_epi8(a,15), - (U)_mm_extract_epi8(b,15))); -*/ vector_base::result_value[0] = (U)std::pow(a[0], b[0]); vector_base::result_value[1] = (U)std::pow(a[1], b[1]); vector_base::result_value[2] = (U)std::pow(a[2], b[2]); @@ -1148,7 +962,6 @@ struct vector_traits::value || static vector_type negate(vector_type a) { - //return _mm_sub_epi8(_mm_setzero_si128(), a); vector_base::result_value[0] = -(a[0]); vector_base::result_value[1] = -(a[1]); vector_base::result_value[2] = -(a[2]); @@ -1171,24 +984,6 @@ struct vector_traits::value || static vector_type exp(vector_type a) { -/* - return _mm_setr_epi8((U)std::exp((U)_mm_extract_epi8(a, 0)), - (U)std::exp((U)_mm_extract_epi8(a, 1)), - (U)std::exp((U)_mm_extract_epi8(a, 2)), - (U)std::exp((U)_mm_extract_epi8(a, 3)), - (U)std::exp((U)_mm_extract_epi8(a, 4)), - (U)std::exp((U)_mm_extract_epi8(a, 5)), - (U)std::exp((U)_mm_extract_epi8(a, 6)), - (U)std::exp((U)_mm_extract_epi8(a, 7)), - (U)std::exp((U)_mm_extract_epi8(a, 8)), - (U)std::exp((U)_mm_extract_epi8(a, 9)), - (U)std::exp((U)_mm_extract_epi8(a,10)), - (U)std::exp((U)_mm_extract_epi8(a,11)), - (U)std::exp((U)_mm_extract_epi8(a,12)), - (U)std::exp((U)_mm_extract_epi8(a,13)), - (U)std::exp((U)_mm_extract_epi8(a,14)), - (U)std::exp((U)_mm_extract_epi8(a,15))); -*/ vector_base::result_value[0] = (U)std::exp(a[0]); vector_base::result_value[1] = (U)std::exp(a[1]); vector_base::result_value[2] = (U)std::exp(a[2]); @@ -1211,24 +1006,6 @@ struct vector_traits::value || static U* sqrt(U* a) { -/* - return _mm_setr_epi8((U)std::sqrt((U)_mm_extract_epi8(a, 0)), - (U)std::sqrt((U)_mm_extract_epi8(a, 1)), - (U)std::sqrt((U)_mm_extract_epi8(a, 2)), - (U)std::sqrt((U)_mm_extract_epi8(a, 3)), - (U)std::sqrt((U)_mm_extract_epi8(a, 4)), - (U)std::sqrt((U)_mm_extract_epi8(a, 5)), - (U)std::sqrt((U)_mm_extract_epi8(a, 6)), - (U)std::sqrt((U)_mm_extract_epi8(a, 7)), - (U)std::sqrt((U)_mm_extract_epi8(a, 8)), - (U)std::sqrt((U)_mm_extract_epi8(a, 9)), - 
(U)std::sqrt((U)_mm_extract_epi8(a,10)), - (U)std::sqrt((U)_mm_extract_epi8(a,11)), - (U)std::sqrt((U)_mm_extract_epi8(a,12)), - (U)std::sqrt((U)_mm_extract_epi8(a,13)), - (U)std::sqrt((U)_mm_extract_epi8(a,14)), - (U)std::sqrt((U)_mm_extract_epi8(a,15))); -*/ vector_base::result_value[0] = (U)std::sqrt(a[0]); vector_base::result_value[1] = (U)std::sqrt(a[1]); vector_base::result_value[2] = (U)std::sqrt(a[2]); @@ -1270,7 +1047,6 @@ struct vector_traits::value || detail::enable_if_t::value, float*> convert(vector_type v) { -// return _mm_cvtepi32_ps(convert(v)); return (T*)v; } @@ -1278,7 +1054,6 @@ struct vector_traits::value || detail::enable_if_t::value, double*> convert(vector_type v) { -// return _mm_cvtepi32_pd(convert(v)); return (T*)v; } @@ -1286,7 +1061,6 @@ struct vector_traits::value || detail::enable_if_t>::value, float*> convert(vector_type v) { -// return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); return (T*)v; } @@ -1295,10 +1069,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - return std::is_signed::value ? _mm_packs_epi16(v, v) - : _mm_packus_epi16(v, v); -*/ return (T*)v; } @@ -1315,10 +1085,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - return std::is_signed::value ? _mm_cvtepi16_epi32(v) - : _mm_cvtepu16_epi32(v); -*/ return (T*)v; } @@ -1327,10 +1093,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - return std::is_signed::value ? 
_mm_cvtepi16_epi64(v) - : _mm_cvtepu16_epi64(v); -*/ return (T*)v; } @@ -1338,7 +1100,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_loadu_si128((ivector_type*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; vector_base::result_value[2] = ptr[2]; @@ -1354,7 +1115,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_load_si128((ivector_type*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; vector_base::result_value[2] = ptr[2]; @@ -1370,7 +1130,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_set1_epi64x(*(int64_t*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; vector_base::result_value[2] = ptr[2]; @@ -1386,7 +1145,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_set1_epi32(*(int32_t*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; vector_base::result_value[2] = ptr[2]; @@ -1400,7 +1158,6 @@ struct vector_traits::value || static vector_type load1(const vector_type ptr) { -// return _mm_set1_epi16(*ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; vector_base::result_value[2] = ptr[2]; @@ -1414,7 +1171,6 @@ struct vector_traits::value || static vector_type set1(value_type val) { -// return _mm_set1_epi16(val); vector_base::result_value[0] = val; vector_base::result_value[1] = val; vector_base::result_value[2] = val; @@ -1429,7 +1185,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_storeu_si128((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1444,7 +1199,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_store_si128((ivector_type*)ptr, v); ptr[0] = v[0]; 
ptr[1] = v[1]; ptr[2] = v[2]; @@ -1459,7 +1213,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_storel_epi64((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1474,7 +1227,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// *(int32_t*)ptr = _mm_extract_epi32(v, 0); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1487,7 +1239,6 @@ struct vector_traits::value || static vector_type add(vector_type a, vector_type b) { -// return _mm_add_epi16(a, b); vector_base::result_value[0] = (U)(a[0]) + (U)(b[0]); vector_base::result_value[1] = (U)(a[1]) + (U)(b[1]); vector_base::result_value[2] = (U)(a[2]) + (U)(b[2]); @@ -1501,7 +1252,6 @@ struct vector_traits::value || static vector_type sub(vector_type a, vector_type b) { -// return _mm_sub_epi16(a, b); vector_base::result_value[0] = (U)(a[0]) - (U)(b[0]); vector_base::result_value[1] = (U)(a[1]) - (U)(b[1]); vector_base::result_value[2] = (U)(a[2]) - (U)(b[2]); @@ -1515,7 +1265,6 @@ struct vector_traits::value || static vector_type mul(vector_type a, vector_type b) { -// return _mm_mullo_epi16(a, b); vector_base::result_value[0] = (U)(a[0]) * (U)(b[0]); vector_base::result_value[1] = (U)(a[1]) * (U)(b[1]); vector_base::result_value[2] = (U)(a[2]) * (U)(b[2]); @@ -1529,24 +1278,6 @@ struct vector_traits::value || static vector_type div(vector_type a, vector_type b) { -/* - return _mm_setr_epi16((U)_mm_extract_epi16(a, 0) / - (U)_mm_extract_epi16(b, 0), - (U)_mm_extract_epi16(a, 1) / - (U)_mm_extract_epi16(b, 1), - (U)_mm_extract_epi16(a, 2) / - (U)_mm_extract_epi16(b, 2), - (U)_mm_extract_epi16(a, 3) / - (U)_mm_extract_epi16(b, 3), - (U)_mm_extract_epi16(a, 4) / - (U)_mm_extract_epi16(b, 4), - (U)_mm_extract_epi16(a, 5) / - (U)_mm_extract_epi16(b, 5), - (U)_mm_extract_epi16(a, 6) / - (U)_mm_extract_epi16(b, 6), - (U)_mm_extract_epi16(a, 7) / - (U)_mm_extract_epi16(b, 7)); -*/ 
vector_base::result_value[0] = (U)(a[0]) / (U)(b[0]); vector_base::result_value[1] = (U)(a[1]) / (U)(b[1]); vector_base::result_value[2] = (U)(a[2]) / (U)(b[2]); @@ -1560,25 +1291,6 @@ struct vector_traits::value || static vector_type pow(vector_type a, vector_type b) { -/* - return _mm_setr_epi16((U)std::pow((U)_mm_extract_epi16(a, 0), - (U)_mm_extract_epi16(b, 0)), - (U)std::pow((U)_mm_extract_epi16(a, 1), - (U)_mm_extract_epi16(b, 1)), - (U)std::pow((U)_mm_extract_epi16(a, 2), - (U)_mm_extract_epi16(b, 2)), - (U)std::pow((U)_mm_extract_epi16(a, 3), - (U)_mm_extract_epi16(b, 3)), - (U)std::pow((U)_mm_extract_epi16(a, 4), - (U)_mm_extract_epi16(b, 4)), - (U)std::pow((U)_mm_extract_epi16(a, 5), - (U)_mm_extract_epi16(b, 5)), - (U)std::pow((U)_mm_extract_epi16(a, 6), - (U)_mm_extract_epi16(b, 6)), - (U)std::pow((U)_mm_extract_epi16(a, 7), - (U)_mm_extract_epi16(b, 7))); - -*/ vector_base::result_value[0] = (U)std::pow(a[0], b[0]); vector_base::result_value[1] = (U)std::pow(a[1], b[1]); vector_base::result_value[2] = (U)std::pow(a[2], b[2]); @@ -1592,7 +1304,6 @@ struct vector_traits::value || static vector_type negate(vector_type a) { -// return _mm_sub_epi16(_mm_setzero_si128(), a); vector_base::result_value[0] = (U)(0) - (U)(a[0]); vector_base::result_value[1] = (U)(0) - (U)(a[1]); vector_base::result_value[2] = (U)(0) - (U)(a[2]); @@ -1606,16 +1317,6 @@ struct vector_traits::value || static vector_type exp(vector_type a) { -/* - return _mm_setr_epi16((U)std::exp((U)_mm_extract_epi16(a, 0)), - (U)std::exp((U)_mm_extract_epi16(a, 1)), - (U)std::exp((U)_mm_extract_epi16(a, 2)), - (U)std::exp((U)_mm_extract_epi16(a, 3)), - (U)std::exp((U)_mm_extract_epi16(a, 4)), - (U)std::exp((U)_mm_extract_epi16(a, 5)), - (U)std::exp((U)_mm_extract_epi16(a, 6)), - (U)std::exp((U)_mm_extract_epi16(a, 7))); -*/ vector_base::result_value[0] = (U)std::exp(a[0]); vector_base::result_value[1] = (U)std::exp(a[1]); vector_base::result_value[2] = (U)std::exp(a[2]); @@ -1629,16 +1330,6 @@ 
struct vector_traits::value || static vector_type sqrt(vector_type a) { -/* - return _mm_setr_epi16((U)std::sqrt((U)_mm_extract_epi16(a, 0)), - (U)std::sqrt((U)_mm_extract_epi16(a, 1)), - (U)std::sqrt((U)_mm_extract_epi16(a, 2)), - (U)std::sqrt((U)_mm_extract_epi16(a, 3)), - (U)std::sqrt((U)_mm_extract_epi16(a, 4)), - (U)std::sqrt((U)_mm_extract_epi16(a, 5)), - (U)std::sqrt((U)_mm_extract_epi16(a, 6)), - (U)std::sqrt((U)_mm_extract_epi16(a, 7))); -*/ vector_base::result_value[0] = (U)std::sqrt(a[0]); vector_base::result_value[1] = (U)std::sqrt(a[1]); vector_base::result_value[2] = (U)std::sqrt(a[2]); @@ -1671,7 +1362,6 @@ struct vector_traits::value || detail::enable_if_t::value, float*> convert(vector_type v) { -// return _mm_cvtepi32_ps(v); return (T*)v; } @@ -1679,7 +1369,6 @@ struct vector_traits::value || detail::enable_if_t::value, double*> convert(vector_type v) { - //return _mm_cvtepi32_pd(v); return (T*)v; } @@ -1687,7 +1376,6 @@ struct vector_traits::value || detail::enable_if_t>::value, float*> convert(vector_type v) { - //return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); return (T*)v; } @@ -1696,12 +1384,6 @@ struct vector_traits::value || std::is_same::value, U* > convert(vector_type v) { -/* - U* i16 = std::is_signed::value ? _mm_packs_epi32(v, v) - : _mm_packus_epi32(v, v); - return std::is_signed::value ? _mm_packs_epi16(i16, i16) - : _mm_packus_epi16(i16, i16); -*/ return (T*)v; } @@ -1710,10 +1392,6 @@ struct vector_traits::value || std::is_same::value, T > convert(vector_type v) { -/* - return std::is_signed::value ? _mm_packs_epi32(v, v) - : _mm_packus_epi32(v, v); -*/ return std::is_signed::value ? (T*)v : (T*)v; } @@ -1731,10 +1409,6 @@ struct vector_traits::value || std::is_same::value, T > convert(vector_type v) { -/* - return std::is_signed::value ? _mm_cvtepi32_epi64(v) - : _mm_cvtepu32_epi64(v); -*/ return std::is_signed::value ? 
(T*)v : (T*)v; } @@ -1743,7 +1417,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_loadu_si128((ivector_type*)ptr); return ptr; } @@ -1751,7 +1424,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_load_si128((ivector_type*)ptr); return ptr; } @@ -1759,19 +1431,16 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_set1_epi64x(*(int64_t*)ptr); return ptr; } static vector_type load1(const vector_type ptr) { -// return _mm_set1_epi32(*ptr); return ptr; } static vector_type set1(value_type val) { -// return _mm_set1_epi32(val); vector_base::result_value[0] = val; vector_base::result_value[1] = val; vector_base::result_value[2] = val; @@ -1783,7 +1452,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_storeu_si128((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1794,7 +1462,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_store_si128((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1805,7 +1472,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_storel_epi64((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; ptr[2] = v[2]; @@ -1814,7 +1480,6 @@ struct vector_traits::value || static vector_type add(vector_type a, vector_type b) { -// return _mm_add_epi32(a, b); vector_base::result_value[0] = a[0] + b[0]; vector_base::result_value[1] = a[1] + b[1]; vector_base::result_value[2] = a[2] + b[2]; @@ -1824,7 +1489,6 @@ struct vector_traits::value || static vector_type sub(vector_type a, vector_type b) { -// return _mm_sub_epi32(a, b); vector_base::result_value[0] = a[0] - b[0]; vector_base::result_value[1] = a[1] - b[1]; vector_base::result_value[2] = a[2] - b[2]; @@ -1834,7 +1498,6 @@ struct vector_traits::value || static 
vector_type mul(vector_type a, vector_type b) { -// return _mm_mullo_epi32(a, b); vector_base::result_value[0] = a[0] * b[0]; vector_base::result_value[1] = a[1] * b[1]; vector_base::result_value[2] = a[2] * b[2]; @@ -1844,16 +1507,6 @@ struct vector_traits::value || static vector_type div(vector_type a, vector_type b) { -/* - return _mm_setr_epi32((U)_mm_extract_epi32(a, 0) / - (U)_mm_extract_epi32(b, 0), - (U)_mm_extract_epi32(a, 1) / - (U)_mm_extract_epi32(b, 1), - (U)_mm_extract_epi32(a, 2) / - (U)_mm_extract_epi32(b, 2), - (U)_mm_extract_epi32(a, 3) / - (U)_mm_extract_epi32(b, 3)); -*/ vector_base::result_value[0] = a[0] / b[0]; vector_base::result_value[1] = a[1] / b[1]; vector_base::result_value[2] = a[2] / b[2]; @@ -1863,16 +1516,6 @@ struct vector_traits::value || static vector_type pow(vector_type a, vector_type b) { -/* - return _mm_setr_epi32((U)std::pow((U)_mm_extract_epi32(a, 0), - (U)_mm_extract_epi32(b, 0)), - (U)std::pow((U)_mm_extract_epi32(a, 1), - (U)_mm_extract_epi32(b, 1)), - (U)std::pow((U)_mm_extract_epi32(a, 2), - (U)_mm_extract_epi32(b, 2)), - (U)std::pow((U)_mm_extract_epi32(a, 3), - (U)_mm_extract_epi32(b, 3))); -*/ vector_base::result_value[0] = std::pow(a[0], b[0]); vector_base::result_value[1] = std::pow(a[1], b[1]); vector_base::result_value[2] = std::pow(a[2], b[2]); @@ -1882,7 +1525,6 @@ struct vector_traits::value || static vector_type negate(vector_type a) { -// return _mm_sub_epi32(_mm_setzero_si128(), a); vector_base::result_value[0] = 0 - a[0]; vector_base::result_value[1] = 0 - a[1]; vector_base::result_value[2] = 0 - a[2]; @@ -1892,12 +1534,6 @@ struct vector_traits::value || static vector_type exp(vector_type a) { -/* - return _mm_setr_epi32((U)std::exp((U)_mm_extract_epi32(a, 0)), - (U)std::exp((U)_mm_extract_epi32(a, 1)), - (U)std::exp((U)_mm_extract_epi32(a, 2)), - (U)std::exp((U)_mm_extract_epi32(a, 3))); -*/ vector_base::result_value[0] = std::exp(a[0]); vector_base::result_value[1] = std::exp(a[1]); 
vector_base::result_value[2] = std::exp(a[2]); @@ -1907,12 +1543,6 @@ struct vector_traits::value || static vector_type sqrt(vector_type a) { -/* - return _mm_setr_epi32((U)std::sqrt((U)_mm_extract_epi32(a, 0)), - (U)std::sqrt((U)_mm_extract_epi32(a, 1)), - (U)std::sqrt((U)_mm_extract_epi32(a, 2)), - (U)std::sqrt((U)_mm_extract_epi32(a, 3))); -*/ vector_base::result_value[0] = std::sqrt(a[0]); vector_base::result_value[1] = std::sqrt(a[1]); vector_base::result_value[2] = std::sqrt(a[2]); @@ -1941,11 +1571,6 @@ struct vector_traits::value || detail::enable_if_t::value, float* > convert(vector_type v) { -/* - float a = (U)_mm_extract_epi64(v, 0); - float b = (U)_mm_extract_epi64(v, 1); - return _mm_setr_ps(a, b, a, b); -*/ return (float*)v; } @@ -1953,11 +1578,6 @@ struct vector_traits::value || detail::enable_if_t::value, double* > convert(vector_type v) { -/* - double a = (U)_mm_extract_epi64(v, 0); - double b = (U)_mm_extract_epi64(v, 1); - return _mm_setr_pd(a, b); -*/ return (double*)v; } @@ -1965,7 +1585,6 @@ struct vector_traits::value || detail::enable_if_t>::value, float* > convert(vector_type v) { -// return _mm_unpacklo_ps(convert(v), _mm_setzero_ps()); return (float*)v; } @@ -1974,11 +1593,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - T a = (U)_mm_extract_epi64(v, 0); - T b = (U)_mm_extract_epi64(v, 1); - return _mm_setr_epi8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); -*/ return (T*)v; } @@ -1987,11 +1601,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - T a = (U)_mm_extract_epi64(v, 0); - T b = (U)_mm_extract_epi64(v, 1); - return _mm_setr_epi16(a, b, a, b, a, b, a, b); -*/ return (T*)v; } @@ -2000,11 +1609,6 @@ struct vector_traits::value || std::is_same::value, T* > convert(vector_type v) { -/* - T a = (U)_mm_extract_epi64(v, 0); - T b = (U)_mm_extract_epi64(v, 1); - return _mm_setr_epi32(a, b, a, b); -*/ return (T*)v; } @@ -2020,7 +1624,6 @@ struct 
vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_loadu_si128((ivector_type*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; return vector_base::result_value; @@ -2030,7 +1633,6 @@ struct vector_traits::value || detail::enable_if_t load(const vector_type ptr) { -// return _mm_load_si128((ivector_type*)ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; return vector_base::result_value; @@ -2038,7 +1640,6 @@ struct vector_traits::value || static vector_type load1(const vector_type ptr) { -// return _mm_set1_epi64x(*ptr); vector_base::result_value[0] = ptr[0]; vector_base::result_value[1] = ptr[1]; return vector_base::result_value; @@ -2046,7 +1647,6 @@ struct vector_traits::value || static vector_type set1(value_type val) { -// return _mm_set1_epi64x(val); vector_base::result_value[0] = val; vector_base::result_value[1] = val; return vector_base::result_value; @@ -2056,7 +1656,6 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_storeu_si128((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; } @@ -2065,14 +1664,12 @@ struct vector_traits::value || detail::enable_if_t store(vector_type v, vector_type ptr) { -// _mm_store_si128((ivector_type*)ptr, v); ptr[0] = v[0]; ptr[1] = v[1]; } static vector_type add( vector_type a, vector_type b ) { -// return _mm_add_epi64(a, b); vector_base::result_value[0] = ((U)a[0] + (U)b[0]); vector_base::result_value[1] = ((U)a[1] + (U)b[1]); return vector_base::result_value; @@ -2080,8 +1677,6 @@ struct vector_traits::value || static vector_type sub( vector_type a, vector_type b ) { -// return _mm_sub_epi64(a, b); - vector_base::result_value[0] = ((U)a[0] - (U)b[0]); vector_base::result_value[1] = ((U)a[1] - (U)b[1]); return vector_base::result_value; @@ -2089,12 +1684,6 @@ struct vector_traits::value || static vector_type mul( vector_type a, vector_type b ) { -/* - return 
_mm_set_epi64x((U)_mm_extract_epi64(a, 1) * - (U)_mm_extract_epi64(b, 1), - (U)_mm_extract_epi64(a, 0) * - (U)_mm_extract_epi64(b, 0)); -*/ vector_base::result_value[0] = ((U)a[0] * (U)b[0]); vector_base::result_value[1] = ((U)a[1] * (U)b[1]); return vector_base::result_value; @@ -2102,12 +1691,6 @@ struct vector_traits::value || static vector_type div( vector_type a, vector_type b ) { -/* - return _mm_set_epi64x((U)_mm_extract_epi64(a, 1) / - (U)_mm_extract_epi64(b, 1), - (U)_mm_extract_epi64(a, 0) / - (U)_mm_extract_epi64(b, 0)); -*/ vector_base::result_value[0] = ((U)a[0] / (U)b[0]); vector_base::result_value[1] = ((U)a[1] / (U)b[1]); return vector_base::result_value; @@ -2115,12 +1698,6 @@ struct vector_traits::value || static vector_type pow( vector_type a, vector_type b) { -/* - return _mm_set_epi64x((U)std::pow((U)_mm_extract_epi64(a, 1), - (U)_mm_extract_epi64(b, 1)), - (U)std::pow((U)_mm_extract_epi64(a, 0), - (U)_mm_extract_epi64(b, 0))); -*/ vector_base::result_value[0] = (U)std::pow((U)a[0], (U)b[0]); vector_base::result_value[1] = (U)std::pow((U)a[1], (U)b[1]); return vector_base::result_value; @@ -2128,7 +1705,6 @@ struct vector_traits::value || static vector_type negate( vector_type a ) { - //return _mm_sub_epi64(_mm_setzero_si128(), a); return ((U)0)-a; } diff --git a/src/util/cpuid.cxx b/src/util/cpuid.cxx index 6cb19b66b..b926c02cb 100644 --- a/src/util/cpuid.cxx +++ b/src/util/cpuid.cxx @@ -1,5 +1,8 @@ #include "cpuid.hpp" +#include +#include + #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) namespace tblis @@ -246,7 +249,10 @@ int get_cpu_type(int& model, int& part, int& features) auto pos = ptno.find("0x"); TBLIS_ASSERT(pos != std::string::npos); - part = strtoi(ptno, pos, 16); + + // part = strtoi(ptno, pos, 16); + auto ptno_substr = ptno.substr(pos, 16); + part = std::stoi(ptno, &pos, 16); return VENDOR_ARM; } diff --git a/src/util/cpuid.hpp b/src/util/cpuid.hpp index 320803d9d..f6be8bcc8 100644 --- 
a/src/util/cpuid.hpp +++ b/src/util/cpuid.hpp @@ -42,8 +42,8 @@ int get_cpu_type(int& family, int& model, int& features); namespace tblis { -enum {VENDOR_ARM, VENDOR_UNKNOWN} -enum {MODEL_ARMV7, MODEL_ARMV8, MODEL_UNKNOWN} +enum {VENDOR_ARM, VENDOR_UNKNOWN}; +enum {MODEL_ARMV7, MODEL_ARMV8, MODEL_UNKNOWN}; enum {FEATURE_NEON = 0x1}; int get_cpu_type(int& model, int& part, int& features);