diff --git a/.gitignore b/.gitignore index 0e6a6834..167de90c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .idea/ .vscode/ +.cache/ .ccls *.bin diff --git a/platforms/CLI-Emulator/main.cpp b/platforms/CLI-Emulator/main.cpp index 24261d85..7a041a49 100644 --- a/platforms/CLI-Emulator/main.cpp +++ b/platforms/CLI-Emulator/main.cpp @@ -406,6 +406,12 @@ int main(int argc, const char *argv[]) { } else { wac->run_module(m); } + + if(m->exception) { + fprintf(stderr, "wdcli: exception: %s\n", m->exception); + return 1; + } + wac->unload_module(m); wac->debugger->stop(); diff --git a/src/Debug/debugger.cpp b/src/Debug/debugger.cpp index 8828f2fe..feceeadc 100644 --- a/src/Debug/debugger.cpp +++ b/src/Debug/debugger.cpp @@ -366,6 +366,12 @@ void Debugger::printValue(const StackValue *v, const uint32_t idx, snprintf(buff, 255, R"("type":"F64","value":")" FMT(PRIx64) "\"", v->value.uint64); break; + case V128: + // we'll just use hex-strings + // 64-bit = 8 bytes = 16 nibbles = 16 hex-characters + snprintf(buff, 255, R"("type":"V128","value":"%016lx%016lx")", + v->value.simd.i64x2[0], v->value.simd.i64x2[1]); + break; default: snprintf(buff, 255, R"("type":"%02x","value":")" FMT(PRIx64) "\"", v->value_type, v->value.uint64); @@ -574,6 +580,10 @@ void Debugger::dumpLocals(const Module *m) const { snprintf(_value_str, 255, R"("type":"F64","value":%.7f)", v->value.f64); break; + case V128: + snprintf(_value_str, 255, R"("type":"V128","value":"%016lx%016lx")", + v->value.simd.i64x2[0], v->value.simd.i64x2[1]); + break; + default: snprintf(_value_str, 255, R"("type":"%02x","value":")" FMT(PRIx64) "\"", diff --git a/src/Interpreter/instructions.cpp b/src/Interpreter/instructions.cpp index 9046c0d0..5a49be1c 100644 --- a/src/Interpreter/instructions.cpp +++ b/src/Interpreter/instructions.cpp @@ -1267,3 +1267,737 @@ bool i_instr_callback([[maybe_unused]] Module *m, // TODO return true; } + +/** + * [0xfd] 0x0f...0x14 SIMD splat operations + */ +bool i_instr_simd_splat(Module* m, 
uint8_t opcode) { + auto &raw_top = m->stack[m->sp]; + auto &top = raw_top.value; + + switch(opcode) { + case 0x0f: { // i8x16.splat + const int8_t val = top.int32; + for(auto &i : top.simd.i8x16) i = val; + break; + } + + case 0x10: { // i16x8.splat + const int16_t val = top.int32; + for(auto &i : top.simd.i16x8) i = val; + break; + } + + case 0x11: { // i32x4.splat + const int32_t val = top.int32; + for(auto &i : top.simd.i32x4) i = val; + break; + } + + case 0x12: { // i64x2.splat + const int64_t val = top.int64; + for(auto &i : top.simd.i64x2) i = val; + break; + } + + case 0x13: { // f32x4.splat + const float val = top.f32; + for(auto &i : top.simd.f32x4) i = val; + break; + } + + case 0x14: { // f64x2.splat + const double val = top.f64; + for(auto &i : top.simd.f64x2) i = val; + break; + } + } + + raw_top.value_type = V128; + return true; +} + +bool i_instr_simd_extract(Module* m, uint8_t opcode){ + // inner helper to check if lane is within bounds, then execute the operation + auto &raw_top = m->stack[m->sp]; + const uint8_t lane = *m->pc_ptr; + m->pc_ptr++; + + // expect the compiler to inline the lambda - see + // https://stackoverflow.com/questions/13722426/why-can-lambdas-be-better-optimized-by-the-compiler-than-plain-functions/13722515#13722515 + auto lane_handler = [&raw_top, lane](const int max, I *in, O &out, const uint8_t type) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + out = in[lane]; + raw_top.value_type = type; + + return true; + }; + + auto lane_handler_unsigned = [&raw_top, lane](const int max, I *in, O &out, const uint8_t type) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + // type switcher between uint8 and uint16 based on input type + using temp_t = std::conditional_t, uint8_t, uint16_t>; + temp_t x; + memcpy(&x, &in[lane], sizeof(temp_t)); + out = x; + raw_top.value_type = type; + + return true; + }; 
+ + switch(opcode) { + case 0x15: // i8x16.extract_lane_s + return lane_handler(15, raw_top.value.simd.i8x16, raw_top.value.int32, I32); + case 0x16: // i8x16.extract_lane_u + return lane_handler_unsigned(15, raw_top.value.simd.i8x16, raw_top.value.uint32, I32); + + case 0x18: // i16x8.extract_lane_s + return lane_handler(7, raw_top.value.simd.i16x8, raw_top.value.int32, I32); + case 0x19: // i16x8.extract_lane_u + return lane_handler_unsigned(7, raw_top.value.simd.i16x8, raw_top.value.uint32, I32); + + case 0x1b: // i32x4.extract_lane + return lane_handler(3, raw_top.value.simd.i32x4, raw_top.value.uint32, I32); + + case 0x1d: // i64x2.extract_lane + return lane_handler(1, raw_top.value.simd.i64x2, raw_top.value.uint64, I64); + + case 0x1f: // f32x4.extract_lane + return lane_handler(3, raw_top.value.simd.f32x4, raw_top.value.f32, F32); + + case 0x21: // f64x2.extract_lane + return lane_handler(1, raw_top.value.simd.f64x2, raw_top.value.f64, F64); + } + + return false; +} + +bool i_instr_simd_replace(Module *m, uint8_t opcode) { + auto &v128 = m->stack[m->sp - 1]; + auto &update = m->stack[m->sp].value; + const uint8_t lane = *m->pc_ptr; + m->pc_ptr++; + m->sp -= 1; + + auto lane_handler = [&v128, lane](const int max, O *out, O replace) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + out[lane] = replace; + return true; + }; + + switch(opcode) { + case 0x17: // i8x16.replace_lane + return lane_handler(15, v128.value.simd.i8x16, static_cast(update.uint32)); + case 0x1a: // i16x8.replace_lane + return lane_handler(7, v128.value.simd.i16x8, static_cast(update.uint32)); + case 0x1c: // i32x4.replace_lane + return lane_handler(3, v128.value.simd.i32x4, static_cast(update.uint32)); + case 0x1e: // i64x2.replace_lane + return lane_handler(1, v128.value.simd.i64x2, static_cast(update.uint64)); + case 0x20: // f32x4.replace_lane + return lane_handler(3, v128.value.simd.f32x4, update.f32); + case 0x22: // 
f64x2.replace_lane + return lane_handler(1, v128.value.simd.f64x2, update.f64); + } + + return false; +} + +inline bool verify_endian() { + static bool _ = [] { + int n = 1; + if(*reinterpret_cast(&n) == 1) { + // little endian + return true; + } + + FATAL("v128.const only supported on little-endian systems"); + return false; + }(); + return _; +} + +bool i_instr_simd_const(Module* m){ + verify_endian(); + + const uint8_t *data = m->pc_ptr; + m->pc_ptr += 16; // skip immediate 16-byte data + + m->sp++; + auto &v = m->stack[m->sp].value.simd; + std::memcpy(&v, data, 16); + m->stack[m->sp].value_type = V128; + return true; +} + +bool i_instr_simd_store(Module* m){ + auto &sv = m->stack[m->sp--]; + StackValue sv2; + sv2.value_type = I64; + sv2.value.uint64 = sv.value.simd.i64x2[1]; + + const uint32_t flags = read_LEB_32(&m->pc_ptr); + const uint32_t offset = read_LEB_32(&m->pc_ptr); + + uint32_t ptr = m->stack[m->sp--].value.uint32; + + if(flags != 2 && TRACE) { + dbg_info( + " - unaligned store - flags: 0x%x, offset: 0x%x, addr: 0x%x, val: %s\n", + flags, offset, ptr, value_repr(&sv) + ); + } + + if(offset + ptr < ptr && !m->options.disable_memory_bounds) { + Interpreter::report_overflow(m, m->memory.bytes + offset + ptr); + } + + ptr += offset; + + // store in 2 consecutive locations + return m->warduino->interpreter->store(m, I64, ptr, sv) && + m->warduino->interpreter->store(m, I64, ptr + 8, sv2); +} + +bool i_instr_simd_load(Module* m) { + const uint32_t flags = read_LEB_32(&m->pc_ptr); + const uint32_t offset = read_LEB_32(&m->pc_ptr); + uint32_t ptr = m->stack[m->sp--].value.uint32; + if(flags != 2 && TRACE) { + dbg_info( + " - unaligned load - flags: 0x%x, offset: 0x%x, addr: 0x%x\n", + flags, offset, ptr + ); + } + + if(offset + ptr < ptr && !m->options.disable_memory_bounds) { + Interpreter::report_overflow(m, m->memory.bytes + offset + ptr); + } + + ptr += offset; + + // load from 2 consecutive locations + bool success = 
m->warduino->interpreter->load(m, I64, ptr, offset); + const auto i64_0 = m->stack[m->sp].value.int64; + m->sp--; // make sure we overwrite the previous load + success &= m->warduino->interpreter->load(m, I64, ptr + 8, offset); + const auto i64_1 = m->stack[m->sp].value.int64; + + // reconstruct v128 + auto & [value_type, value] = m->stack[m->sp]; + value_type = V128; + value.simd.i64x2[0] = i64_0; + value.simd.i64x2[1] = i64_1; + + return success; +} + +bool i_instr_simd_bin_bit_op(Module* m, uint8_t opcode){ + auto v1 = m->stack[m->sp - 1].value.simd; + auto v2 = m->stack[m->sp].value.simd; + m->sp--; + + const auto apply = [&v1, &v2, m](F &&op) { + m->stack[m->sp].value.simd.i64x2[0] = op(v1.i64x2[0], v2.i64x2[0]); + m->stack[m->sp].value.simd.i64x2[1] = op(v1.i64x2[1], v2.i64x2[1]); + return true; + }; + + switch(opcode) { + case 0x4e: // bit-and + return apply([](const uint64_t &a, const uint64_t &b) { return a & b; }); + case 0x4f: // bit-and-not + return apply([](const uint64_t &a, const uint64_t &b) { return a & ~b; }); + case 0x50: // bit-or + return apply([](const uint64_t &a, const uint64_t &b) { return a | b; }); + case 0x51: // xor + return apply([](const uint64_t &a, const uint64_t &b) { return a ^ b; }); + } + + return false; +} + +bool i_instr_simd_v128_not(Module* m){ + m->stack[m->sp].value.simd.i64x2[0] = ~m->stack[m->sp].value.simd.i64x2[0]; + m->stack[m->sp].value.simd.i64x2[1] = ~m->stack[m->sp].value.simd.i64x2[1]; + return true; +} + + +// to implement the whole swathe of SIMD instructions, let's use some templates +// so the compiler can generate the code... 
+namespace { +// anonymous namespace to hide implementation details/allow code folding +// the SIMD vector type (for ease of coding) +using simd_t = decltype(StackValue::value.simd); + +// use type-traits for all compile-time meta-data +template struct v128_traits; +template <> struct v128_traits { + // we need to know the number of elements + constexpr static uint N = 16; + // we need to know the amount of bits + constexpr static uint bits = 8; + // the type of elements in the SIMD vector + using type = int8_t; + // the type to use when an operation returns a boolean + using bool_alt = int8_t; + // we need to know the signed version of this type + using signed_t = int8_t; + // we also need to know the unsigned version of this type + using unsigned_t = uint8_t; + // for the ext_add_pairwise, we need the extended type + using extended_t = int16_t; + // pointer-to-member to access the SIMD vector + constexpr static auto member = &simd_t::i8x16; +}; +template <> struct v128_traits { + constexpr static uint N = 16; + constexpr static uint bits = 8; + using type = uint8_t; + using bool_alt = int8_t; + using signed_t = int8_t; + using unsigned_t = uint8_t; + using extended_t = uint16_t; + constexpr static auto member = &simd_t::i8x16; +}; +template <> struct v128_traits { + constexpr static uint N = 8; + constexpr static uint bits = 16; + using type = int16_t; + using bool_alt = int16_t; + using signed_t = int16_t; + using unsigned_t = uint16_t; + using extended_t = int32_t; + constexpr static auto member = &simd_t::i16x8; +}; +template <> struct v128_traits { + constexpr static uint N = 8; + constexpr static uint bits = 16; + using type = uint16_t; + using bool_alt = int16_t; + using signed_t = int16_t; + using unsigned_t = uint16_t; + using extended_t = uint32_t; + constexpr static auto member = &simd_t::i16x8; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + constexpr static uint bits = 32; + using type = int32_t; + using bool_alt = int32_t; + 
using signed_t = int32_t; + using unsigned_t = uint32_t; + constexpr static auto member = &simd_t::i32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + constexpr static uint bits = 32; + using type = uint32_t; + using bool_alt = int32_t; + using signed_t = int32_t; + using unsigned_t = uint32_t; + constexpr static auto member = &simd_t::i32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + constexpr static uint bits = 64; + using type = int64_t; + using bool_alt = int64_t; + using signed_t = int64_t; + using unsigned_t = uint64_t; + constexpr static auto member = &simd_t::i64x2; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + constexpr static uint bits = 64; + using type = uint64_t; + using bool_alt = int64_t; + using signed_t = int64_t; + using unsigned_t = uint64_t; + constexpr static auto member = &simd_t::i64x2; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + using type = float; + using bool_alt = int32_t; + using signed_t = float; + constexpr static auto member = &simd_t::f32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + using type = double; + using bool_alt = int64_t; + using signed_t = double; + constexpr static auto member = &simd_t::f64x2; +}; + +// get a reference to a lane in a SIMD vector +template +constexpr typename v128_traits::signed_t &get_lane_ref(StackValue &sv, uint8_t lane) { + // v128_traits::member is pointer-to-member + // (struct).*(ptr-to-member) gets struct.member + // so we get (simd-union).(correct type field)[lane] + return (sv.value.simd.*v128_traits::member)[lane]; +} +// get a const reference to a lane in a SIMD vector +template +constexpr const typename v128_traits::signed_t &get_lane_cref(const StackValue &sv, uint8_t lane) { + return (sv.value.simd.*v128_traits::member)[lane]; +} +// get a copy of a lane in a SIMD vector +template +constexpr typename v128_traits::signed_t get_lane_copy(StackValue &sv, uint8_t 
lane) { + return (sv.value.simd.*v128_traits::member)[lane]; +} + +// apply binary operator lane-wise +template +constexpr bool simd_bin_op(Module *m, const StackValue &sv1, const StackValue &sv2, F &&f) { + static_assert(std::is_invocable_v, "Operation is not invocable on type T"); + static_assert(sizeof(T) == sizeof(ExplicitCast), "ExplicitCast must have same size as T"); + + using result_t = std::conditional_t< + std::is_same_v, bool>, // if the result is a boolean + typename v128_traits::bool_alt, // then use the bool_alt type (to ensure lane count for floats) + typename v128_traits::type // otherwise, use the normal type + >; + using result_traits = v128_traits; // v128 traits for the result type + + static_assert(v128_traits::N == result_traits::N, "Result type must have same lane count as input type"); + + m->stack[m->sp].value_type = V128; + for(uint i = 0; i < v128_traits::N; i++) { + ExplicitCast lane1, lane2; + if constexpr(std::is_same_v) { + lane1 = get_lane_cref(sv1, i); + lane2 = get_lane_cref(sv2, i); + } + else { + memcpy(&lane1, &get_lane_cref(sv1, i), sizeof(T)); + memcpy(&lane2, &get_lane_cref(sv2, i), sizeof(T)); + } + get_lane_ref(m->stack[m->sp], i) = f(lane1, lane2); + } + return true; +} + +template typename FPre, typename ExplicitCast = T, typename F = FPre> +constexpr bool simd_bin_op(Module *m, const StackValue &sv1, const StackValue &sv2) { + return simd_bin_op(m, sv1, sv2, F{}); +} + +template typename FPre, typename ExplicitCast = T, typename F = FPre> +constexpr bool simd_shift(Module *m, const StackValue &sv1, const StackValue &sv2) { + static_assert(std::is_invocable_v, "Operation is not invocable on type T"); + static_assert(sizeof(T) == sizeof(ExplicitCast), "ExplicitCast must have same size as T"); + + using result_t = std::conditional_t< + std::is_same_v, bool>, // if the result is a boolean + typename v128_traits::bool_alt, // then use the bool_alt type (to ensure lane count for floats) + typename v128_traits::type // 
otherwise, use the normal type + >; + using result_traits = v128_traits; // v128 traits for the result type + + static_assert(v128_traits::N == result_traits::N, "Result type must have same lane count as input type"); + + F f{}; + + m->stack[m->sp].value_type = V128; + for(uint i = 0; i < v128_traits::N; i++) { + ExplicitCast lane1; + int32_t shift = sv2.value.int32; + if constexpr(std::is_same_v) { + lane1 = get_lane_cref(sv1, i); + } + else { + memcpy(&lane1, &get_lane_cref(sv1, i), sizeof(T)); + } + get_lane_ref(m->stack[m->sp], i) = f(lane1, shift); + } + return true; +} + +template +constexpr bool simd_ext_add(Module *m) { + using T_traits = v128_traits; + using extend_t = typename T_traits::extended_t; + using extend_traits = v128_traits; + + StackValue copy = m->stack[m->sp]; + m->stack[m->sp].value_type = V128; + StackValue &res = m->stack[m->sp]; + + for(uint i = 0; i < extend_traits::N; i++) { + auto x1 = static_cast(get_lane_cref(copy, 2 * i)); + auto x2 = static_cast(get_lane_cref(copy, 2 * i + 1)); + get_lane_ref(res, i) = x1 + x2; + } + + return true; +} +} + +// helper code for saturating arithmetic +namespace { +template +struct sat_add { + constexpr T operator()(const T t1, const T t2) const { + // from https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/sat_arith.h#L49 + T t3{}; + if(!__builtin_add_overflow(t1, t2, &t3)) return t3; + if constexpr(std::is_unsigned_v) return std::numeric_limits::max(); + else if(t1 < 0) return std::numeric_limits::min(); + else return std::numeric_limits::max(); + } +}; + +template +struct sat_sub { + constexpr T operator()(const T t1, const T t2) const { + // from https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/sat_arith.h#L65 + T t3{}; + if(!__builtin_sub_overflow(t1, t2, &t3)) return t3; + if constexpr(std::is_unsigned_v) return 0; + else if(t1 < 0) return std::numeric_limits::min(); + else return std::numeric_limits::max(); + } +}; + +template struct min_struct 
{ constexpr T operator()(const T t1, const T t2) const { return std::min(t1, t2); } }; +template struct max_struct { constexpr T operator()(const T t1, const T t2) const { return std::max(t1, t2); } }; +// pseudo-min/max as defined in https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#pseudo-minimum +template struct p_min_struct { constexpr T operator()(const T t1, const T t2) const { return t2 < t1 ? t2 : t1; } }; +template struct p_max_struct { constexpr T operator()(const T t1, const T t2) const { return t1 < t2 ? t2 : t1; } }; + +template struct shift_left { + constexpr T operator()(const T t1, const int32_t t2) const { return static_cast(t1 << (t2 % v128_traits::bits)); } +}; +template struct arith_shift_right { + // sign-preserving right shift ~> works only "correctly" on signed types + constexpr T operator()(const T t1, const int32_t t2) const { + using type = typename v128_traits::signed_t; + type st1; + + if constexpr(std::is_same_v) st1 = t1; + else memcpy(&st1, &t1, sizeof(T)); + + st1 >>= (t2 % v128_traits::bits); + + T result; + if constexpr(std::is_same_v) result = st1; + else memcpy(&result, &st1, sizeof(T)); + + return result; + } +}; +template struct logic_shift_right { + // zero fill right shift ~> works only "correctly" on unsigned types + constexpr T operator()(const T t1, const int32_t t2) const { + using type = typename v128_traits::unsigned_t; + type ut1; + + if constexpr(std::is_same_v) ut1 = t1; + else memcpy(&ut1, &t1, sizeof(T)); + + ut1 >>= (t2 % v128_traits::bits); + + T result; + if constexpr(std::is_same_v) result = ut1; + else memcpy(&result, &ut1, sizeof(T)); + + return result; + } +}; +} + +bool i_instr_simd_bin_v128_v128_op(Module* m, uint8_t opcode){ + // need to keep copies! 
+ const auto v1 = m->stack[m->sp - 1]; + const auto v2 = m->stack[m->sp]; + m->sp--; + + switch(opcode) { + // i8x16 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x23: return simd_bin_op(m, v1, v2); + case 0x24: return simd_bin_op(m, v1, v2); + case 0x25: return simd_bin_op(m, v1, v2); + case 0x26: return simd_bin_op(m, v1, v2); + case 0x27: return simd_bin_op(m, v1, v2); + case 0x28: return simd_bin_op(m, v1, v2); + case 0x29: return simd_bin_op(m, v1, v2); + case 0x2a: return simd_bin_op(m, v1, v2); + case 0x2b: return simd_bin_op(m, v1, v2); + case 0x2c: return simd_bin_op(m, v1, v2); + // i16x8 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x2d: return simd_bin_op(m, v1, v2); + case 0x2e: return simd_bin_op(m, v1, v2); + case 0x2f: return simd_bin_op(m, v1, v2); + case 0x30: return simd_bin_op(m, v1, v2); + case 0x31: return simd_bin_op(m, v1, v2); + case 0x32: return simd_bin_op(m, v1, v2); + case 0x33: return simd_bin_op(m, v1, v2); + case 0x34: return simd_bin_op(m, v1, v2); + case 0x35: return simd_bin_op(m, v1, v2); + case 0x36: return simd_bin_op(m, v1, v2); + // i32x4 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x37: return simd_bin_op(m, v1, v2); + case 0x38: return simd_bin_op(m, v1, v2); + case 0x39: return simd_bin_op(m, v1, v2); + case 0x3a: return simd_bin_op(m, v1, v2); + case 0x3b: return simd_bin_op(m, v1, v2); + case 0x3c: return simd_bin_op(m, v1, v2); + case 0x3d: return simd_bin_op(m, v1, v2); + case 0x3e: return simd_bin_op(m, v1, v2); + case 0x3f: return simd_bin_op(m, v1, v2); + case 0x40: return simd_bin_op(m, v1, v2); + // f32x4 -> ==, !=, <, >, <=, >= + case 0x41: return simd_bin_op(m, v1, v2); + case 0x42: return simd_bin_op(m, v1, v2); + case 0x43: return simd_bin_op(m, v1, v2); + case 0x44: return simd_bin_op(m, v1, v2); + case 0x45: return simd_bin_op(m, v1, v2); + case 0x46: return simd_bin_op(m, v1, v2); + // f64x2 -> ==, !=, <, >, <=, >= + case 0x47: return simd_bin_op(m, v1, v2); + case 0x48: return simd_bin_op(m, v1, v2); + case 
0x49: return simd_bin_op(m, v1, v2); + case 0x4a: return simd_bin_op(m, v1, v2); + case 0x4b: return simd_bin_op(m, v1, v2); + case 0x4c: return simd_bin_op(m, v1, v2); + // i8x16 -> +, + sat s, + sat u, -, - sat s, - sat u + case 0x6e: return simd_bin_op(m, v1, v2); + case 0x6f: return simd_bin_op(m, v1, v2); + case 0x70: return simd_bin_op(m, v1, v2); + case 0x71: return simd_bin_op(m, v1, v2); + case 0x72: return simd_bin_op(m, v1, v2); + case 0x73: return simd_bin_op(m, v1, v2); + // i8x16 min s, min u, max s, max u (can't pass std::min/std::max as template argument, they are function-overload-sets) + case 0x76: return simd_bin_op(m, v1, v2); + case 0x77: return simd_bin_op(m, v1, v2); + case 0x78: return simd_bin_op(m, v1, v2); + case 0x79: return simd_bin_op(m, v1, v2); + // i16x8 -> +, + sat s, + sat u, -, - sat s, - sat u, * + case 0x8e: return simd_bin_op(m, v1, v2); + case 0x8f: return simd_bin_op(m, v1, v2); + case 0x90: return simd_bin_op(m, v1, v2); + case 0x91: return simd_bin_op(m, v1, v2); + case 0x92: return simd_bin_op(m, v1, v2); + case 0x93: return simd_bin_op(m, v1, v2); + case 0x95: return simd_bin_op(m, v1, v2); + // i16x8 min s, min u, max s, max u + case 0x96: return simd_bin_op(m, v1, v2); + case 0x97: return simd_bin_op(m, v1, v2); + case 0x98: return simd_bin_op(m, v1, v2); + case 0x99: return simd_bin_op(m, v1, v2); + // i32x4 -> +, -, * + case 0xae: return simd_bin_op(m, v1, v2); + case 0xb1: return simd_bin_op(m, v1, v2); + case 0xb5: return simd_bin_op(m, v1, v2); + // i32x4 min s, min u, max s, max u + case 0xb6: return simd_bin_op(m, v1, v2); + case 0xb7: return simd_bin_op(m, v1, v2); + case 0xb8: return simd_bin_op(m, v1, v2); + case 0xb9: return simd_bin_op(m, v1, v2); + // i64x2 -> +, -, * + case 0xce: return simd_bin_op(m, v1, v2); + case 0xd1: return simd_bin_op(m, v1, v2); + case 0xd5: return simd_bin_op(m, v1, v2); + // i64x2 -> ==, !=, <, >, <=, >= + case 0xd6: return simd_bin_op(m, v1, v2); + case 0xd7: return 
simd_bin_op(m, v1, v2); + case 0xd8: return simd_bin_op(m, v1, v2); + case 0xd9: return simd_bin_op(m, v1, v2); + case 0xda: return simd_bin_op(m, v1, v2); + case 0xdb: return simd_bin_op(m, v1, v2); + // f32x4 -> +, -, *, /, min, max, pmin, pmax + case 0xe4: return simd_bin_op(m, v1, v2); + case 0xe5: return simd_bin_op(m, v1, v2); + case 0xe6: return simd_bin_op(m, v1, v2); + case 0xe7: return simd_bin_op(m, v1, v2); + case 0xe8: return simd_bin_op(m, v1, v2); + case 0xe9: return simd_bin_op(m, v1, v2); + case 0xea: return simd_bin_op(m, v1, v2); + case 0xeb: return simd_bin_op(m, v1, v2); + // f64x2 -> +, -, *, /, min, max + case 0xf0: return simd_bin_op(m, v1, v2); + case 0xf1: return simd_bin_op(m, v1, v2); + case 0xf2: return simd_bin_op(m, v1, v2); + case 0xf3: return simd_bin_op(m, v1, v2); + case 0xf4: return simd_bin_op(m, v1, v2); + case 0xf5: return simd_bin_op(m, v1, v2); + case 0xf6: return simd_bin_op(m, v1, v2); + case 0xf7: return simd_bin_op(m, v1, v2); + + default: + return false; + } +} + +bool i_instr_simd_shift(Module* m, uint8_t opcode){ + // need to keep copies! 
+ const auto v1 = m->stack[m->sp - 1]; + const auto v2 = m->stack[m->sp]; + m->sp--; + + switch(opcode) { + // i8x16 <<, >>(s), >>(u) + case 0x6b: return simd_shift(m, v1, v2); + case 0x6c: return simd_shift(m, v1, v2); + case 0x6d: return simd_shift(m, v1, v2); + // i16x8 <<, >>(s), >>(u) + case 0x8b: return simd_shift(m, v1, v2); + case 0x8c: return simd_shift(m, v1, v2); + case 0x8d: return simd_shift(m, v1, v2); + // i32x4 <<, >>(s), >>(u) + case 0xab: return simd_shift(m, v1, v2); + case 0xac: return simd_shift(m, v1, v2); + case 0xad: return simd_shift(m, v1, v2); + // i64x2 <<, >>(s), >>(u) + case 0xcb: return simd_shift(m, v1, v2); + case 0xcc: return simd_shift(m, v1, v2); + case 0xcd: return simd_shift(m, v1, v2); + + default: + return false; + } +} + +bool i_instr_simd_ext_add_pairwise(Module* m, uint8_t opcode){ + switch(opcode) { + case 0x7c: return simd_ext_add(m); + case 0x7d: return simd_ext_add(m); + case 0x7e: return simd_ext_add(m); + case 0x7f: return simd_ext_add(m); + default: + return false; + } +} + +bool i_instr_simd_swizzle(Module* m){ + const int8_t *current = m->stack[m->sp - 1].value.simd.i8x16; + uint8_t swizzle[16]; + memcpy(swizzle, m->stack[m->sp].value.simd.i8x16, 16 * sizeof(int8_t)); + int8_t lanes[16]; + + for(int i = 0; i < 16; i++) { + lanes[i] = swizzle[i] < 16 ? 
current[swizzle[i]] : static_cast(0); + } + + m->sp--; + m->stack[m->sp].value_type = V128; + memcpy(m->stack[m->sp].value.simd.i8x16, lanes, 16 * sizeof(int8_t)); + return true; +} diff --git a/src/Interpreter/instructions.h b/src/Interpreter/instructions.h index 54a80f1b..69b0810e 100644 --- a/src/Interpreter/instructions.h +++ b/src/Interpreter/instructions.h @@ -73,3 +73,27 @@ bool i_instr_binary_f64(Module *m, uint8_t opcode); bool i_instr_conversion(Module *m, uint8_t opcode); bool i_instr_callback(Module *m, uint8_t opcode); + +bool i_instr_simd_splat(Module *m, uint8_t opcode); + +bool i_instr_simd_extract(Module *m, uint8_t opcode); + +bool i_instr_simd_replace(Module *m, uint8_t opcode); + +bool i_instr_simd_const(Module *m); + +bool i_instr_simd_store(Module *m); + +bool i_instr_simd_load(Module *m); + +bool i_instr_simd_bin_bit_op(Module *m, uint8_t opcode); + +bool i_instr_simd_v128_not(Module *m); + +bool i_instr_simd_bin_v128_v128_op(Module *m, uint8_t opcode); + +bool i_instr_simd_shift(Module *m, uint8_t opcode); + +bool i_instr_simd_ext_add_pairwise(Module *m, uint8_t opcode); + +bool i_instr_simd_swizzle(Module *m); \ No newline at end of file diff --git a/src/Interpreter/interpreter.cpp b/src/Interpreter/interpreter.cpp index b5f3f4fe..cab9dad5 100644 --- a/src/Interpreter/interpreter.cpp +++ b/src/Interpreter/interpreter.cpp @@ -427,6 +427,11 @@ bool Interpreter::interpret(Module *m, bool waiting) { case 0xe0 ... 
0xe3: success &= i_instr_callback(m, opcode); continue; + + case 0xfd: + success &= interpret_simd(m); + continue; + default: sprintf(exception, "unrecognized opcode 0x%x", opcode); if (m->options.return_exception) { @@ -455,12 +460,283 @@ bool Interpreter::interpret(Module *m, bool waiting) { if (!success && m->options.return_exception) { m->exception = strdup(exception); } else if (!success) { - FATAL("%s\n", exception); + FATAL("%s (0x%x)\n", exception, opcode); } return success; } +bool Interpreter::interpret_simd(Module *m) { + // this function should only be called from Interpreter::interpret + // and should only be called when the opcode is 0xfd + // -> at this point, the PC is pointing to the (first byte of) the actual + // opcode + + // TODO: technically the 2nd byte (and onwards) of the multi-byte SIMD + // opcodes are LEB-encoded u32's and could have multiple + // representations. However, we only support the shortest possible + // encodings; see https://webassembly.github.io/spec/core/appendix/index-instructions.html + + constexpr static auto next_pc_0x01 = [](Module *m, const uint8_t opcode) -> bool { + const auto next = *m->pc_ptr; + m->pc_ptr++; + if(next != 0x01) { + sprintf(exception, "SIMD opcode 0x%02x%02x should be followed by 0x%02x, but got 0x%02x", + 0xfd, opcode, 0x01, next); + return false; + } + return true; + }; + + // TODO: should be removed one day... + constexpr static auto not_implemented = [](const uint8_t opcode) -> bool { + sprintf(exception, "SIMD opcode 0x%02x%02x is not implemented (yet)", + 0xfd, opcode); + return false; + }; + + const auto opcode = *m->pc_ptr; + m->pc_ptr++; + + switch(opcode) { + case 0x00: return i_instr_simd_load(m); + + case 0x0b: return i_instr_simd_store(m); + case 0x0c: return i_instr_simd_const(m); + + case 0x0d: return not_implemented(opcode); // TODO: i8x16.shuffle + case 0x0e: return i_instr_simd_swizzle(m); + + case 0x0f ... 
0x14: return i_instr_simd_splat(m, opcode); + + case 0x15: // interspersed with (dim).extract_lane + case 0x16: + case 0x18: + case 0x19: + case 0x1b: + case 0x1d: + case 0x1f: + case 0x21: + return i_instr_simd_extract(m, opcode); + + case 0x17: + case 0x1a: + case 0x1c: + case 0x1e: + case 0x20: + case 0x22: + return i_instr_simd_replace(m, opcode); + + case 0x23 ... 0x2c: // i8x16 relational operators + case 0x2d ... 0x36: // i16x8 relational operators + case 0x37 ... 0x40: // i32x4 relational operators + case 0x41 ... 0x46: // f32x4 relational operators + case 0x47 ... 0x4c: // f64x2 relational operators + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x4d: // v128.not + return i_instr_simd_v128_not(m); + + case 0x4e ... 0x51: // v128 bit-wise operators + return i_instr_simd_bin_bit_op(m, opcode); + + case 0x52: // TODO -> v128.bitselect + return not_implemented(opcode); + + case 0x53: // TODO -> v128.any_true + return not_implemented(opcode); + + case 0x54 ... 0x5d: // TODO -> v128.loadX_lane, v128.storeX_lane, v128.loadX_zero + return not_implemented(opcode); + + case 0x5e ... 0x5f: // TODO -> demote/promote f32<->f64 + return not_implemented(opcode); + + case 0x60 ... 0x62: // TODO -> i8x16. ~ abs, neg, popcnt + return not_implemented(opcode); + + case 0x63 ... 0x64: // TODO -> i8x16. ~ all_true, bitmask + return not_implemented(opcode); + + case 0x65 ... 0x66: // TODO -> i8x16.narrow_i16x8_(s/u) + return not_implemented(opcode); + + case 0x67 ... 0x6a: // TODO -> f32x4. ~ ceil, floor, trunc, nearest + return not_implemented(opcode); + + case 0x6b ... 0x6d: // i8x16 shifts + return i_instr_simd_shift(m, opcode); + + case 0x6e ... 0x73: // i8x16.(add/sub) + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x74 ... 0x75: // TODO -> f64x2. ~ ceil, floor + return not_implemented(opcode); + + case 0x76 ... 
0x79: // i8x16.(min/max) + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x7a: // TODO -> f64x2.trunc + return not_implemented(opcode); + + case 0x7b: // TODO -> i8x16.avgr_u + return not_implemented(opcode); + + case 0x7c ... 0x7f: // TODO -> (dim).extadd_pairwise_(dim2) + return i_instr_simd_ext_add_pairwise(m, opcode); + + // --- From 0x80: 3-byte SIMD opcodes, so check next PC for 0x01 --- + + case 0x80 ... 0x81: // TODO -> i16x8. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x82: // TODO -> i16x8.q15mulr_sat_s + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x83 ... 0x84: // TODO -> i16x8. ~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x85 ... 0x86: // TODO -> i16x8.narrow_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x87 ... 0x8a: // TODO -> i16x8.extend_(low/high)_i8x16_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x8b ... 0x8d: // i16x8 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0x8e ... 0x93: // i16x8.(add/sub) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x94: // TODO -> f64x2.nearest + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x95: // i16x8.mul + case 0x96 ... 0x99: // i16x8.(min/max) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0x9a: ! invalid opcode ! + + case 0x9b: // TODO -> i16x8.avgr_u + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x9c ... 0x9f: // TODO -> i16x8.extmul_(low/high)_i8x16_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xa0 ... 0xa1: // TODO -> i32x4. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xa2: ! invalid opcode ! + + case 0xa3 ... 0xa4: // TODO -> i32x4. 
~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xa5 ... 0xa6: ! invalid opcode ! + + case 0xa7 ... 0xaa: // TODO -> i32x4.extend_(low/high)_i16x8_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xab ... 0xad: // i32x4 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0xae: // i32x4.add + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xaf ... 0xb0: ! invalid opcode ! + + case 0xb1: // i32x4.sub + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xb2 ... 0xb4: ! invalid opcode ! + + case 0xb5: // i32x4.mul + case 0xb6 ... 0xb9: // i32x4.(min/max) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xba: // TODO -> i32x4.dot_i16x8_s + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xbb: ! invalid opcode ! + + case 0xbc ... 0xbf: // TODO -> i32x4.extmul_(low/high)_i16x8_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xc0 ... 0xc1: // TODO -> i64x2. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xc2: ! invalid opcode ! + + case 0xc3 ... 0xc4: // TODO -> i64x2. ~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xc5 ... 0xc6: ! invalid opcode ! + + case 0xc7 ... 0xca: // TODO -> i64x2.extend_(low/high)_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xcb ... 0xcd: // i64x2 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0xce: // i64x2.add + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xcf ... 0xd0: ! invalid opcode ! + + case 0xd1: // i64x2.sub + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xd2 ... 0xd4: ! invalid opcode ! 
+ + case 0xd5: // i64x2.mul + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xd6 ... 0xdb: // i64x2 relational operators + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xdc ... 0xdf: // TODO -> i64x2.extmul_(low/high)_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xe0 ... 0xe1: // TODO -> f32x4. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xe2: ! invalid opcode ! + + case 0xe3: // TODO -> f32x4.sqrt + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xe4 ... 0xeb: // f32x4. ~ add, sub, mul, div, min, max, pmin, pmax + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xec ... 0xed: // TODO -> f64x2. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xee: ! invalid opcode ! + case 0xef: // TODO -> f64x2.sqrt + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xf0 ... 0xf7: // f64x2. ~ add, sub, mul, div, min, max, pmin, pmax + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xf8 ... 0xf9: // TODO -> i32x4.trunc_sat_f32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfa ... 0xfb: // TODO -> f32x4.convert_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfc ... 0xfd: // TODO -> i32x4.trunc_sat_f64x2_(s/u)_zero + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfe ... 
0xff: // TODO -> f64x2.convert_low_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + default: { + sprintf(exception, "unrecognized SIMD opcode 0x%x (0x%x 0x%x)", opcode, 0xfd, opcode); + return false; + } + } +} + + void Interpreter::report_overflow([[maybe_unused]] Module *m, [[maybe_unused]] uint8_t *maddr) { dbg_warn("memory start: %p, memory end: %p, maddr: %p\n", m->memory.bytes, diff --git a/src/Interpreter/interpreter.h b/src/Interpreter/interpreter.h index d11260d3..7afa7966 100644 --- a/src/Interpreter/interpreter.h +++ b/src/Interpreter/interpreter.h @@ -55,5 +55,6 @@ class Interpreter { static void report_overflow(Module *m, uint8_t *maddr); protected: + bool interpret_simd(Module *m); private: }; diff --git a/src/Utils/util.cpp b/src/Utils/util.cpp index 7455bb3f..305064fb 100644 --- a/src/Utils/util.cpp +++ b/src/Utils/util.cpp @@ -61,9 +61,11 @@ uint64_t read_LEB_(uint8_t **pos, uint32_t maxbits, bool sign) { FATAL("Unsigned LEB at byte %p overflow", (void *)startpos); } } + if (sign && (shift < maxbits) && (byte & 0x40u)) { // Sign extend by highest bits set 1 except last shift bits - result |= UINT64_MAX << shift; + // keep the 64-bit mask: (~0ul) is only 32 bits wide on ILP32 (embedded) targets + result |= UINT64_MAX << shift; } return result; } @@ -92,24 +94,33 @@ StackValue *readWasmArgs(Type function, uint8_t *data) { switch (args[i].value_type) { case I32: { - args[i].value.int32 = read_LEB_signed(&data, 32); + args[i].value.uint32 = read_LEB_signed(&data, 32); break; } case F32: { memcpy(&args[i].value.f32, data, sizeof(float)); // todo read ieee 754 + printf("Received F32 %f (%02x%02x%02x%02x)", args[i].value.f32, data[0], data[1], data[2], data[3]); data += sizeof(float); break; } case I64: { - args[i].value.int64 = read_LEB_signed(&data, 64); + args[i].value.uint64 = read_LEB_signed(&data, 64); break; } case F64: { memcpy(&args[i].value.f64, data, sizeof(double)); + printf("Received F64 %lf (%02x%02x%02x%02x%02x%02x%02x%02x)", + args[i].value.f64, data[0], data[1], 
data[2], data[3], + data[4], data[5], data[6], data[7]); data += sizeof(double); break; } + case V128: { + memcpy(&args[i].value.simd, data, sizeof(decltype(args[i].value.simd))); + data += sizeof(decltype(args[i].value.simd)); + break; + } default: { FATAL("no argument of type %" SCNu8 "\n", args[i].value_type); } diff --git a/src/WARDuino.h b/src/WARDuino.h index 8d04d160..ec67f362 100644 --- a/src/WARDuino.h +++ b/src/WARDuino.h @@ -28,6 +28,8 @@ #define F32 0x7d // -0x03 #define F64 0x7c // -0x04 +#define V128 0x7b // according to the spec? TODO + #define I32_8 0x7b // -0x05 #define I32_16 0x7a // -0x06 #define I64_8 0x79 // -0x07 diff --git a/src/WARDuino/WARDuino.cpp b/src/WARDuino/WARDuino.cpp index 215821c0..504cb69e 100644 --- a/src/WARDuino/WARDuino.cpp +++ b/src/WARDuino/WARDuino.cpp @@ -53,9 +53,9 @@ bool resolvesym(char *filename, char *symbol, uint8_t external_kind, void **val, // char exception[4096]; // Static definition of block_types -uint32_t block_type_results[4][1] = {{I32}, {I64}, {F32}, {F64}}; +uint32_t block_type_results[5][1] = {{I32}, {I64}, {F32}, {F64}, {V128}}; -Type block_types[5]; +Type block_types[6]; void initTypes() { block_types[0].form = BLOCK; @@ -140,6 +140,44 @@ void parse_memory_type(Module *m, uint8_t **pos) { } } +void skip_immediates_simd(uint8_t **pos) { + uint8_t simd_opcode = **pos; + *pos += 1; // skip opcode + switch(simd_opcode) { + case 0x00 ... 0x0b: // v128.loadXXX, v128.store + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + break; + case 0x0c: // v128.const + *pos += 16; // v128 consts are straight 16-byte blocks + break; + case 0x0d: // i8x16.shuffle + *pos += 16; // 16 lane-index immediate bytes, one per lane + break; + case 0x0e ... 0x14: // i8x16.swizzle, dim.splat + break; + case 0x15 ... 0x22: // dim.extract_lane, dim.replace_lane + *pos += 1; + break; + case 0x23 ... 0x53: // relational operators + break; + case 0x54 ... 0x5b: // v128.loadX_lane, v128.storeX_lane + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + *pos += 1; + break; + case 0x5c ... 
0x5d: // v128.loadX_zero + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + break; + case 0x5e ... 0x7f: // math-like functionality, extensions, ... + break; + case 0x80 ... 0xff: { + uint8_t second_opcode = **pos; + assert(second_opcode == 0x01 && "3-byte SIMD instructions should have 0x01 as their 3rd byte."); + *pos += 1; // skip the 0x01 byte; *pos++ only bumped the local pointer-to-pointer + break; + } + } +} + void skip_immediates(uint8_t **pos) { uint32_t count, opcode = **pos; *pos = *pos + 1; @@ -190,6 +228,9 @@ void skip_immediates(uint8_t **pos) { } read_LEB_32(pos); // default target break; + case 0xfd: + skip_immediates_simd(pos); + break; default: // no immediates break; } diff --git a/src/WARDuino/internals.h b/src/WARDuino/internals.h index d0f6b263..6c183e46 100644 --- a/src/WARDuino/internals.h +++ b/src/WARDuino/internals.h @@ -54,6 +54,15 @@ typedef struct StackValue { int64_t int64; float f32; double f64; + + union { // TODO: temporary solution for SIMD types + int8_t i8x16[16]; + int16_t i16x8[8]; + int32_t i32x4[4]; + int64_t i64x2[2]; + float f32x4[4]; + double f64x2[2]; + } simd; } value; } StackValue; diff --git a/tests/latch/core/simd_splat_0.asserts.wast b/tests/latch/core/simd_splat_0.asserts.wast new file mode 100644 index 00000000..711716a1 --- /dev/null +++ b/tests/latch/core/simd_splat_0.asserts.wast @@ -0,0 +1,107 @@ +(assert_return (invoke "i8x16.splat" (i32.const 0)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "i8x16.splat" (i32.const 5)) (v128.const i8x16 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5)) +(assert_return (invoke "i8x16.splat" (i32.const -5)) (v128.const i8x16 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5)) +(assert_return (invoke "i8x16.splat" (i32.const 257)) (v128.const i8x16 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)) +(assert_return (invoke "i8x16.splat" (i32.const 0xff)) (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) +(assert_return (invoke "i8x16.splat" (i32.const -128)) (v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128)) 
+(assert_return (invoke "i8x16.splat" (i32.const 127)) (v128.const i8x16 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127)) +(assert_return (invoke "i8x16.splat" (i32.const -129)) (v128.const i8x16 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127)) +(assert_return (invoke "i8x16.splat" (i32.const 128)) (v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128)) +(assert_return (invoke "i8x16.splat" (i32.const 0xff7f)) (v128.const i8x16 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f)) +(assert_return (invoke "i8x16.splat" (i32.const 0x80)) (v128.const i8x16 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80)) +(assert_return (invoke "i8x16.splat" (i32.const 0xAB)) (v128.const i32x4 0xABABABAB 0xABABABAB 0xABABABAB 0xABABABAB)) + +(assert_return (invoke "i16x8.splat" (i32.const 0)) (v128.const i16x8 0 0 0 0 0 0 0 0)) +(assert_return (invoke "i16x8.splat" (i32.const 5)) (v128.const i16x8 5 5 5 5 5 5 5 5)) +(assert_return (invoke "i16x8.splat" (i32.const -5)) (v128.const i16x8 -5 -5 -5 -5 -5 -5 -5 -5)) +(assert_return (invoke "i16x8.splat" (i32.const 65537)) (v128.const i16x8 1 1 1 1 1 1 1 1)) +(assert_return (invoke "i16x8.splat" (i32.const 0xffff)) (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) +(assert_return (invoke "i16x8.splat" (i32.const -32768)) (v128.const i16x8 -32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768)) +(assert_return (invoke "i16x8.splat" (i32.const 32767)) (v128.const i16x8 32767 32767 32767 32767 32767 32767 32767 32767)) +(assert_return (invoke "i16x8.splat" (i32.const -32769)) (v128.const i16x8 32767 32767 32767 32767 32767 32767 32767 32767)) +(assert_return (invoke "i16x8.splat" (i32.const 32768)) (v128.const i16x8 -32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768)) +(assert_return (invoke "i16x8.splat" (i32.const 0xffff7fff)) (v128.const i16x8 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff)) 
+(assert_return (invoke "i16x8.splat" (i32.const 0x8000)) (v128.const i16x8 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000)) +(assert_return (invoke "i16x8.splat" (i32.const 0xABCD)) (v128.const i32x4 0xABCDABCD 0xABCDABCD 0xABCDABCD 0xABCDABCD)) +(assert_return (invoke "i16x8.splat" (i32.const 012345)) (v128.const i16x8 012_345 012_345 012_345 012_345 012_345 012_345 012_345 012_345)) +(assert_return (invoke "i16x8.splat" (i32.const 0x01234)) (v128.const i16x8 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234)) + +(assert_return (invoke "i32x4.splat" (i32.const 0)) (v128.const i32x4 0 0 0 0)) +(assert_return (invoke "i32x4.splat" (i32.const 5)) (v128.const i32x4 5 5 5 5)) +(assert_return (invoke "i32x4.splat" (i32.const -5)) (v128.const i32x4 -5 -5 -5 -5)) +(assert_return (invoke "i32x4.splat" (i32.const 0xffffffff)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke "i32x4.splat" (i32.const 4294967295)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke "i32x4.splat" (i32.const -2147483648)) (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)) +(assert_return (invoke "i32x4.splat" (i32.const 2147483647)) (v128.const i32x4 0x7fffffff 0x7fffffff 0x7fffffff 0x7fffffff)) +(assert_return (invoke "i32x4.splat" (i32.const 2147483648)) (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)) +(assert_return (invoke "i32x4.splat" (i32.const 01234567890)) (v128.const i32x4 012_3456_7890 012_3456_7890 012_3456_7890 012_3456_7890)) +(assert_return (invoke "i32x4.splat" (i32.const 0x012345678)) (v128.const i32x4 0x0_1234_5678 0x0_1234_5678 0x0_1234_5678 0x0_1234_5678)) + +(assert_return (invoke "f32x4.splat" (f32.const 0.0)) (v128.const f32x4 0.0 0.0 0.0 0.0)) +(assert_return (invoke "f32x4.splat" (f32.const 1.1)) (v128.const f32x4 1.1 1.1 1.1 1.1)) +(assert_return (invoke "f32x4.splat" (f32.const -1.1)) (v128.const f32x4 -1.1 -1.1 -1.1 -1.1)) +(assert_return (invoke "f32x4.splat" (f32.const 1e38)) (v128.const f32x4 
1e38 1e38 1e38 1e38)) +(assert_return (invoke "f32x4.splat" (f32.const -1e38)) (v128.const f32x4 -1e38 -1e38 -1e38 -1e38)) +(assert_return (invoke "f32x4.splat" (f32.const 0x1.fffffep127)) (v128.const f32x4 0x1.fffffep127 0x1.fffffep127 0x1.fffffep127 0x1.fffffep127)) +(assert_return (invoke "f32x4.splat" (f32.const -0x1.fffffep127)) (v128.const f32x4 -0x1.fffffep127 -0x1.fffffep127 -0x1.fffffep127 -0x1.fffffep127)) +(assert_return (invoke "f32x4.splat" (f32.const 0x1p127)) (v128.const f32x4 0x1p127 0x1p127 0x1p127 0x1p127)) +(assert_return (invoke "f32x4.splat" (f32.const -0x1p127)) (v128.const f32x4 -0x1p127 -0x1p127 -0x1p127 -0x1p127)) +(assert_return (invoke "f32x4.splat" (f32.const inf)) (v128.const f32x4 inf inf inf inf)) +(assert_return (invoke "f32x4.splat" (f32.const -inf)) (v128.const f32x4 -inf -inf -inf -inf)) +(assert_return (invoke "f32x4.splat" (f32.const nan)) (v128.const f32x4 nan nan nan nan)) +(assert_return (invoke "f32x4.splat" (f32.const nan:0x1)) (v128.const f32x4 nan:0x1 nan:0x1 nan:0x1 nan:0x1)) +(assert_return (invoke "f32x4.splat" (f32.const nan:0x7f_ffff)) (v128.const f32x4 nan:0x7f_ffff nan:0x7f_ffff nan:0x7f_ffff nan:0x7f_ffff)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789)) (v128.const f32x4 0123456789 0123456789 0123456789 0123456789)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789.)) (v128.const f32x4 0123456789. 0123456789. 0123456789. 0123456789.)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF)) (v128.const f32x4 0x0123456789ABCDEF 0x0123456789ABCDEF 0x0123456789ABCDEF 0x0123456789ABCDEF)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF.)) (v128.const f32x4 0x0123456789ABCDEF. 0x0123456789ABCDEF. 0x0123456789ABCDEF. 
0x0123456789ABCDEF.)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789e019)) (v128.const f32x4 0123456789e019 0123456789e019 0123456789e019 0123456789e019)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789.e+019)) (v128.const f32x4 0123456789.e+019 0123456789.e+019 0123456789.e+019 0123456789.e+019)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEFp019)) (v128.const f32x4 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF.p-019)) (v128.const f32x4 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019)) + +(assert_return (invoke "i64x2.splat" (i64.const 0)) (v128.const i64x2 0 0)) +(assert_return (invoke "i64x2.splat" (i64.const -0)) (v128.const i64x2 0 0)) +(assert_return (invoke "i64x2.splat" (i64.const 1)) (v128.const i64x2 1 1)) +(assert_return (invoke "i64x2.splat" (i64.const -1)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const -9223372036854775808)) (v128.const i64x2 -9223372036854775808 -9223372036854775808)) +(assert_return (invoke "i64x2.splat" (i64.const -9223372036854775808)) (v128.const i64x2 9223372036854775808 9223372036854775808)) +(assert_return (invoke "i64x2.splat" (i64.const 9223372036854775807)) (v128.const i64x2 9223372036854775807 9223372036854775807)) +(assert_return (invoke "i64x2.splat" (i64.const 18446744073709551615)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const 0x7fffffffffffffff)) (v128.const i64x2 0x7fffffffffffffff 0x7fffffffffffffff)) +(assert_return (invoke "i64x2.splat" (i64.const 0xffffffffffffffff)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const -0x8000000000000000)) (v128.const i64x2 -0x8000000000000000 -0x8000000000000000)) +(assert_return (invoke "i64x2.splat" (i64.const -0x8000000000000000)) (v128.const i64x2 0x8000000000000000 0x8000000000000000)) 
+(assert_return (invoke "i64x2.splat" (i64.const 01234567890123456789)) (v128.const i64x2 01_234_567_890_123_456_789 01_234_567_890_123_456_789)) +(assert_return (invoke "i64x2.splat" (i64.const 0x01234567890ABcdef)) (v128.const i64x2 0x0_1234_5678_90AB_cdef 0x0_1234_5678_90AB_cdef)) + +(assert_return (invoke "f64x2.splat" (f64.const 0.0)) (v128.const f64x2 0.0 0.0)) +(assert_return (invoke "f64x2.splat" (f64.const -0.0)) (v128.const f64x2 -0.0 -0.0)) +(assert_return (invoke "f64x2.splat" (f64.const 1.1)) (v128.const f64x2 1.1 1.1)) +(assert_return (invoke "f64x2.splat" (f64.const -1.1)) (v128.const f64x2 -1.1 -1.1)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0.0000000000001p-1022)) (v128.const f64x2 0x0.0000000000001p-1022 0x0.0000000000001p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const -0x0.0000000000001p-1022)) (v128.const f64x2 -0x0.0000000000001p-1022 -0x0.0000000000001p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p-1022)) (v128.const f64x2 0x1p-1022 0x1p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p-1022)) (v128.const f64x2 -0x1p-1022 -0x1p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p-1)) (v128.const f64x2 0x1p-1 0x1p-1)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p-1)) (v128.const f64x2 -0x1p-1 -0x1p-1)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p+0)) (v128.const f64x2 0x1p+0 0x1p+0)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p+0)) (v128.const f64x2 -0x1p+0 -0x1p+0)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1.921fb54442d18p+2)) (v128.const f64x2 0x1.921fb54442d18p+2 0x1.921fb54442d18p+2)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1.921fb54442d18p+2)) (v128.const f64x2 -0x1.921fb54442d18p+2 -0x1.921fb54442d18p+2)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1.fffffffffffffp+1023)) (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1.fffffffffffffp+1023)) (v128.const 
f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) +(assert_return (invoke "f64x2.splat" (f64.const inf)) (v128.const f64x2 inf inf)) +(assert_return (invoke "f64x2.splat" (f64.const -inf)) (v128.const f64x2 -inf -inf)) +(assert_return (invoke "f64x2.splat" (f64.const nan)) (v128.const f64x2 nan nan)) +(assert_return (invoke "f64x2.splat" (f64.const -nan)) (v128.const f64x2 -nan -nan)) +(assert_return (invoke "f64x2.splat" (f64.const nan:0x4000000000000)) (v128.const f64x2 nan:0x4000000000000 nan:0x4000000000000)) +(assert_return (invoke "f64x2.splat" (f64.const -nan:0x4000000000000)) (v128.const f64x2 -nan:0x4000000000000 -nan:0x4000000000000)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789)) (v128.const f64x2 0123456789 0123456789)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789.)) (v128.const f64x2 0123456789. 0123456789.)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef)) (v128.const f64x2 0x0123456789ABCDEFabcdef 0x0123456789ABCDEFabcdef)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.)) (v128.const f64x2 0x0123456789ABCDEFabcdef. 
0x0123456789ABCDEFabcdef.)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789e019)) (v128.const f64x2 0123456789e019 0123456789e019)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789e+019)) (v128.const f64x2 0123456789e+019 0123456789e+019)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.p019)) (v128.const f64x2 0x0123456789ABCDEFabcdef.p019 0x0123456789ABCDEFabcdef.p019)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.p-019)) (v128.const f64x2 0x0123456789ABCDEFabcdef.p-019 0x0123456789ABCDEFabcdef.p-019)) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_0.wast b/tests/latch/core/simd_splat_0.wast new file mode 100644 index 00000000..39b3b563 --- /dev/null +++ b/tests/latch/core/simd_splat_0.wast @@ -0,0 +1,8 @@ +(module + (func (export "i8x16.splat") (param i32) (result v128) (i8x16.splat (local.get 0))) + (func (export "i16x8.splat") (param i32) (result v128) (i16x8.splat (local.get 0))) + (func (export "i32x4.splat") (param i32) (result v128) (i32x4.splat (local.get 0))) + (func (export "f32x4.splat") (param f32) (result v128) (f32x4.splat (local.get 0))) + (func (export "i64x2.splat") (param i64) (result v128) (i64x2.splat (local.get 0))) + (func (export "f64x2.splat") (param f64) (result v128) (f64x2.splat (local.get 0))) +) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_1.asserts.wast b/tests/latch/core/simd_splat_1.asserts.wast new file mode 100644 index 00000000..3d892695 --- /dev/null +++ b/tests/latch/core/simd_splat_1.asserts.wast @@ -0,0 +1,5 @@ +(assert_return (invoke "as-v128_store-operand-1" (i32.const 1)) (v128.const i8x16 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)) +(assert_return (invoke "as-v128_store-operand-2" (i32.const 256)) (v128.const i16x8 0x100 0x100 0x100 0x100 0x100 0x100 0x100 0x100)) +(assert_return (invoke "as-v128_store-operand-3" (i32.const 0xffffffff)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke 
"as-v128_store-operand-4" (i64.const 1)) (v128.const i64x2 1 1)) +(assert_return (invoke "as-v128_store-operand-5" (f64.const -0x1p+0)) (v128.const f64x2 -0x1p+0 -0x1p+0)) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_1.wast b/tests/latch/core/simd_splat_1.wast new file mode 100644 index 00000000..bc6be2e8 --- /dev/null +++ b/tests/latch/core/simd_splat_1.wast @@ -0,0 +1,17 @@ +(module (memory 1) + (func (export "as-v128_store-operand-1") (param i32) (result v128) + (v128.store (i32.const 0) (i8x16.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-2") (param i32) (result v128) + (v128.store (i32.const 0) (i16x8.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-3") (param i32) (result v128) + (v128.store (i32.const 0) (i32x4.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-4") (param i64) (result v128) + (v128.store (i32.const 0) (i64x2.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-5") (param f64) (result v128) + (v128.store (i32.const 0) (f64x2.splat (local.get 0))) + (v128.load (i32.const 0))) +) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_2.asserts.wast b/tests/latch/core/simd_splat_2.asserts.wast new file mode 100644 index 00000000..6e7f0fe3 --- /dev/null +++ b/tests/latch/core/simd_splat_2.asserts.wast @@ -0,0 +1,51 @@ +(assert_return (invoke "as-i8x16_extract_lane_s-operand-first" (i32.const 42)) (i32.const 42)) +(assert_return (invoke "as-i8x16_extract_lane_s-operand-last" (i32.const -42)) (i32.const -42)) +(assert_return (invoke "as-i16x8_extract_lane_s-operand-first" (i32.const 0xffff7fff)) (i32.const 32767)) +(assert_return (invoke "as-i16x8_extract_lane_s-operand-last" (i32.const 0x8000)) (i32.const -32768)) +(assert_return (invoke "as-i32x4_extract_lane_s-operand-first" (i32.const 0x7fffffff)) (i32.const 2147483647)) +(assert_return (invoke 
"as-i32x4_extract_lane_s-operand-last" (i32.const 0x80000000)) (i32.const -2147483648)) +(assert_return (invoke "as-f32x4_extract_lane_s-operand-first" (f32.const 1.5)) (f32.const 1.5)) +(assert_return (invoke "as-f32x4_extract_lane_s-operand-last" (f32.const -0.25)) (f32.const -0.25)) +(assert_return (invoke "as-v8x16_swizzle-operands" (i32.const 1) (i32.const -1)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i64x2_extract_lane-operand-last" (i64.const -42)) (i64.const -42)) +(assert_return (invoke "as-i64x2_extract_lane-operand-first" (i64.const 42)) (i64.const 42)) +(assert_return (invoke "as-f64x2_extract_lane-operand-first" (f64.const 1.5)) (f64.const 1.5)) +(assert_return (invoke "as-f64x2_extract_lane-operand-last" (f64.const -0x1p+0)) (f64.const -0x1p+0)) + +(assert_return (invoke "as-i8x16_add_sub-operands" (i32.const 3) (i32.const 2) (i32.const 1)) (v128.const i8x16 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4)) +(assert_return (invoke "as-i16x8_add_sub_mul-operands" (i32.const 257) (i32.const 128) (i32.const 16) (i32.const 16)) (v128.const i16x8 129 129 129 129 129 129 129 129)) +(assert_return (invoke "as-i32x4_add_sub_mul-operands" (i32.const 65535) (i32.const 65537) (i32.const 256) (i32.const 256)) (v128.const i32x4 0x10000 0x10000 0x10000 0x10000)) +(assert_return (invoke "as-i64x2_add_sub_mul-operands" (i64.const 0x7fffffff) (i64.const 0x1_0000_0001) (i64.const 65536) (i64.const 65536)) (v128.const i64x2 0x8000_0000 0x8000_0000)) +(assert_return (invoke "as-f64x2_add_sub_mul-operands" (f64.const 0x1p-1) (f64.const 0.75) (f64.const 0x1p-1) (f64.const 0.5)) (v128.const f64x2 0x1p+0 0x1p+0)) + +(assert_return (invoke "as-i8x16_add_sat_s-operands" (i32.const 0x7f) (i32.const 1)) (v128.const i8x16 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f)) +(assert_return (invoke "as-i16x8_add_sat_s-operands" (i32.const 0x7fff) (i32.const 1)) (v128.const i16x8 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 
0x7fff)) +(assert_return (invoke "as-i8x16_sub_sat_u-operands" (i32.const 0x7f) (i32.const 0xff)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i16x8_sub_sat_u-operands" (i32.const 0x7fff) (i32.const 0xffff)) (v128.const i16x8 0 0 0 0 0 0 0 0)) + +(assert_return (invoke "as-i8x16_shr_s-operand" (i32.const 0xf0) (i32.const 3)) (v128.const i8x16 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2)) +(assert_return (invoke "as-i16x8_shr_s-operand" (i32.const 0x100) (i32.const 4)) (v128.const i16x8 16 16 16 16 16 16 16 16)) +(assert_return (invoke "as-i32x4_shr_s-operand" (i32.const -1) (i32.const 16)) (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "as-v128_and-operands" (i32.const 0x11) (i32.const 0xff)) (v128.const i8x16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17)) +(assert_return (invoke "as-v128_or-operands" (i32.const 0) (i32.const 0xffff)) (v128.const i16x8 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff)) +(assert_return (invoke "as-v128_xor-operands" (i32.const 0xf0f0f0f0) (i32.const 0xffffffff)) (v128.const i32x4 0xf0f0f0f 0xf0f0f0f 0xf0f0f0f 0xf0f0f0f)) + +(assert_return (invoke "as-i8x16_all_true-operand" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "as-i16x8_all_true-operand" (i32.const 0xffff)) (i32.const 1)) +(assert_return (invoke "as-i32x4_all_true-operand1" (i32.const 0xf0f0f0f0)) (i32.const 1)) +(assert_return (invoke "as-i32x4_all_true-operand2" (i64.const -1)) (i32.const 1)) + +(assert_return (invoke "as-i8x16_eq-operands" (i32.const 1) (i32.const 2)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i16x8_eq-operands" (i32.const -1) (i32.const 65535)) (v128.const i16x8 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff)) +(assert_return (invoke "as-i32x4_eq-operands1" (i32.const -1) (i32.const 0xffffffff)) (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)) +(assert_return (invoke "as-f32x4_eq-operands" (f32.const +0.0) (f32.const -0.0)) 
(v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)) +(assert_return (invoke "as-i32x4_eq-operands2" (i64.const 1) (i64.const 2)) (v128.const i64x2 0xffffffff00000000 0xffffffff00000000)) +(assert_return (invoke "as-f64x2_eq-operands" (f64.const +0.0) (f64.const -0.0)) (v128.const i64x2 -1 -1)) + +(assert_return (invoke "as-f32x4_abs-operand" (f32.const -1.125)) (v128.const f32x4 1.125 1.125 1.125 1.125)) +(assert_return (invoke "as-f32x4_min-operands" (f32.const 0.25) (f32.const 1e-38)) (v128.const f32x4 1e-38 1e-38 1e-38 1e-38)) +(assert_return (invoke "as-f32x4_div-operands" (f32.const 1.0) (f32.const 8.0)) (v128.const f32x4 0.125 0.125 0.125 0.125)) + +(assert_return (invoke "as-f32x4_convert_s_i32x4-operand" (i32.const 12345)) (v128.const f32x4 12345.0 12345.0 12345.0 12345.0)) +(assert_return (invoke "as-i32x4_trunc_s_f32x4_sat-operand" (f32.const 1.1)) (v128.const i32x4 1 1 1 1)) diff --git a/tests/latch/core/simd_splat_2.wast b/tests/latch/core/simd_splat_2.wast new file mode 100644 index 00000000..819ef5c0 --- /dev/null +++ b/tests/latch/core/simd_splat_2.wast @@ -0,0 +1,119 @@ +(module + ;; Accessing lane + (func (export "as-i8x16_extract_lane_s-operand-first") (param i32) (result i32) + (i8x16.extract_lane_s 0 (i8x16.splat (local.get 0)))) + (func (export "as-i8x16_extract_lane_s-operand-last") (param i32) (result i32) + (i8x16.extract_lane_s 15 (i8x16.splat (local.get 0)))) + (func (export "as-i16x8_extract_lane_s-operand-first") (param i32) (result i32) + (i16x8.extract_lane_s 0 (i16x8.splat (local.get 0)))) + (func (export "as-i16x8_extract_lane_s-operand-last") (param i32) (result i32) + (i16x8.extract_lane_s 7 (i16x8.splat (local.get 0)))) + (func (export "as-i32x4_extract_lane_s-operand-first") (param i32) (result i32) + (i32x4.extract_lane 0 (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_extract_lane_s-operand-last") (param i32) (result i32) + (i32x4.extract_lane 3 (i32x4.splat (local.get 0)))) + (func (export 
"as-f32x4_extract_lane_s-operand-first") (param f32) (result f32) + (f32x4.extract_lane 0 (f32x4.splat (local.get 0)))) + (func (export "as-f32x4_extract_lane_s-operand-last") (param f32) (result f32) + (f32x4.extract_lane 3 (f32x4.splat (local.get 0)))) + (func (export "as-v8x16_swizzle-operands") (param i32) (param i32) (result v128) + (i8x16.swizzle (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i64x2_extract_lane-operand-first") (param i64) (result i64) + (i64x2.extract_lane 0 (i64x2.splat (local.get 0)))) + (func (export "as-i64x2_extract_lane-operand-last") (param i64) (result i64) + (i64x2.extract_lane 1 (i64x2.splat (local.get 0)))) + (func (export "as-f64x2_extract_lane-operand-first") (param f64) (result f64) + (f64x2.extract_lane 0 (f64x2.splat (local.get 0)))) + (func (export "as-f64x2_extract_lane-operand-last") (param f64) (result f64) + (f64x2.extract_lane 1 (f64x2.splat (local.get 0)))) + + ;; Integer arithmetic + (func (export "as-i8x16_add_sub-operands") (param i32 i32 i32) (result v128) + (i8x16.add (i8x16.splat (local.get 0)) + (i8x16.sub (i8x16.splat (local.get 1)) (i8x16.splat (local.get 2))))) + (func (export "as-i16x8_add_sub_mul-operands") (param i32 i32 i32 i32) (result v128) + (i16x8.add (i16x8.splat (local.get 0)) + (i16x8.sub (i16x8.splat (local.get 1)) + (i16x8.mul (i16x8.splat (local.get 2)) (i16x8.splat (local.get 3)))))) + (func (export "as-i32x4_add_sub_mul-operands") (param i32 i32 i32 i32) (result v128) + (i32x4.add (i32x4.splat (local.get 0)) + (i32x4.sub (i32x4.splat (local.get 1)) + (i32x4.mul (i32x4.splat (local.get 2)) (i32x4.splat (local.get 3)))))) + + (func (export "as-i64x2_add_sub_mul-operands") (param i64 i64 i64 i64) (result v128) + (i64x2.add (i64x2.splat (local.get 0)) + (i64x2.sub (i64x2.splat (local.get 1)) + (i64x2.mul (i64x2.splat (local.get 2)) (i64x2.splat (local.get 3)))))) + (func (export "as-f64x2_add_sub_mul-operands") (param f64 f64 f64 f64) (result v128) + (f64x2.add 
(f64x2.splat (local.get 0)) + (f64x2.sub (f64x2.splat (local.get 1)) + (f64x2.mul (f64x2.splat (local.get 2)) (f64x2.splat (local.get 3)))))) + + ;; Saturating integer arithmetic + (func (export "as-i8x16_add_sat_s-operands") (param i32 i32) (result v128) + (i8x16.add_sat_s (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_add_sat_s-operands") (param i32 i32) (result v128) + (i16x8.add_sat_s (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-i8x16_sub_sat_u-operands") (param i32 i32) (result v128) + (i8x16.sub_sat_u (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_sub_sat_u-operands") (param i32 i32) (result v128) + (i16x8.sub_sat_u (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + + ;; Bit shifts + (func (export "as-i8x16_shr_s-operand") (param i32 i32) (result v128) + (i8x16.shr_s (i8x16.splat (local.get 0)) (local.get 1))) + (func (export "as-i16x8_shr_s-operand") (param i32 i32) (result v128) + (i16x8.shr_s (i16x8.splat (local.get 0)) (local.get 1))) + (func (export "as-i32x4_shr_s-operand") (param i32 i32) (result v128) + (i32x4.shr_s (i32x4.splat (local.get 0)) (local.get 1))) + + ;; Bitwise operantions + (func (export "as-v128_and-operands") (param i32 i32) (result v128) + (v128.and (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-v128_or-operands") (param i32 i32) (result v128) + (v128.or (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-v128_xor-operands") (param i32 i32) (result v128) + (v128.xor (i32x4.splat (local.get 0)) (i32x4.splat (local.get 1)))) + + ;; Boolean horizontal reductions + (func (export "as-i8x16_all_true-operand") (param i32) (result i32) + (i8x16.all_true (i8x16.splat (local.get 0)))) + (func (export "as-i16x8_all_true-operand") (param i32) (result i32) + (i16x8.all_true (i16x8.splat (local.get 0)))) + (func (export "as-i32x4_all_true-operand1") (param i32) (result i32) + 
(i32x4.all_true (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_all_true-operand2") (param i64) (result i32) + (i32x4.all_true (i64x2.splat (local.get 0)))) + + ;; Comparisons + (func (export "as-i8x16_eq-operands") (param i32 i32) (result v128) + (i8x16.eq (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_eq-operands") (param i32 i32) (result v128) + (i16x8.eq (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-i32x4_eq-operands1") (param i32 i32) (result v128) + (i32x4.eq (i32x4.splat (local.get 0)) (i32x4.splat (local.get 1)))) + (func (export "as-i32x4_eq-operands2") (param i64 i64) (result v128) + (i32x4.eq (i64x2.splat (local.get 0)) (i64x2.splat (local.get 1)))) + (func (export "as-f32x4_eq-operands") (param f32 f32) (result v128) + (f32x4.eq (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + (func (export "as-f64x2_eq-operands") (param f64 f64) (result v128) + (f64x2.eq (f64x2.splat (local.get 0)) (f64x2.splat (local.get 1)))) + + ;; Floating-point sign bit operations + (func (export "as-f32x4_abs-operand") (param f32) (result v128) + (f32x4.abs (f32x4.splat (local.get 0)))) + + ;; Floating-point min + (func (export "as-f32x4_min-operands") (param f32 f32) (result v128) + (f32x4.min (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + + ;; Floating-point arithmetic + (func (export "as-f32x4_div-operands") (param f32 f32) (result v128) + (f32x4.div (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + + ;; Conversions + (func (export "as-f32x4_convert_s_i32x4-operand") (param i32) (result v128) + (f32x4.convert_i32x4_s (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_trunc_s_f32x4_sat-operand") (param f32) (result v128) + (i32x4.trunc_sat_f32x4_s (f32x4.splat (local.get 0)))) +) \ No newline at end of file diff --git a/tests/latch/latch-0.3.0.tgz b/tests/latch/latch-0.3.0.tgz index b916137f..9f1913bd 100644 Binary files a/tests/latch/latch-0.3.0.tgz and 
b/tests/latch/latch-0.3.0.tgz differ diff --git a/tests/latch/package-lock.json b/tests/latch/package-lock.json index 707ca38d..f327001d 100644 --- a/tests/latch/package-lock.json +++ b/tests/latch/package-lock.json @@ -276,9 +276,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.5.5", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.5.tgz", - "integrity": "sha512-Xjs4y5UPO/CLdzpgR6GirZJx36yScjh73+2NlLlkFRSoQN8B0DpfXPdZGnvVmLRLOsqDpOfTNv7D9trgGhmOIA==", + "version": "22.7.5", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.5.tgz", + "integrity": "sha512-jML7s2NAzMWc//QSJ1a3prpk78cOPchGvXJsC3C6R6PSMoooztvRVQEz89gmBTBY1SPMaqo5teB4uNHPdetShQ==", "dev": true, "license": "MIT", "peer": true, @@ -699,7 +699,7 @@ "node_modules/latch": { "version": "0.3.0", "resolved": "file:latch-0.3.0.tgz", - "integrity": "sha512-4ZVbuSUb5lmSvcNWfGDnxrOUoFq7uTL62CvfglTunjPig5r5f0daTWd76CwdylxQobInUH3RVXSx8qIZ1HjTmA==", + "integrity": "sha512-ve2IHiwGMEVRONgAeIQPW8mEkcHJqRC5t7lnPExehk9uLBW6VQdlGV+GxKNdTTANdsnYjz/dsLPfTcVOWLdFrA==", "dev": true, "dependencies": { "ansi-colors": "^4.1.3", @@ -707,9 +707,6 @@ "ora": "^8.0.1", "source-map": "^0.7.4", "ts-node": "^10.5.0" - }, - "bin": { - "latch": "npx ts-node" } }, "node_modules/leven": { diff --git a/tests/latch/src/spec.test.ts b/tests/latch/src/spec.test.ts index ceb4a6c5..1320c228 100644 --- a/tests/latch/src/spec.test.ts +++ b/tests/latch/src/spec.test.ts @@ -68,7 +68,6 @@ spec.tests(tests); framework.run([spec]); // Helper function - function createTest(module: string, asserts: string[]): TestScenario { const steps: Step[] = []; diff --git a/tests/latch/src/util/spec.util.ts b/tests/latch/src/util/spec.util.ts index 07f034a1..8544549b 100644 --- a/tests/latch/src/util/spec.util.ts +++ b/tests/latch/src/util/spec.util.ts @@ -5,6 +5,88 @@ interface Cursor { value: number; } +function float32HexStr(x: number): string { + const ab = new ArrayBuffer(4); + const fb = new 
Float32Array(ab); + fb[0] = x; + const ui8 = new Uint8Array(ab); + let res = ''; + for (let i = 3; i >= 0; i--) { + res += ui8[i].toString(16).padStart(2, '0'); + } + return res; +} + +function float64HexStr(x: number): string { + const ab = new ArrayBuffer(8); + const fb = new Float64Array(ab); + fb[0] = x; + const ui8 = new Uint8Array(ab); + let res = ''; + for (let i = 7; i >= 0; i--) { + res += ui8[i].toString(16).padStart(2, '0'); + } + return res; +} + +function parseWasmFloat32(str: string) { + const strBuf = str.replace(/_/gi, ''); // remove those damned underscores + + const flt = parseHexFloat(strBuf); + const res = float32HexStr(flt); + return res; +} + +function parseWasmFloat64(str: string): string|undefined { + const strBuf = str.replace(/_/gi, ''); // remove those damned underscores + + // just use hexFloat - it works for 64-bit floats + const flt = parseHexFloat(strBuf); + const res = float64HexStr(flt); + return res; +} + +function parseV128(type: string, args: string[]): string | undefined { + const int_lambda = (bit_width: number, mask: bigint): string|undefined => { + const elems = 128 / bit_width; + const pad_len = bit_width / 4; + + if(args.length !== elems) return undefined; + const res = args + .map(str => str.replace(/_/gi, '')) // WASM allows _ in numbers, TS doesn't like those + .map(str => { + let start_idx = 0; + let sign = 1; + if(str.startsWith('-')) { sign = -1; start_idx = 1; } + else if(str.startsWith('+')) { start_idx = 1; } + return BigInt(sign) * BigInt(str.slice(start_idx)) + }) // parse to (big)-int + .map(num => num & mask) // ensure correct bit width + .map(num => num.toString(16).padStart(pad_len, '0')) // convert to hex + .reduce((acc, val) => acc + val, ''); // concat + return res; + }; + + const float_lambda = (elem_count: number, parser: (s: string) => string|undefined): string|undefined => { + if(args.length !== elem_count) return undefined; + const parsed = args.map(str => parser(str)); + if(parsed.some(str => str === 
undefined)) return undefined; + return parsed.reduce((acc, val) => acc + (val as string), ''); + } + + switch(type) { + case 'i8x16': return int_lambda(8, 0x00000000000000ffn); + case 'i16x8': return int_lambda(16, 0x000000000000ffffn); + case 'i32x4': return int_lambda(32, 0x00000000ffffffffn); + case 'i64x2': return int_lambda(64, 0xffffffffffffffffn); + + case 'f32x4': return float_lambda(4, parseWasmFloat32); + case 'f64x2': return float_lambda(2, parseWasmFloat64); + + default: return undefined; + } +} + export function parseResult(input: string): WASM.Value | undefined { let cursor = 0; let delta: number = consume(input, cursor, /\(/d); @@ -22,6 +104,9 @@ export function parseResult(input: string): WASM.Value | undefined { delta = consume(input, cursor, /^[^)]*/d); if (type === WASM.Type.f32 || type === WASM.Type.f64) { value = parseHexFloat(input.slice(cursor, cursor + delta)); + } else if(type === WASM.Type.v128) { + const slice = input.slice(cursor, cursor + delta).split(' ').filter(x => x.trim() !== ''); // [dim, arg1, ...] + value = parseV128(slice[0], slice.slice(1)); } else { value = parseInteger(input.slice(cursor, cursor + delta)); } @@ -49,7 +134,7 @@ export function parseArguments(input: string, index: Cursor): WASM.Value[] { cursor += delta + consume(input, cursor + delta, /^[^)]*const /d); delta = consume(input, cursor, /^[^)]*/d); - let maybe: number | undefined; + let maybe: number | bigint | undefined; if (type === WASM.Type.f32 || type === WASM.Type.f64) { maybe = parseHexFloat(input.slice(cursor, cursor + delta)); } else { @@ -106,6 +191,10 @@ function parseHexFloat(input: string): number { return Infinity; } + if (input.includes('e')) { + return parseFloat(input); + } + const radix: number = input.includes('0x') ? 
16 : 10; let base: string = input, mantissa, exponent = 0; @@ -127,16 +216,12 @@ function parseHexFloat(input: string): number { return mantissa * Math.pow(2, exponent); } -function parseInteger(hex: string, bytes: number = 4): number { - if (!hex.includes('0x')) { - return parseInt(hex); +function parseInteger(hexU: string, bytes: number = 4): bigint { + const hex = hexU.replace(/_/g, ''); + if(hex.startsWith('-')) { + return BigInt(-1) * BigInt(hex.slice(1)); } - const mask = parseInt('0x80' + '00'.repeat(bytes - 1), 16); - let integer = parseInt(hex, 16); - if (integer >= mask) { - integer = integer - mask * 2; - } - return integer; + return BigInt(hex); } export function find(regex: RegExp, input: string) { @@ -145,4 +230,4 @@ export function find(regex: RegExp, input: string) { return ''; } return match[1]; -} \ No newline at end of file +} diff --git a/tutorials/c/.gitignore b/tutorials/c/.gitignore new file mode 100644 index 00000000..3d758890 --- /dev/null +++ b/tutorials/c/.gitignore @@ -0,0 +1 @@ +!Makefile \ No newline at end of file diff --git a/tutorials/c/Makefile b/tutorials/c/Makefile new file mode 100644 index 00000000..87d87599 --- /dev/null +++ b/tutorials/c/Makefile @@ -0,0 +1,9 @@ +all: add.wasm matmul.wasm +.PHONY: all + +%.wasm: %.c Makefile + clang --target=wasm32 -msimd128 -O3 -flto -nostdlib -Wl,--no-entry -Wl,--export-all -o $@ $< + +clean: + rm -f *.wasm +.PHONY: clean \ No newline at end of file diff --git a/tutorials/c/add.c b/tutorials/c/add.c new file mode 100644 index 00000000..46bc7366 --- /dev/null +++ b/tutorials/c/add.c @@ -0,0 +1,18 @@ +// +// Created by jay on 10/10/24. 
+// + +#include "wasm_simd128.h" + +// static unsigned char *memory; // 1 WASM page + +__attribute__((import_module("env"), import_name("print_int"))) +extern void print_int(int value); + +typedef v128_t v128; + +__attribute__((noinline)) +int add(const int a, const int b) { return a * a + b; } + +__attribute__((export_name("main"))) +void _start() { print_int(add(4, 2)); } \ No newline at end of file diff --git a/tutorials/c/matmul.c b/tutorials/c/matmul.c new file mode 100644 index 00000000..9baaf97b --- /dev/null +++ b/tutorials/c/matmul.c @@ -0,0 +1,140 @@ +// +// Created by jay on 10/10/24. +// +#include "stdint.h" +#include "wasm_simd128.h" + +static unsigned char *memory[1024]; + +__attribute__((import_module("env"), import_name("print_int"))) +extern void print_int(int value); + +/** + * Computes a * b + c (lane-wise) + */ +__attribute__((noinline)) +v128_t vmlaq_u16(const v128_t a, const v128_t b, const v128_t c) { + return wasm_i16x8_add(wasm_i16x8_mul(a, b), c); +} + +/** + * Computes (a >> shift) + b (lane-wise) + */ +__attribute__((noinline)) +v128_t vsraq_n_u16(const v128_t a, const v128_t b, const unsigned int shift) { + return wasm_i16x8_add(b, wasm_i16x8_shr(a, shift)); +} + +/** + * Computes the sum of all lanes in a vector + */ +__attribute__((noinline)) +short vaddvq_u16(v128_t v) { +#define SWIZZLE_MASK0 wasm_i8x16_const(0x4, 0x5, 0x6, 0x7, -1, -1, -1, -1, 0xc, 0xd, 0xe, 0xf, -1, -1, -1, -1) +#define SWIZZLE_MASK1 wasm_i8x16_const(0x8, 0x9, 0xa, 0xb, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) + + v = wasm_i32x4_extadd_pairwise_i16x8(v); + v = wasm_i32x4_add(v, wasm_i8x16_swizzle(v, SWIZZLE_MASK0)); + v = wasm_i32x4_add(v, wasm_i8x16_swizzle(v, SWIZZLE_MASK1)); + return wasm_i32x4_extract_lane(v, 0); +} + +__attribute__((noinline)) +void print_v128(const v128_t v) { + print_int(wasm_i16x8_extract_lane(v, 0)); + print_int(wasm_i16x8_extract_lane(v, 1)); + print_int(wasm_i16x8_extract_lane(v, 2)); + print_int(wasm_i16x8_extract_lane(v, 3)); + 
print_int(wasm_i16x8_extract_lane(v, 4)); + print_int(wasm_i16x8_extract_lane(v, 5)); + print_int(wasm_i16x8_extract_lane(v, 6)); + print_int(wasm_i16x8_extract_lane(v, 7)); +} + +__attribute__((noinline)) +v128_t vmovq_n_u16(const unsigned short n) { + return wasm_i16x8_splat(n); +} + +__attribute__((noinline)) +v128_t vld1q_u16(const unsigned short *ptr) { + return wasm_v128_load(ptr); +} + +__attribute__((noinline)) +v128_t vandq_u16(const v128_t a, const v128_t b) { + return wasm_v128_and(a, b); +} + +unsigned short small_dot(const unsigned short w, const unsigned short a) { + return ((w * a) & 0x0f00) >> 8; +} + +unsigned short small_vec_dot(const v128_t w, const v128_t a) { + return vaddvq_u16(wasm_i16x8_shr(vandq_u16(wasm_i16x8_mul(w, a), wasm_i16x8_splat(0x0f00)), 8)); +} + +unsigned short small_vec_dot_ptr(const unsigned short *w, const unsigned short *a) { + v128_t local = vmovq_n_u16(0); + v128_t sum = vmovq_n_u16(0); + const v128_t v_w = vld1q_u16(w); + const v128_t v_a = vld1q_u16(a); + local = vmlaq_u16(v_w, v_a, local); + local = vandq_u16(local, wasm_i16x8_splat(0x0f00)); + sum = vsraq_n_u16(local, sum, 8); + return vaddvq_u16(sum); +} + +__attribute__((noinline)) +int dot(const unsigned short *w, const unsigned short *a, const int k) { + // expect w=1, a=1, k%(depth*iter*8)=0 +#define DEPTH 2 +#define ITER 2 +#define MASK vmovq_n_u16(0x0f00) + // expect k % 32 == 0 + + v128_t sum = vmovq_n_u16(0); + for(int i = 0; i < k; i += 8 * DEPTH * ITER) { + v128_t local = vmovq_n_u16(0); + for(int j = 0; j < ITER; j++) { + v128_t vecw = vld1q_u16(w); + v128_t veca = vld1q_u16(a); + local = vmlaq_u16(vecw, veca, local); + w += 8; + a += 8; + } + local = vandq_u16(local, MASK); + sum = vsraq_n_u16(local, sum, 8); + } + return vaddvq_u16(sum); +} + +__attribute__((export_name("main"))) +void _start() { + const unsigned short packed_w[16] = { + 0x0001, 0x0001, 0x0001, 0x0001, 0x0000, 0x0101, 0x0202, 0x0303, + 0x0302, 0x0100, 0x0201, 0x0001, 0x0201, 0x0201, 
0x0201, 0x0201 + }; + + const unsigned short packed_a[16] = { + 0x0100, 0x0002, 0x0201, 0x0100, 0x0100, 0x0200, 0x0300, 0x0000, + 0x0100, 0x0200, 0x0300, 0x0400, 0x0001, 0x0001, 0x0001, 0x0001 + }; + + print_int(dot(packed_w, packed_a, 32)); // OK! + + // print_int(small_dot(0x0302, 0x0104)); // OK! + + // print_int(small_vec_dot( + // wasm_i16x8_const(0x0001, 0x0203, 0x0001, 0x0203, 0x0302, 0x0100, 0x0302, 0x0100), + // wasm_i16x8_const(0x0001, 0x0001, 0x0001, 0x0001, 0x0201, 0x0201, 0x0201, 0x0201) + // )); // OK! + + // const unsigned short w[16] = { + // 0x0001, 0x0203, 0x0001, 0x0203, 0x0302, 0x0100, 0x0302, 0x0100 + // }; + // const unsigned short a[16] = { + // 0x0001, 0x0001, 0x0001, 0x0001, 0x0201, 0x0201, 0x0201, 0x0201 + // }; + // print_int(small_vec_dot_ptr(w, a)); // OK! +} \ No newline at end of file diff --git a/tutorials/wat/main/simd.wat b/tutorials/wat/main/simd.wat new file mode 100644 index 00000000..8bcc1fd3 --- /dev/null +++ b/tutorials/wat/main/simd.wat @@ -0,0 +1,169 @@ +(module + (import "env" "print_int" (func $print.int (type $i32->void))) + (type $void->void (func)) + (type $i32->void (func (param i32))) + (type $v128->v128->v128->v128 (func (param v128) (param v128) (param v128) (result v128))) + (type $v128->v128->i32->v128 (func (param v128) (param v128) (param i32) (result v128))) + (type $v128->i16 (func (param v128) (result i32))) + (type $v128->void (func (param v128))) + (memory 1) + + (func $vmlaq_u16 (type $v128->v128->v128->v128) + (i16x8.mul (local.get 0) (local.get 1)) + (i16x8.add (local.get 2)) + ) + + (func $vsraq_n_u16 (type $v128->v128->i32->v128) + (i16x8.shr_s (local.get 0) (local.get 2)) + (i16x8.add (local.get 1)) + ) + + (func $print.v128 (type $v128->void) + local.get 0 + i16x8.extract_lane_s 0 + call $print.int + + local.get 0 + i16x8.extract_lane_s 1 + call $print.int + + local.get 0 + i16x8.extract_lane_s 2 + call $print.int + + local.get 0 + i16x8.extract_lane_s 3 + call $print.int + + local.get 0 + 
i16x8.extract_lane_s 4 + call $print.int + + local.get 0 + i16x8.extract_lane_s 5 + call $print.int + + local.get 0 + i16x8.extract_lane_s 6 + call $print.int + + local.get 0 + i16x8.extract_lane_s 7 + call $print.int + ) + + (func $print.v128.i8x16 (type $v128->void) + local.get 0 + i8x16.extract_lane_s 0 + call $print.int + + local.get 0 + i8x16.extract_lane_s 1 + call $print.int + + local.get 0 + i8x16.extract_lane_s 2 + call $print.int + + local.get 0 + i8x16.extract_lane_s 3 + call $print.int + + local.get 0 + i8x16.extract_lane_s 4 + call $print.int + + local.get 0 + i8x16.extract_lane_s 5 + call $print.int + + local.get 0 + i8x16.extract_lane_s 6 + call $print.int + + local.get 0 + i8x16.extract_lane_s 7 + call $print.int + + local.get 0 + i8x16.extract_lane_s 8 + call $print.int + + local.get 0 + i8x16.extract_lane_s 9 + call $print.int + + local.get 0 + i8x16.extract_lane_s 10 + call $print.int + + local.get 0 + i8x16.extract_lane_s 11 + call $print.int + + local.get 0 + i8x16.extract_lane_s 12 + call $print.int + + local.get 0 + i8x16.extract_lane_s 13 + call $print.int + + local.get 0 + i8x16.extract_lane_s 14 + call $print.int + + local.get 0 + i8x16.extract_lane_s 15 + call $print.int + ) + + (func $vaddvq_u16 (type $v128->i16) + local.get 0 + i32x4.extadd_pairwise_i16x8_u + local.tee 0 + i32x4.extract_lane 0 + local.get 0 + i32x4.extract_lane 1 + local.get 0 + i32x4.extract_lane 2 + local.get 0 + i32x4.extract_lane 3 + i32.add + i32.add + i32.add + ) + + (func $vaddvq_u16.swizzle (type $v128->i16) + local.get 0 + i32x4.extadd_pairwise_i16x8_u + local.tee 0 + v128.const i8x16 0x4 0x5 0x6 0x7 0xc 0xd 0xe 0xf 0x8 0x9 0xa 0xb -1 -1 -1 -1 + i8x16.swizzle + i32x4.add + i32x4.extract_lane 0 +) + + (; (func $vaddvq_u16.swizzle (type $v128->i16) ;) + (; local.get 0 ;) + (; i32x4.extadd_pairwise_i16x8_u ;) + (; local.tee 0 ;) + (; v128.const i8x16 0x4 0x5 0x6 0x7 -1 -1 -1 -1 0xc 0xd 0xe 0xf -1 -1 -1 -1 ;) + (; i8x16.swizzle ;) + (; local.get 0 ;) + (; i32x4.add 
;) + (; local.tee 0 ;) + (; v128.const i8x16 0x8 0x9 0xa 0xb -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ;) + (; i8x16.swizzle ;) + (; local.get 0 ;) + (; i32x4.add ;) + (; i32x4.extract_lane 0 ;) + (; ) ;) + + (func $run (type $void->void) + (call $vaddvq_u16.swizzle (v128.const i16x8 1 2 3 4 5 6 7 8)) + call $print.int + ) + + (export "main" (func $run)) +) diff --git a/tutorials/wat/main/simd2.wat b/tutorials/wat/main/simd2.wat new file mode 100644 index 00000000..2657e701 --- /dev/null +++ b/tutorials/wat/main/simd2.wat @@ -0,0 +1,52 @@ +(module + (import "env" "print_int" (func $print.int (type $i32->void))) + (type $void->void (func)) + (type $i32->void (func (param i32))) + (type $void->v128 (func (result v128))) + + (memory 1) + + (func $build (type $void->v128) + (i32.const 0) + i16x8.splat + i32.const 1 + i16x8.replace_lane 0 + i32.const 2 + i16x8.replace_lane 1 + i32.const 3 + i16x8.replace_lane 2 + i32.const 4 + i16x8.replace_lane 3 + i32.const 5 + i16x8.replace_lane 4 + i32.const 6 + i16x8.replace_lane 5 + i32.const 7 + i16x8.replace_lane 6 + i32.const 8 + i16x8.replace_lane 7 + ) + + (func $run (type $void->void) + (i32.const 123456) + i32x4.splat + i32x4.extract_lane 0 + call $print.int + + call $build + i16x8.extract_lane_s 0 + call $print.int + + call $build + i16x8.extract_lane_s 1 + call $print.int + + call $build + i16x8.extract_lane_s 2 + call $print.int + + (v128.store (i32.const 0) (i8x16.splat (i32.const 0))) + ) + + (export "main" (func $run)) +) \ No newline at end of file