diff --git a/.gitignore b/.gitignore index 0e6a6834..167de90c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .idea/ .vscode/ +.cache/ .ccls *.bin diff --git a/platforms/CLI-Emulator/main.cpp b/platforms/CLI-Emulator/main.cpp index 24261d85..7a041a49 100644 --- a/platforms/CLI-Emulator/main.cpp +++ b/platforms/CLI-Emulator/main.cpp @@ -406,6 +406,12 @@ int main(int argc, const char *argv[]) { } else { wac->run_module(m); } + + if(m->exception) { + fprintf(stderr, "wdcli: exception: %s\n", m->exception); + return 1; + } + wac->unload_module(m); wac->debugger->stop(); diff --git a/src/Debug/debugger.cpp b/src/Debug/debugger.cpp index 8828f2fe..feceeadc 100644 --- a/src/Debug/debugger.cpp +++ b/src/Debug/debugger.cpp @@ -366,6 +366,12 @@ void Debugger::printValue(const StackValue *v, const uint32_t idx, snprintf(buff, 255, R"("type":"F64","value":")" FMT(PRIx64) "\"", v->value.uint64); break; + case V128: + // we'll just use hex-strings + // 64-bit = 8 bytes = 16 nibbles = 16 hex-characters + snprintf(buff, 255, R"("type":"V128","value":"%016lx%016lx")", + v->value.simd.i64x2[0], v->value.simd.i64x2[1]); + break; default: snprintf(buff, 255, R"("type":"%02x","value":")" FMT(PRIx64) "\"", v->value_type, v->value.uint64); @@ -574,6 +580,10 @@ void Debugger::dumpLocals(const Module *m) const { snprintf(_value_str, 255, R"("type":"F64","value":%.7f)", v->value.f64); break; + case V128: + snprintf(_value_str, 255, R"("type":"V128","value":"%016lx%016lx")", + v->value.simd.i64x2[0], v->value.simd.i64x2[1]); + break; + default: snprintf(_value_str, 255, R"("type":"%02x","value":")" FMT(PRIx64) "\"", diff --git a/src/Interpreter/instructions.cpp b/src/Interpreter/instructions.cpp index 9046c0d0..5a49be1c 100644 --- a/src/Interpreter/instructions.cpp +++ b/src/Interpreter/instructions.cpp @@ -1267,3 +1267,737 @@ bool i_instr_callback([[maybe_unused]] Module *m, // TODO return true; } + +/** + * [0xfd] 0x0f...0x14 SIMD splat operations + */ +bool i_instr_simd_splat(Module* m, 
uint8_t opcode) { + auto &raw_top = m->stack[m->sp]; + auto &top = raw_top.value; + + switch(opcode) { + case 0x0f: { // i8x16.splat + const int8_t val = top.int32; + for(auto &i : top.simd.i8x16) i = val; + break; + } + + case 0x10: { // i16x8.splat + const int16_t val = top.int32; + for(auto &i : top.simd.i16x8) i = val; + break; + } + + case 0x11: { // i32x4.splat + const int32_t val = top.int32; + for(auto &i : top.simd.i32x4) i = val; + break; + } + + case 0x12: { // i64x2.splat + const int64_t val = top.int64; + for(auto &i : top.simd.i64x2) i = val; + break; + } + + case 0x13: { // f32x4.splat + const float val = top.f32; + for(auto &i : top.simd.f32x4) i = val; + break; + } + + case 0x14: { // f64x2.splat + const double val = top.f64; + for(auto &i : top.simd.f64x2) i = val; + break; + } + } + + raw_top.value_type = V128; + return true; +} + +bool i_instr_simd_extract(Module* m, uint8_t opcode){ + // inner helper to check if lane is within bounds, then execute the operation + auto &raw_top = m->stack[m->sp]; + const uint8_t lane = *m->pc_ptr; + m->pc_ptr++; + + // expect the compiler to inline the lambda - see + // https://stackoverflow.com/questions/13722426/why-can-lambdas-be-better-optimized-by-the-compiler-than-plain-functions/13722515#13722515 + auto lane_handler = [&raw_top, lane](const int max, I *in, O &out, const uint8_t type) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + out = in[lane]; + raw_top.value_type = type; + + return true; + }; + + auto lane_handler_unsigned = [&raw_top, lane](const int max, I *in, O &out, const uint8_t type) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + // type switcher between uint8 and uint16 based on input type + using temp_t = std::conditional_t, uint8_t, uint16_t>; + temp_t x; + memcpy(&x, &in[lane], sizeof(temp_t)); + out = x; + raw_top.value_type = type; + + return true; + }; 
+ + switch(opcode) { + case 0x15: // i8x16.extract_lane_s + return lane_handler(15, raw_top.value.simd.i8x16, raw_top.value.int32, I32); + case 0x16: // i8x16.extract_lane_u + return lane_handler_unsigned(15, raw_top.value.simd.i8x16, raw_top.value.uint32, I32); + + case 0x18: // i16x8.extract_lane_s + return lane_handler(7, raw_top.value.simd.i16x8, raw_top.value.int32, I32); + case 0x19: // i16x8.extract_lane_u + return lane_handler_unsigned(7, raw_top.value.simd.i16x8, raw_top.value.uint32, I32); + + case 0x1b: // i32x4.extract_lane + return lane_handler(3, raw_top.value.simd.i32x4, raw_top.value.uint32, I32); + + case 0x1d: // i64x2.extract_lane + return lane_handler(1, raw_top.value.simd.i64x2, raw_top.value.uint64, I64); + + case 0x1f: // f32x4.extract_lane + return lane_handler(3, raw_top.value.simd.f32x4, raw_top.value.f32, F32); + + case 0x21: // f64x2.extract_lane + return lane_handler(1, raw_top.value.simd.f64x2, raw_top.value.f64, F64); + } + + return false; +} + +bool i_instr_simd_replace(Module *m, uint8_t opcode) { + auto &v128 = m->stack[m->sp - 1]; + auto &update = m->stack[m->sp].value; + const uint8_t lane = *m->pc_ptr; + m->pc_ptr++; + m->sp -= 1; + + auto lane_handler = [&v128, lane](const int max, O *out, O replace) { + if(lane > max) { + sprintf(exception, "lane index out of bounds (%d > %d)", lane, max); + return false; + } + + out[lane] = replace; + return true; + }; + + switch(opcode) { + case 0x17: // i8x16.replace_lane + return lane_handler(15, v128.value.simd.i8x16, static_cast(update.uint32)); + case 0x1a: // i16x8.replace_lane + return lane_handler(7, v128.value.simd.i16x8, static_cast(update.uint32)); + case 0x1c: // i32x4.replace_lane + return lane_handler(3, v128.value.simd.i32x4, static_cast(update.uint32)); + case 0x1e: // i64x2.replace_lane + return lane_handler(1, v128.value.simd.i64x2, static_cast(update.uint64)); + case 0x20: // f32x4.replace_lane + return lane_handler(3, v128.value.simd.f32x4, update.f32); + case 0x22: // 
f64x2.replace_lane + return lane_handler(1, v128.value.simd.f64x2, update.f64); + } + + return false; +} + +inline bool verify_endian() { + static bool _ = [] { + int n = 1; + if(*reinterpret_cast(&n) == 1) { + // little endian + return true; + } + + FATAL("v128.const only supported on little-endian systems"); + return false; + }(); + return _; +} + +bool i_instr_simd_const(Module* m){ + verify_endian(); + + const uint8_t *data = m->pc_ptr; + m->pc_ptr += 16; // skip immediate 16-byte data + + m->sp++; + auto &v = m->stack[m->sp].value.simd; + std::memcpy(&v, data, 16); + m->stack[m->sp].value_type = V128; + return true; +} + +bool i_instr_simd_store(Module* m){ + auto &sv = m->stack[m->sp--]; + StackValue sv2; + sv2.value_type = I64; + sv2.value.uint64 = sv.value.simd.i64x2[1]; + + const uint32_t flags = read_LEB_32(&m->pc_ptr); + const uint32_t offset = read_LEB_32(&m->pc_ptr); + + uint32_t ptr = m->stack[m->sp--].value.uint32; + + if(flags != 2 && TRACE) { + dbg_info( + " - unaligned store - flags: 0x%x, offset: 0x%x, addr: 0x%x, val: %s\n", + flags, offset, ptr, value_repr(&sv) + ); + } + + if(offset + ptr < ptr && !m->options.disable_memory_bounds) { + Interpreter::report_overflow(m, m->memory.bytes + offset + ptr); + } + + ptr += offset; + + // store in 2 consecutive locations + return m->warduino->interpreter->store(m, I64, ptr, sv) && + m->warduino->interpreter->store(m, I64, ptr + 8, sv2); +} + +bool i_instr_simd_load(Module* m) { + const uint32_t flags = read_LEB_32(&m->pc_ptr); + const uint32_t offset = read_LEB_32(&m->pc_ptr); + uint32_t ptr = m->stack[m->sp--].value.uint32; + if(flags != 2 && TRACE) { + dbg_info( + " - unaligned load - flags: 0x%x, offset: 0x%x, addr: 0x%x\n", + flags, offset, ptr + ); + } + + if(offset + ptr < ptr && !m->options.disable_memory_bounds) { + Interpreter::report_overflow(m, m->memory.bytes + offset + ptr); + } + + ptr += offset; + + // load from 2 consecutive locations + bool success = 
m->warduino->interpreter->load(m, I64, ptr, offset); + const auto i64_0 = m->stack[m->sp].value.int64; + m->sp--; // make sure we overwrite the previous load + success &= m->warduino->interpreter->load(m, I64, ptr + 8, offset); + const auto i64_1 = m->stack[m->sp].value.int64; + + // reconstruct v128 + auto & [value_type, value] = m->stack[m->sp]; + value_type = V128; + value.simd.i64x2[0] = i64_0; + value.simd.i64x2[1] = i64_1; + + return success; +} + +bool i_instr_simd_bin_bit_op(Module* m, uint8_t opcode){ + auto v1 = m->stack[m->sp - 1].value.simd; + auto v2 = m->stack[m->sp].value.simd; + m->sp--; + + const auto apply = [&v1, &v2, m](F &&op) { + m->stack[m->sp].value.simd.i64x2[0] = op(v1.i64x2[0], v2.i64x2[0]); + m->stack[m->sp].value.simd.i64x2[1] = op(v1.i64x2[1], v2.i64x2[1]); + return true; + }; + + switch(opcode) { + case 0x4e: // bit-and + return apply([](const uint64_t &a, const uint64_t &b) { return a & b; }); + case 0x4f: // bit-and-not + return apply([](const uint64_t &a, const uint64_t &b) { return a & ~b; }); + case 0x50: // bit-or + return apply([](const uint64_t &a, const uint64_t &b) { return a | b; }); + case 0x51: // xor + return apply([](const uint64_t &a, const uint64_t &b) { return a ^ b; }); + } + + return false; +} + +bool i_instr_simd_v128_not(Module* m){ + m->stack[m->sp].value.simd.i64x2[0] = ~m->stack[m->sp].value.simd.i64x2[0]; + m->stack[m->sp].value.simd.i64x2[1] = ~m->stack[m->sp].value.simd.i64x2[1]; + return true; +} + + +// to implement the whole swathe of SIMD instructions, let's use some templates +// so the compiler can generate the code... 
+namespace { +// anonymous namespace to hide implementation details/allow code folding +// the SIMD vector type (for ease of coding) +using simd_t = decltype(StackValue::value.simd); + +// use type-traits for all compile-time meta-data +template struct v128_traits; +template <> struct v128_traits { + // we need to know the number of elements + constexpr static uint N = 16; + // we need to know the amount of bits + constexpr static uint bits = 8; + // the type of elements in the SIMD vector + using type = int8_t; + // the type to use when an operation returns a boolean + using bool_alt = int8_t; + // we need to know the signed version of this type + using signed_t = int8_t; + // we also need to know the unsigned version of this type + using unsigned_t = uint8_t; + // for the ext_add_pairwise, we need the extended type + using extended_t = int16_t; + // pointer-to-member to access the SIMD vector + constexpr static auto member = &simd_t::i8x16; +}; +template <> struct v128_traits { + constexpr static uint N = 16; + constexpr static uint bits = 8; + using type = uint8_t; + using bool_alt = int8_t; + using signed_t = int8_t; + using unsigned_t = uint8_t; + using extended_t = uint16_t; + constexpr static auto member = &simd_t::i8x16; +}; +template <> struct v128_traits { + constexpr static uint N = 8; + constexpr static uint bits = 16; + using type = int16_t; + using bool_alt = int16_t; + using signed_t = int16_t; + using unsigned_t = uint16_t; + using extended_t = int32_t; + constexpr static auto member = &simd_t::i16x8; +}; +template <> struct v128_traits { + constexpr static uint N = 8; + constexpr static uint bits = 16; + using type = uint16_t; + using bool_alt = int16_t; + using signed_t = int16_t; + using unsigned_t = uint16_t; + using extended_t = uint32_t; + constexpr static auto member = &simd_t::i16x8; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + constexpr static uint bits = 32; + using type = int32_t; + using bool_alt = int32_t; + 
using signed_t = int32_t; + using unsigned_t = uint32_t; + constexpr static auto member = &simd_t::i32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + constexpr static uint bits = 32; + using type = uint32_t; + using bool_alt = int32_t; + using signed_t = int32_t; + using unsigned_t = uint32_t; + constexpr static auto member = &simd_t::i32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + constexpr static uint bits = 64; + using type = int64_t; + using bool_alt = int64_t; + using signed_t = int64_t; + using unsigned_t = uint64_t; + constexpr static auto member = &simd_t::i64x2; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + constexpr static uint bits = 64; + using type = uint64_t; + using bool_alt = int64_t; + using signed_t = int64_t; + using unsigned_t = uint64_t; + constexpr static auto member = &simd_t::i64x2; +}; +template <> struct v128_traits { + constexpr static uint N = 4; + using type = float; + using bool_alt = int32_t; + using signed_t = float; + constexpr static auto member = &simd_t::f32x4; +}; +template <> struct v128_traits { + constexpr static uint N = 2; + using type = double; + using bool_alt = int64_t; + using signed_t = double; + constexpr static auto member = &simd_t::f64x2; +}; + +// get a reference to a lane in a SIMD vector +template +constexpr typename v128_traits::signed_t &get_lane_ref(StackValue &sv, uint8_t lane) { + // v128_traits::member is pointer-to-member + // (struct).*(ptr-to-member) gets struct.member + // so we get (simd-union).(correct type field)[lane] + return (sv.value.simd.*v128_traits::member)[lane]; +} +// get a const reference to a lane in a SIMD vector +template +constexpr const typename v128_traits::signed_t &get_lane_cref(const StackValue &sv, uint8_t lane) { + return (sv.value.simd.*v128_traits::member)[lane]; +} +// get a copy of a lane in a SIMD vector +template +constexpr typename v128_traits::signed_t get_lane_copy(StackValue &sv, uint8_t 
lane) { + return (sv.value.simd.*v128_traits::member)[lane]; +} + +// apply binary operator lane-wise +template +constexpr bool simd_bin_op(Module *m, const StackValue &sv1, const StackValue &sv2, F &&f) { + static_assert(std::is_invocable_v, "Operation is not invocable on type T"); + static_assert(sizeof(T) == sizeof(ExplicitCast), "ExplicitCast must have same size as T"); + + using result_t = std::conditional_t< + std::is_same_v, bool>, // if the result is a boolean + typename v128_traits::bool_alt, // then use the bool_alt type (to ensure lane count for floats) + typename v128_traits::type // otherwise, use the normal type + >; + using result_traits = v128_traits; // v128 traits for the result type + + static_assert(v128_traits::N == result_traits::N, "Result type must have same lane count as input type"); + + m->stack[m->sp].value_type = V128; + for(uint i = 0; i < v128_traits::N; i++) { + ExplicitCast lane1, lane2; + if constexpr(std::is_same_v) { + lane1 = get_lane_cref(sv1, i); + lane2 = get_lane_cref(sv2, i); + } + else { + memcpy(&lane1, &get_lane_cref(sv1, i), sizeof(T)); + memcpy(&lane2, &get_lane_cref(sv2, i), sizeof(T)); + } + get_lane_ref(m->stack[m->sp], i) = f(lane1, lane2); + } + return true; +} + +template typename FPre, typename ExplicitCast = T, typename F = FPre> +constexpr bool simd_bin_op(Module *m, const StackValue &sv1, const StackValue &sv2) { + return simd_bin_op(m, sv1, sv2, F{}); +} + +template typename FPre, typename ExplicitCast = T, typename F = FPre> +constexpr bool simd_shift(Module *m, const StackValue &sv1, const StackValue &sv2) { + static_assert(std::is_invocable_v, "Operation is not invocable on type T"); + static_assert(sizeof(T) == sizeof(ExplicitCast), "ExplicitCast must have same size as T"); + + using result_t = std::conditional_t< + std::is_same_v, bool>, // if the result is a boolean + typename v128_traits::bool_alt, // then use the bool_alt type (to ensure lane count for floats) + typename v128_traits::type // 
otherwise, use the normal type + >; + using result_traits = v128_traits; // v128 traits for the result type + + static_assert(v128_traits::N == result_traits::N, "Result type must have same lane count as input type"); + + F f{}; + + m->stack[m->sp].value_type = V128; + for(uint i = 0; i < v128_traits::N; i++) { + ExplicitCast lane1; + int32_t shift = sv2.value.int32; + if constexpr(std::is_same_v) { + lane1 = get_lane_cref(sv1, i); + } + else { + memcpy(&lane1, &get_lane_cref(sv1, i), sizeof(T)); + } + get_lane_ref(m->stack[m->sp], i) = f(lane1, shift); + } + return true; +} + +template +constexpr bool simd_ext_add(Module *m) { + using T_traits = v128_traits; + using extend_t = typename T_traits::extended_t; + using extend_traits = v128_traits; + + StackValue copy = m->stack[m->sp]; + m->stack[m->sp].value_type = V128; + StackValue &res = m->stack[m->sp]; + + for(uint i = 0; i < extend_traits::N; i++) { + auto x1 = static_cast(get_lane_cref(copy, 2 * i)); + auto x2 = static_cast(get_lane_cref(copy, 2 * i + 1)); + get_lane_ref(res, i) = x1 + x2; + } + + return true; +} +} + +// helper code for saturating arithmetic +namespace { +template +struct sat_add { + constexpr T operator()(const T t1, const T t2) const { + // from https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/sat_arith.h#L49 + T t3{}; + if(!__builtin_add_overflow(t1, t2, &t3)) return t3; + if constexpr(std::is_unsigned_v) return std::numeric_limits::max(); + else if(t1 < 0) return std::numeric_limits::min(); + else return std::numeric_limits::max(); + } +}; + +template +struct sat_sub { + constexpr T operator()(const T t1, const T t2) const { + // from https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/sat_arith.h#L65 + T t3{}; + if(!__builtin_sub_overflow(t1, t2, &t3)) return t3; + if constexpr(std::is_unsigned_v) return 0; + else if(t1 < 0) return std::numeric_limits::min(); + else return std::numeric_limits::max(); + } +}; + +template struct min_struct 
{ constexpr T operator()(const T t1, const T t2) const { return std::min(t1, t2); } }; +template struct max_struct { constexpr T operator()(const T t1, const T t2) const { return std::max(t1, t2); } }; +// pseudo-min/max as defined in https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#pseudo-minimum +template struct p_min_struct { constexpr T operator()(const T t1, const T t2) const { return t2 < t1 ? t2 : t1; } }; +template struct p_max_struct { constexpr T operator()(const T t1, const T t2) const { return t1 < t2 ? t2 : t1; } }; + +template struct shift_left { + constexpr T operator()(const T t1, const int32_t t2) const { return static_cast(t1 << (t2 % v128_traits::bits)); } +}; +template struct arith_shift_right { + // sign-preserving right shift ~> works only "correctly" on signed types + constexpr T operator()(const T t1, const int32_t t2) const { + using type = typename v128_traits::signed_t; + type st1; + + if constexpr(std::is_same_v) st1 = t1; + else memcpy(&st1, &t1, sizeof(T)); + + st1 >>= (t2 % v128_traits::bits); + + T result; + if constexpr(std::is_same_v) result = st1; + else memcpy(&result, &st1, sizeof(T)); + + return result; + } +}; +template struct logic_shift_right { + // zero fill right shift ~> works only "correctly" on unsigned types + constexpr T operator()(const T t1, const int32_t t2) const { + using type = typename v128_traits::unsigned_t; + type ut1; + + if constexpr(std::is_same_v) ut1 = t1; + else memcpy(&ut1, &t1, sizeof(T)); + + ut1 >>= (t2 % v128_traits::bits); + + T result; + if constexpr(std::is_same_v) result = ut1; + else memcpy(&result, &ut1, sizeof(T)); + + return result; + } +}; +} + +bool i_instr_simd_bin_v128_v128_op(Module* m, uint8_t opcode){ + // need to keep copies! 
+ const auto v1 = m->stack[m->sp - 1]; + const auto v2 = m->stack[m->sp]; + m->sp--; + + switch(opcode) { + // i8x16 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x23: return simd_bin_op(m, v1, v2); + case 0x24: return simd_bin_op(m, v1, v2); + case 0x25: return simd_bin_op(m, v1, v2); + case 0x26: return simd_bin_op(m, v1, v2); + case 0x27: return simd_bin_op(m, v1, v2); + case 0x28: return simd_bin_op(m, v1, v2); + case 0x29: return simd_bin_op(m, v1, v2); + case 0x2a: return simd_bin_op(m, v1, v2); + case 0x2b: return simd_bin_op(m, v1, v2); + case 0x2c: return simd_bin_op(m, v1, v2); + // i16x8 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x2d: return simd_bin_op(m, v1, v2); + case 0x2e: return simd_bin_op(m, v1, v2); + case 0x2f: return simd_bin_op(m, v1, v2); + case 0x30: return simd_bin_op(m, v1, v2); + case 0x31: return simd_bin_op(m, v1, v2); + case 0x32: return simd_bin_op(m, v1, v2); + case 0x33: return simd_bin_op(m, v1, v2); + case 0x34: return simd_bin_op(m, v1, v2); + case 0x35: return simd_bin_op(m, v1, v2); + case 0x36: return simd_bin_op(m, v1, v2); + // i32x4 -> ==, !=, <, , >u, <=, <=u, >=, >=u + case 0x37: return simd_bin_op(m, v1, v2); + case 0x38: return simd_bin_op(m, v1, v2); + case 0x39: return simd_bin_op(m, v1, v2); + case 0x3a: return simd_bin_op(m, v1, v2); + case 0x3b: return simd_bin_op(m, v1, v2); + case 0x3c: return simd_bin_op(m, v1, v2); + case 0x3d: return simd_bin_op(m, v1, v2); + case 0x3e: return simd_bin_op(m, v1, v2); + case 0x3f: return simd_bin_op(m, v1, v2); + case 0x40: return simd_bin_op(m, v1, v2); + // f32x4 -> ==, !=, <, >, <=, >= + case 0x41: return simd_bin_op(m, v1, v2); + case 0x42: return simd_bin_op(m, v1, v2); + case 0x43: return simd_bin_op(m, v1, v2); + case 0x44: return simd_bin_op(m, v1, v2); + case 0x45: return simd_bin_op(m, v1, v2); + case 0x46: return simd_bin_op(m, v1, v2); + // f64x2 -> ==, !=, <, >, <=, >= + case 0x47: return simd_bin_op(m, v1, v2); + case 0x48: return simd_bin_op(m, v1, v2); + case 
0x49: return simd_bin_op(m, v1, v2); + case 0x4a: return simd_bin_op(m, v1, v2); + case 0x4b: return simd_bin_op(m, v1, v2); + case 0x4c: return simd_bin_op(m, v1, v2); + // i8x16 -> +, + sat s, + sat u, -, - sat s, - sat u + case 0x6e: return simd_bin_op(m, v1, v2); + case 0x6f: return simd_bin_op(m, v1, v2); + case 0x70: return simd_bin_op(m, v1, v2); + case 0x71: return simd_bin_op(m, v1, v2); + case 0x72: return simd_bin_op(m, v1, v2); + case 0x73: return simd_bin_op(m, v1, v2); + // i8x16 min s, min u, max s, max u (can't pass std::min/std::max as template argument, they are function-overload-sets) + case 0x76: return simd_bin_op(m, v1, v2); + case 0x77: return simd_bin_op(m, v1, v2); + case 0x78: return simd_bin_op(m, v1, v2); + case 0x79: return simd_bin_op(m, v1, v2); + // i16x8 -> +, + sat s, + sat u, -, - sat s, - sat u, * + case 0x8e: return simd_bin_op(m, v1, v2); + case 0x8f: return simd_bin_op(m, v1, v2); + case 0x90: return simd_bin_op(m, v1, v2); + case 0x91: return simd_bin_op(m, v1, v2); + case 0x92: return simd_bin_op(m, v1, v2); + case 0x93: return simd_bin_op(m, v1, v2); + case 0x95: return simd_bin_op(m, v1, v2); + // i16x8 min s, min u, max s, max u + case 0x96: return simd_bin_op(m, v1, v2); + case 0x97: return simd_bin_op(m, v1, v2); + case 0x98: return simd_bin_op(m, v1, v2); + case 0x99: return simd_bin_op(m, v1, v2); + // i32x4 -> +, -, * + case 0xae: return simd_bin_op(m, v1, v2); + case 0xb1: return simd_bin_op(m, v1, v2); + case 0xb5: return simd_bin_op(m, v1, v2); + // i32x4 min s, min u, max s, max u + case 0xb6: return simd_bin_op(m, v1, v2); + case 0xb7: return simd_bin_op(m, v1, v2); + case 0xb8: return simd_bin_op(m, v1, v2); + case 0xb9: return simd_bin_op(m, v1, v2); + // i64x2 -> +, -, * + case 0xce: return simd_bin_op(m, v1, v2); + case 0xd1: return simd_bin_op(m, v1, v2); + case 0xd5: return simd_bin_op(m, v1, v2); + // i64x2 -> ==, !=, <, >, <=, >= + case 0xd6: return simd_bin_op(m, v1, v2); + case 0xd7: return 
simd_bin_op(m, v1, v2); + case 0xd8: return simd_bin_op(m, v1, v2); + case 0xd9: return simd_bin_op(m, v1, v2); + case 0xda: return simd_bin_op(m, v1, v2); + case 0xdb: return simd_bin_op(m, v1, v2); + // f32x4 -> +, -, *, /, min, max, pmin, pmax + case 0xe4: return simd_bin_op(m, v1, v2); + case 0xe5: return simd_bin_op(m, v1, v2); + case 0xe6: return simd_bin_op(m, v1, v2); + case 0xe7: return simd_bin_op(m, v1, v2); + case 0xe8: return simd_bin_op(m, v1, v2); + case 0xe9: return simd_bin_op(m, v1, v2); + case 0xea: return simd_bin_op(m, v1, v2); + case 0xeb: return simd_bin_op(m, v1, v2); + // f64x2 -> +, -, *, /, min, max + case 0xf0: return simd_bin_op(m, v1, v2); + case 0xf1: return simd_bin_op(m, v1, v2); + case 0xf2: return simd_bin_op(m, v1, v2); + case 0xf3: return simd_bin_op(m, v1, v2); + case 0xf4: return simd_bin_op(m, v1, v2); + case 0xf5: return simd_bin_op(m, v1, v2); + case 0xf6: return simd_bin_op(m, v1, v2); + case 0xf7: return simd_bin_op(m, v1, v2); + + default: + return false; + } +} + +bool i_instr_simd_shift(Module* m, uint8_t opcode){ + // need to keep copies! 
+ const auto v1 = m->stack[m->sp - 1]; + const auto v2 = m->stack[m->sp]; + m->sp--; + + switch(opcode) { + // i8x16 <<, >>(s), >>(u) + case 0x6b: return simd_shift(m, v1, v2); + case 0x6c: return simd_shift(m, v1, v2); + case 0x6d: return simd_shift(m, v1, v2); + // i16x8 <<, >>(s), >>(u) + case 0x8b: return simd_shift(m, v1, v2); + case 0x8c: return simd_shift(m, v1, v2); + case 0x8d: return simd_shift(m, v1, v2); + // i32x4 <<, >>(s), >>(u) + case 0xab: return simd_shift(m, v1, v2); + case 0xac: return simd_shift(m, v1, v2); + case 0xad: return simd_shift(m, v1, v2); + // i64x2 <<, >>(s), >>(u) + case 0xcb: return simd_shift(m, v1, v2); + case 0xcc: return simd_shift(m, v1, v2); + case 0xcd: return simd_shift(m, v1, v2); + + default: + return false; + } +} + +bool i_instr_simd_ext_add_pairwise(Module* m, uint8_t opcode){ + switch(opcode) { + case 0x7c: return simd_ext_add(m); + case 0x7d: return simd_ext_add(m); + case 0x7e: return simd_ext_add(m); + case 0x7f: return simd_ext_add(m); + default: + return false; + } +} + +bool i_instr_simd_swizzle(Module* m){ + const int8_t *current = m->stack[m->sp - 1].value.simd.i8x16; + uint8_t swizzle[16]; + memcpy(swizzle, m->stack[m->sp].value.simd.i8x16, 16 * sizeof(int8_t)); + int8_t lanes[16]; + + for(int i = 0; i < 16; i++) { + lanes[i] = swizzle[i] < 16 ? 
current[swizzle[i]] : static_cast(0); + } + + m->sp--; + m->stack[m->sp].value_type = V128; + memcpy(m->stack[m->sp].value.simd.i8x16, lanes, 16 * sizeof(int8_t)); + return true; +} diff --git a/src/Interpreter/instructions.h b/src/Interpreter/instructions.h index 54a80f1b..69b0810e 100644 --- a/src/Interpreter/instructions.h +++ b/src/Interpreter/instructions.h @@ -73,3 +73,27 @@ bool i_instr_binary_f64(Module *m, uint8_t opcode); bool i_instr_conversion(Module *m, uint8_t opcode); bool i_instr_callback(Module *m, uint8_t opcode); + +bool i_instr_simd_splat(Module *m, uint8_t opcode); + +bool i_instr_simd_extract(Module *m, uint8_t opcode); + +bool i_instr_simd_replace(Module *m, uint8_t opcode); + +bool i_instr_simd_const(Module *m); + +bool i_instr_simd_store(Module *m); + +bool i_instr_simd_load(Module *m); + +bool i_instr_simd_bin_bit_op(Module *m, uint8_t opcode); + +bool i_instr_simd_v128_not(Module *m); + +bool i_instr_simd_bin_v128_v128_op(Module *m, uint8_t opcode); + +bool i_instr_simd_shift(Module *m, uint8_t opcode); + +bool i_instr_simd_ext_add_pairwise(Module *m, uint8_t opcode); + +bool i_instr_simd_swizzle(Module *m); \ No newline at end of file diff --git a/src/Interpreter/interpreter.cpp b/src/Interpreter/interpreter.cpp index b5f3f4fe..cab9dad5 100644 --- a/src/Interpreter/interpreter.cpp +++ b/src/Interpreter/interpreter.cpp @@ -427,6 +427,11 @@ bool Interpreter::interpret(Module *m, bool waiting) { case 0xe0 ... 
0xe3: success &= i_instr_callback(m, opcode); continue; + + case 0xfd: + success &= interpret_simd(m); + continue; + default: sprintf(exception, "unrecognized opcode 0x%x", opcode); if (m->options.return_exception) { @@ -455,12 +460,283 @@ bool Interpreter::interpret(Module *m, bool waiting) { if (!success && m->options.return_exception) { m->exception = strdup(exception); } else if (!success) { - FATAL("%s\n", exception); + FATAL("%s (0x%x)\n", exception, opcode); } return success; } +bool Interpreter::interpret_simd(Module *m) { + // this function should only be called from Interpreter::interpret + // and should only be called when the opcode is 0xfd + // -> at this point, the PC is pointing to the (first byte of) the actual + // opcode + + // TODO: technically the 2nd byte (and onwards) of the multi-byte SIMD + // opcodes are LEB-encoded u32's and could have multiple + // representations. However, we only support the shortest possible + // encodings; see https://webassembly.github.io/spec/core/appendix/index-instructions.html + + constexpr static auto next_pc_0x01 = [](Module *m, const uint8_t opcode) -> bool { + const auto next = *m->pc_ptr; + m->pc_ptr++; + if(next != 0x01) { + sprintf(exception, "SIMD opcode 0x%02x%02x should be followed by 0x%02x, but got 0x%02x", + 0xfd, opcode, 0x01, next); + return false; + } + return true; + }; + + // TODO: should be removed one day... + constexpr static auto not_implemented = [](const uint8_t opcode) -> bool { + sprintf(exception, "SIMD opcode 0x%02x%02x is not implemented (yet)", + 0xfd, opcode); + return false; + }; + + const auto opcode = *m->pc_ptr; + m->pc_ptr++; + + switch(opcode) { + case 0x00: return i_instr_simd_load(m); + + case 0x0b: return i_instr_simd_store(m); + case 0x0c: return i_instr_simd_const(m); + + case 0x0d: return not_implemented(opcode); // TODO: i8x16.shuffle + case 0x0e: return i_instr_simd_swizzle(m); + + case 0x0f ... 
0x14: return i_instr_simd_splat(m, opcode); + + case 0x15: // interspersed with (dim).extract_lane + case 0x16: + case 0x18: + case 0x19: + case 0x1b: + case 0x1d: + case 0x1f: + case 0x21: + return i_instr_simd_extract(m, opcode); + + case 0x17: + case 0x1a: + case 0x1c: + case 0x1e: + case 0x20: + case 0x22: + return i_instr_simd_replace(m, opcode); + + case 0x23 ... 0x2c: // i8x16 relational operators + case 0x2d ... 0x36: // i16x8 relational operators + case 0x37 ... 0x40: // i32x4 relational operators + case 0x41 ... 0x46: // f32x4 relational operators + case 0x47 ... 0x4c: // f64x2 relational operators + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x4d: // v128.not + return i_instr_simd_v128_not(m); + + case 0x4e ... 0x51: // v128 bit-wise operators + return i_instr_simd_bin_bit_op(m, opcode); + + case 0x52: // TODO -> v128.bitselect + return not_implemented(opcode); + + case 0x53: // TODO -> v128.any_true + return not_implemented(opcode); + + case 0x54 ... 0x5d: // TODO -> v128.loadX_lane, v128.storeX_lane, v128.loadX_zero + return not_implemented(opcode); + + case 0x5e ... 0x5f: // TODO -> demote/promote f32<->f64 + return not_implemented(opcode); + + case 0x60 ... 0x62: // TODO -> i8x16. ~ abs, neg, popcnt + return not_implemented(opcode); + + case 0x63 ... 0x64: // TODO -> i8x16. ~ all_true, bitmask + return not_implemented(opcode); + + case 0x65 ... 0x66: // TODO -> i8x16.narrow_i16x8_(s/u) + return not_implemented(opcode); + + case 0x67 ... 0x6a: // TODO -> f32x4. ~ ceil, floor, trunc, nearest + return not_implemented(opcode); + + case 0x6b ... 0x6d: // i8x16 shifts + return i_instr_simd_shift(m, opcode); + + case 0x6e ... 0x73: // i8x16.(add/sub) + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x74 ... 0x75: // TODO -> f64x2. ~ ceil, floor + return not_implemented(opcode); + + case 0x76 ... 
0x79: // i8x16.(min/max) + return i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x7a: // TODO -> f64x2.trunc + return not_implemented(opcode); + + case 0x7b: // TODO -> i8x16.avgr_u + return not_implemented(opcode); + + case 0x7c ... 0x7f: // TODO -> (dim).extadd_pairwise_(dim2) + return i_instr_simd_ext_add_pairwise(m, opcode); + + // --- From 0x80: 3-byte SIMD opcodes, so check next PC for 0x01 --- + + case 0x80 ... 0x81: // TODO -> i16x8. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x82: // TODO -> i16x8.q15mulr_sat_s + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x83 ... 0x84: // TODO -> i16x8. ~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x85 ... 0x86: // TODO -> i16x8.narrow_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x87 ... 0x8a: // TODO -> i16x8.extend_(low/high)_i8x16_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x8b ... 0x8d: // i16x8 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0x8e ... 0x93: // i16x8.(add/sub) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0x94: // TODO -> f64x2.nearest + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x95: // i16x8.mul + case 0x96 ... 0x99: // i16x8.(min/max) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0x9a: ! invalid opcode ! + + case 0x9b: // TODO -> i16x8.avgr_u + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0x9c ... 0x9f: // TODO -> i16x8.extmul_(low/high)_i8x16_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xa0 ... 0xa1: // TODO -> i32x4. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xa2: ! invalid opcode ! + + case 0xa3 ... 0xa4: // TODO -> i32x4. 
~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xa5 ... 0xa6: ! invalid opcode ! + + case 0xa7 ... 0xaa: // TODO -> i32x4.extend_(low/high)_i16x8_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xab ... 0xad: // i32x4 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0xae: // i32x4.add + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xaf ... 0xb0: ! invalid opcode ! + + case 0xb1: // i32x4.sub + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xb2 ... 0xb4: ! invalid opcode ! + + case 0xb5: // i32x4.mul + case 0xb6 ... 0xb9: // i32x4.(min/max) + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xba: // TODO -> i32x4.dot_i16x8_s + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xbb: ! invalid opcode ! + + case 0xbc ... 0xbf: // TODO -> i32x4.extmul_(low/high)_i16x8_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xc0 ... 0xc1: // TODO -> i64x2. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xc2: ! invalid opcode ! + + case 0xc3 ... 0xc4: // TODO -> i64x2. ~ all_true, bitmask + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xc5 ... 0xc6: ! invalid opcode ! + + case 0xc7 ... 0xca: // TODO -> i64x2.extend_(low/high)_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xcb ... 0xcd: // i64x2 shifts + return next_pc_0x01(m, opcode) && i_instr_simd_shift(m, opcode); + + case 0xce: // i64x2.add + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xcf ... 0xd0: ! invalid opcode ! + + case 0xd1: // i64x2.sub + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + // case 0xd2 ... 0xd4: ! invalid opcode ! 
+ + case 0xd5: // i64x2.mul + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xd6 ... 0xdb: // i64x2 relational operators + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xdc ... 0xdf: // TODO -> i64x2.extmul_(low/high)_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xe0 ... 0xe1: // TODO -> f32x4. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xe2: ! invalid opcode ! + + case 0xe3: // TODO -> f32x4.sqrt + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xe4 ... 0xeb: // f32x4. ~ add, sub, mul, div, min, max, pmin, pmax + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xec ... 0xed: // TODO -> f64x2. ~ abs, neg + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + // case 0xee: ! invalid opcode ! + case 0xef: // TODO -> f64x2.sqrt + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xf0 ... 0xf7: // f64x2. ~ add, sub, mul, div, min, max, pmin, pmax + return next_pc_0x01(m, opcode) && i_instr_simd_bin_v128_v128_op(m, opcode); + + case 0xf8 ... 0xf9: // TODO -> i32x4.trunc_sat_f32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfa ... 0xfb: // TODO -> f32x4.convert_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfc ... 0xfd: // TODO -> i32x4.trunc_sat_f64x2_(s/u)_zero + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + case 0xfe ... 
0xff: // TODO -> f64x2.convert_low_i32x4_(s/u) + return next_pc_0x01(m, opcode) && not_implemented(opcode); + + default: { + sprintf(exception, "unrecognized SIMD opcode 0x%x (0x%x 0x%x)", opcode, 0xfd, opcode); + return false; + } + } +} + + void Interpreter::report_overflow([[maybe_unused]] Module *m, [[maybe_unused]] uint8_t *maddr) { dbg_warn("memory start: %p, memory end: %p, maddr: %p\n", m->memory.bytes, diff --git a/src/Interpreter/interpreter.h b/src/Interpreter/interpreter.h index d11260d3..7afa7966 100644 --- a/src/Interpreter/interpreter.h +++ b/src/Interpreter/interpreter.h @@ -55,5 +55,6 @@ class Interpreter { static void report_overflow(Module *m, uint8_t *maddr); protected: + bool interpret_simd(Module *m); private: }; diff --git a/src/Utils/util.cpp b/src/Utils/util.cpp index 7455bb3f..305064fb 100644 --- a/src/Utils/util.cpp +++ b/src/Utils/util.cpp @@ -61,9 +61,11 @@ uint64_t read_LEB_(uint8_t **pos, uint32_t maxbits, bool sign) { FATAL("Unsigned LEB at byte %p overflow", (void *)startpos); } } + if (sign && (shift < maxbits) && (byte & 0x40u)) { // Sign extend by highest bits set 1 except last shift bits - result |= UINT64_MAX << shift; + // keep the 64-bit mask: (~0ul) is only 32 bits wide on ILP32 (embedded) targets + result |= UINT64_MAX << shift; } return result; } @@ -92,24 +94,33 @@ StackValue *readWasmArgs(Type function, uint8_t *data) { switch (args[i].value_type) { case I32: { - args[i].value.int32 = read_LEB_signed(&data, 32); + args[i].value.uint32 = read_LEB_signed(&data, 32); break; } case F32: { memcpy(&args[i].value.f32, data, sizeof(float)); // todo read ieee 754 + printf("Received F32 %f (%02x%02x%02x%02x)", args[i].value.f32, data[0], data[1], data[2], data[3]); data += sizeof(float); break; } case I64: { - args[i].value.int64 = read_LEB_signed(&data, 64); + args[i].value.uint64 = read_LEB_signed(&data, 64); break; } case F64: { memcpy(&args[i].value.f64, data, sizeof(double)); + printf("Received F64 %lf (%02x%02x%02x%02x%02x%02x%02x%02x)", + args[i].value.f64, data[0], data[1], 
data[2], data[3], + data[4], data[5], data[6], data[7]); data += sizeof(double); break; } + case V128: { + memcpy(&args[i].value.simd, data, sizeof(decltype(args[i].value.simd))); + data += sizeof(decltype(args[i].value.simd)); + break; + } default: { FATAL("no argument of type %" SCNu8 "\n", args[i].value_type); } diff --git a/src/WARDuino.h b/src/WARDuino.h index 8d04d160..ec67f362 100644 --- a/src/WARDuino.h +++ b/src/WARDuino.h @@ -28,6 +28,8 @@ #define F32 0x7d // -0x03 #define F64 0x7c // -0x04 +#define V128 0x7b // according to the spec? TODO + #define I32_8 0x7b // -0x05 #define I32_16 0x7a // -0x06 #define I64_8 0x79 // -0x07 diff --git a/src/WARDuino/WARDuino.cpp b/src/WARDuino/WARDuino.cpp index 215821c0..504cb69e 100644 --- a/src/WARDuino/WARDuino.cpp +++ b/src/WARDuino/WARDuino.cpp @@ -53,9 +53,9 @@ bool resolvesym(char *filename, char *symbol, uint8_t external_kind, void **val, // char exception[4096]; // Static definition of block_types -uint32_t block_type_results[4][1] = {{I32}, {I64}, {F32}, {F64}}; +uint32_t block_type_results[5][1] = {{I32}, {I64}, {F32}, {F64}, {V128}}; -Type block_types[5]; +Type block_types[6]; void initTypes() { block_types[0].form = BLOCK; @@ -140,6 +140,44 @@ void parse_memory_type(Module *m, uint8_t **pos) { } } +void skip_immediates_simd(uint8_t **pos) { + uint8_t simd_opcode = **pos; + *pos += 1; // skip opcode + switch(simd_opcode) { + case 0x00 ... 0x0b: // v128.loadXXX, v128.store + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + break; + case 0x0c: // v128.const + *pos += 16; // v128 consts are straight 16-byte blocks + break; + case 0x0d: // i8x16.shuffle + *pos += 16; // 16 lane-index immediate bytes, one per lane + break; + case 0x0e ... 0x14: // i8x16.swizzle, dim.splat + break; + case 0x15 ... 0x22: // dim.extract_lane, dim.replace_lane + *pos += 1; + break; + case 0x23 ... 0x53: // relational operators + break; + case 0x54 ... 0x5b: // v128.loadX_lane, v128.storeX_lane + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + *pos += 1; + break; + case 0x5c ... 
0x5d: // v128.loadX_zero + read_LEB_32(pos); read_LEB_32(pos); // memarg = align, offset (two LEBs) + break; + case 0x5e ... 0x7f: // math-like functionality, extensions, ... + break; + case 0x80 ... 0xff: { + uint8_t second_opcode = **pos; + assert(second_opcode == 0x01 && "3-byte SIMD instructions should have 0x01 as their 3rd byte."); + *pos += 1; // skip the 0x01 byte; *pos++ only bumped the local pointer-to-pointer + break; + } + } +} + void skip_immediates(uint8_t **pos) { uint32_t count, opcode = **pos; *pos = *pos + 1; @@ -190,6 +228,9 @@ void skip_immediates(uint8_t **pos) { } read_LEB_32(pos); // default target break; + case 0xfd: + skip_immediates_simd(pos); + break; default: // no immediates break; } diff --git a/src/WARDuino/internals.h b/src/WARDuino/internals.h index d0f6b263..6c183e46 100644 --- a/src/WARDuino/internals.h +++ b/src/WARDuino/internals.h @@ -54,6 +54,15 @@ typedef struct StackValue { int64_t int64; float f32; double f64; + + union { // TODO: temporary solution for SIMD types + int8_t i8x16[16]; + int16_t i16x8[8]; + int32_t i32x4[4]; + int64_t i64x2[2]; + float f32x4[4]; + double f64x2[2]; + } simd; } value; } StackValue; diff --git a/tests/latch/core/simd_splat_0.asserts.wast b/tests/latch/core/simd_splat_0.asserts.wast new file mode 100644 index 00000000..711716a1 --- /dev/null +++ b/tests/latch/core/simd_splat_0.asserts.wast @@ -0,0 +1,107 @@ +(assert_return (invoke "i8x16.splat" (i32.const 0)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "i8x16.splat" (i32.const 5)) (v128.const i8x16 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5)) +(assert_return (invoke "i8x16.splat" (i32.const -5)) (v128.const i8x16 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 -5)) +(assert_return (invoke "i8x16.splat" (i32.const 257)) (v128.const i8x16 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)) +(assert_return (invoke "i8x16.splat" (i32.const 0xff)) (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) +(assert_return (invoke "i8x16.splat" (i32.const -128)) (v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128)) 
+(assert_return (invoke "i8x16.splat" (i32.const 127)) (v128.const i8x16 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127)) +(assert_return (invoke "i8x16.splat" (i32.const -129)) (v128.const i8x16 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127)) +(assert_return (invoke "i8x16.splat" (i32.const 128)) (v128.const i8x16 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 -128)) +(assert_return (invoke "i8x16.splat" (i32.const 0xff7f)) (v128.const i8x16 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f)) +(assert_return (invoke "i8x16.splat" (i32.const 0x80)) (v128.const i8x16 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80 0x80)) +(assert_return (invoke "i8x16.splat" (i32.const 0xAB)) (v128.const i32x4 0xABABABAB 0xABABABAB 0xABABABAB 0xABABABAB)) + +(assert_return (invoke "i16x8.splat" (i32.const 0)) (v128.const i16x8 0 0 0 0 0 0 0 0)) +(assert_return (invoke "i16x8.splat" (i32.const 5)) (v128.const i16x8 5 5 5 5 5 5 5 5)) +(assert_return (invoke "i16x8.splat" (i32.const -5)) (v128.const i16x8 -5 -5 -5 -5 -5 -5 -5 -5)) +(assert_return (invoke "i16x8.splat" (i32.const 65537)) (v128.const i16x8 1 1 1 1 1 1 1 1)) +(assert_return (invoke "i16x8.splat" (i32.const 0xffff)) (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) +(assert_return (invoke "i16x8.splat" (i32.const -32768)) (v128.const i16x8 -32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768)) +(assert_return (invoke "i16x8.splat" (i32.const 32767)) (v128.const i16x8 32767 32767 32767 32767 32767 32767 32767 32767)) +(assert_return (invoke "i16x8.splat" (i32.const -32769)) (v128.const i16x8 32767 32767 32767 32767 32767 32767 32767 32767)) +(assert_return (invoke "i16x8.splat" (i32.const 32768)) (v128.const i16x8 -32768 -32768 -32768 -32768 -32768 -32768 -32768 -32768)) +(assert_return (invoke "i16x8.splat" (i32.const 0xffff7fff)) (v128.const i16x8 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff)) 
+(assert_return (invoke "i16x8.splat" (i32.const 0x8000)) (v128.const i16x8 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000 0x8000)) +(assert_return (invoke "i16x8.splat" (i32.const 0xABCD)) (v128.const i32x4 0xABCDABCD 0xABCDABCD 0xABCDABCD 0xABCDABCD)) +(assert_return (invoke "i16x8.splat" (i32.const 012345)) (v128.const i16x8 012_345 012_345 012_345 012_345 012_345 012_345 012_345 012_345)) +(assert_return (invoke "i16x8.splat" (i32.const 0x01234)) (v128.const i16x8 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234 0x0_1234)) + +(assert_return (invoke "i32x4.splat" (i32.const 0)) (v128.const i32x4 0 0 0 0)) +(assert_return (invoke "i32x4.splat" (i32.const 5)) (v128.const i32x4 5 5 5 5)) +(assert_return (invoke "i32x4.splat" (i32.const -5)) (v128.const i32x4 -5 -5 -5 -5)) +(assert_return (invoke "i32x4.splat" (i32.const 0xffffffff)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke "i32x4.splat" (i32.const 4294967295)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke "i32x4.splat" (i32.const -2147483648)) (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)) +(assert_return (invoke "i32x4.splat" (i32.const 2147483647)) (v128.const i32x4 0x7fffffff 0x7fffffff 0x7fffffff 0x7fffffff)) +(assert_return (invoke "i32x4.splat" (i32.const 2147483648)) (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)) +(assert_return (invoke "i32x4.splat" (i32.const 01234567890)) (v128.const i32x4 012_3456_7890 012_3456_7890 012_3456_7890 012_3456_7890)) +(assert_return (invoke "i32x4.splat" (i32.const 0x012345678)) (v128.const i32x4 0x0_1234_5678 0x0_1234_5678 0x0_1234_5678 0x0_1234_5678)) + +(assert_return (invoke "f32x4.splat" (f32.const 0.0)) (v128.const f32x4 0.0 0.0 0.0 0.0)) +(assert_return (invoke "f32x4.splat" (f32.const 1.1)) (v128.const f32x4 1.1 1.1 1.1 1.1)) +(assert_return (invoke "f32x4.splat" (f32.const -1.1)) (v128.const f32x4 -1.1 -1.1 -1.1 -1.1)) +(assert_return (invoke "f32x4.splat" (f32.const 1e38)) (v128.const f32x4 
1e38 1e38 1e38 1e38)) +(assert_return (invoke "f32x4.splat" (f32.const -1e38)) (v128.const f32x4 -1e38 -1e38 -1e38 -1e38)) +(assert_return (invoke "f32x4.splat" (f32.const 0x1.fffffep127)) (v128.const f32x4 0x1.fffffep127 0x1.fffffep127 0x1.fffffep127 0x1.fffffep127)) +(assert_return (invoke "f32x4.splat" (f32.const -0x1.fffffep127)) (v128.const f32x4 -0x1.fffffep127 -0x1.fffffep127 -0x1.fffffep127 -0x1.fffffep127)) +(assert_return (invoke "f32x4.splat" (f32.const 0x1p127)) (v128.const f32x4 0x1p127 0x1p127 0x1p127 0x1p127)) +(assert_return (invoke "f32x4.splat" (f32.const -0x1p127)) (v128.const f32x4 -0x1p127 -0x1p127 -0x1p127 -0x1p127)) +(assert_return (invoke "f32x4.splat" (f32.const inf)) (v128.const f32x4 inf inf inf inf)) +(assert_return (invoke "f32x4.splat" (f32.const -inf)) (v128.const f32x4 -inf -inf -inf -inf)) +(assert_return (invoke "f32x4.splat" (f32.const nan)) (v128.const f32x4 nan nan nan nan)) +(assert_return (invoke "f32x4.splat" (f32.const nan:0x1)) (v128.const f32x4 nan:0x1 nan:0x1 nan:0x1 nan:0x1)) +(assert_return (invoke "f32x4.splat" (f32.const nan:0x7f_ffff)) (v128.const f32x4 nan:0x7f_ffff nan:0x7f_ffff nan:0x7f_ffff nan:0x7f_ffff)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789)) (v128.const f32x4 0123456789 0123456789 0123456789 0123456789)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789.)) (v128.const f32x4 0123456789. 0123456789. 0123456789. 0123456789.)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF)) (v128.const f32x4 0x0123456789ABCDEF 0x0123456789ABCDEF 0x0123456789ABCDEF 0x0123456789ABCDEF)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF.)) (v128.const f32x4 0x0123456789ABCDEF. 0x0123456789ABCDEF. 0x0123456789ABCDEF. 
0x0123456789ABCDEF.)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789e019)) (v128.const f32x4 0123456789e019 0123456789e019 0123456789e019 0123456789e019)) +(assert_return (invoke "f32x4.splat" (f32.const 0123456789.e+019)) (v128.const f32x4 0123456789.e+019 0123456789.e+019 0123456789.e+019 0123456789.e+019)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEFp019)) (v128.const f32x4 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019 0x0123456789ABCDEFp019)) +(assert_return (invoke "f32x4.splat" (f32.const 0x0123456789ABCDEF.p-019)) (v128.const f32x4 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019 0x0123456789ABCDEF.p-019)) + +(assert_return (invoke "i64x2.splat" (i64.const 0)) (v128.const i64x2 0 0)) +(assert_return (invoke "i64x2.splat" (i64.const -0)) (v128.const i64x2 0 0)) +(assert_return (invoke "i64x2.splat" (i64.const 1)) (v128.const i64x2 1 1)) +(assert_return (invoke "i64x2.splat" (i64.const -1)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const -9223372036854775808)) (v128.const i64x2 -9223372036854775808 -9223372036854775808)) +(assert_return (invoke "i64x2.splat" (i64.const -9223372036854775808)) (v128.const i64x2 9223372036854775808 9223372036854775808)) +(assert_return (invoke "i64x2.splat" (i64.const 9223372036854775807)) (v128.const i64x2 9223372036854775807 9223372036854775807)) +(assert_return (invoke "i64x2.splat" (i64.const 18446744073709551615)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const 0x7fffffffffffffff)) (v128.const i64x2 0x7fffffffffffffff 0x7fffffffffffffff)) +(assert_return (invoke "i64x2.splat" (i64.const 0xffffffffffffffff)) (v128.const i64x2 -1 -1)) +(assert_return (invoke "i64x2.splat" (i64.const -0x8000000000000000)) (v128.const i64x2 -0x8000000000000000 -0x8000000000000000)) +(assert_return (invoke "i64x2.splat" (i64.const -0x8000000000000000)) (v128.const i64x2 0x8000000000000000 0x8000000000000000)) 
+(assert_return (invoke "i64x2.splat" (i64.const 01234567890123456789)) (v128.const i64x2 01_234_567_890_123_456_789 01_234_567_890_123_456_789)) +(assert_return (invoke "i64x2.splat" (i64.const 0x01234567890ABcdef)) (v128.const i64x2 0x0_1234_5678_90AB_cdef 0x0_1234_5678_90AB_cdef)) + +(assert_return (invoke "f64x2.splat" (f64.const 0.0)) (v128.const f64x2 0.0 0.0)) +(assert_return (invoke "f64x2.splat" (f64.const -0.0)) (v128.const f64x2 -0.0 -0.0)) +(assert_return (invoke "f64x2.splat" (f64.const 1.1)) (v128.const f64x2 1.1 1.1)) +(assert_return (invoke "f64x2.splat" (f64.const -1.1)) (v128.const f64x2 -1.1 -1.1)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0.0000000000001p-1022)) (v128.const f64x2 0x0.0000000000001p-1022 0x0.0000000000001p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const -0x0.0000000000001p-1022)) (v128.const f64x2 -0x0.0000000000001p-1022 -0x0.0000000000001p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p-1022)) (v128.const f64x2 0x1p-1022 0x1p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p-1022)) (v128.const f64x2 -0x1p-1022 -0x1p-1022)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p-1)) (v128.const f64x2 0x1p-1 0x1p-1)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p-1)) (v128.const f64x2 -0x1p-1 -0x1p-1)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1p+0)) (v128.const f64x2 0x1p+0 0x1p+0)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1p+0)) (v128.const f64x2 -0x1p+0 -0x1p+0)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1.921fb54442d18p+2)) (v128.const f64x2 0x1.921fb54442d18p+2 0x1.921fb54442d18p+2)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1.921fb54442d18p+2)) (v128.const f64x2 -0x1.921fb54442d18p+2 -0x1.921fb54442d18p+2)) +(assert_return (invoke "f64x2.splat" (f64.const 0x1.fffffffffffffp+1023)) (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)) +(assert_return (invoke "f64x2.splat" (f64.const -0x1.fffffffffffffp+1023)) (v128.const 
f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) +(assert_return (invoke "f64x2.splat" (f64.const inf)) (v128.const f64x2 inf inf)) +(assert_return (invoke "f64x2.splat" (f64.const -inf)) (v128.const f64x2 -inf -inf)) +(assert_return (invoke "f64x2.splat" (f64.const nan)) (v128.const f64x2 nan nan)) +(assert_return (invoke "f64x2.splat" (f64.const -nan)) (v128.const f64x2 -nan -nan)) +(assert_return (invoke "f64x2.splat" (f64.const nan:0x4000000000000)) (v128.const f64x2 nan:0x4000000000000 nan:0x4000000000000)) +(assert_return (invoke "f64x2.splat" (f64.const -nan:0x4000000000000)) (v128.const f64x2 -nan:0x4000000000000 -nan:0x4000000000000)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789)) (v128.const f64x2 0123456789 0123456789)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789.)) (v128.const f64x2 0123456789. 0123456789.)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef)) (v128.const f64x2 0x0123456789ABCDEFabcdef 0x0123456789ABCDEFabcdef)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.)) (v128.const f64x2 0x0123456789ABCDEFabcdef. 
0x0123456789ABCDEFabcdef.)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789e019)) (v128.const f64x2 0123456789e019 0123456789e019)) +(assert_return (invoke "f64x2.splat" (f64.const 0123456789e+019)) (v128.const f64x2 0123456789e+019 0123456789e+019)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.p019)) (v128.const f64x2 0x0123456789ABCDEFabcdef.p019 0x0123456789ABCDEFabcdef.p019)) +(assert_return (invoke "f64x2.splat" (f64.const 0x0123456789ABCDEFabcdef.p-019)) (v128.const f64x2 0x0123456789ABCDEFabcdef.p-019 0x0123456789ABCDEFabcdef.p-019)) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_0.wast b/tests/latch/core/simd_splat_0.wast new file mode 100644 index 00000000..39b3b563 --- /dev/null +++ b/tests/latch/core/simd_splat_0.wast @@ -0,0 +1,8 @@ +(module + (func (export "i8x16.splat") (param i32) (result v128) (i8x16.splat (local.get 0))) + (func (export "i16x8.splat") (param i32) (result v128) (i16x8.splat (local.get 0))) + (func (export "i32x4.splat") (param i32) (result v128) (i32x4.splat (local.get 0))) + (func (export "f32x4.splat") (param f32) (result v128) (f32x4.splat (local.get 0))) + (func (export "i64x2.splat") (param i64) (result v128) (i64x2.splat (local.get 0))) + (func (export "f64x2.splat") (param f64) (result v128) (f64x2.splat (local.get 0))) +) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_1.asserts.wast b/tests/latch/core/simd_splat_1.asserts.wast new file mode 100644 index 00000000..3d892695 --- /dev/null +++ b/tests/latch/core/simd_splat_1.asserts.wast @@ -0,0 +1,5 @@ +(assert_return (invoke "as-v128_store-operand-1" (i32.const 1)) (v128.const i8x16 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1)) +(assert_return (invoke "as-v128_store-operand-2" (i32.const 256)) (v128.const i16x8 0x100 0x100 0x100 0x100 0x100 0x100 0x100 0x100)) +(assert_return (invoke "as-v128_store-operand-3" (i32.const 0xffffffff)) (v128.const i32x4 -1 -1 -1 -1)) +(assert_return (invoke 
"as-v128_store-operand-4" (i64.const 1)) (v128.const i64x2 1 1)) +(assert_return (invoke "as-v128_store-operand-5" (f64.const -0x1p+0)) (v128.const f64x2 -0x1p+0 -0x1p+0)) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_1.wast b/tests/latch/core/simd_splat_1.wast new file mode 100644 index 00000000..bc6be2e8 --- /dev/null +++ b/tests/latch/core/simd_splat_1.wast @@ -0,0 +1,17 @@ +(module (memory 1) + (func (export "as-v128_store-operand-1") (param i32) (result v128) + (v128.store (i32.const 0) (i8x16.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-2") (param i32) (result v128) + (v128.store (i32.const 0) (i16x8.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-3") (param i32) (result v128) + (v128.store (i32.const 0) (i32x4.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-4") (param i64) (result v128) + (v128.store (i32.const 0) (i64x2.splat (local.get 0))) + (v128.load (i32.const 0))) + (func (export "as-v128_store-operand-5") (param f64) (result v128) + (v128.store (i32.const 0) (f64x2.splat (local.get 0))) + (v128.load (i32.const 0))) +) \ No newline at end of file diff --git a/tests/latch/core/simd_splat_2.asserts.wast b/tests/latch/core/simd_splat_2.asserts.wast new file mode 100644 index 00000000..6e7f0fe3 --- /dev/null +++ b/tests/latch/core/simd_splat_2.asserts.wast @@ -0,0 +1,51 @@ +(assert_return (invoke "as-i8x16_extract_lane_s-operand-first" (i32.const 42)) (i32.const 42)) +(assert_return (invoke "as-i8x16_extract_lane_s-operand-last" (i32.const -42)) (i32.const -42)) +(assert_return (invoke "as-i16x8_extract_lane_s-operand-first" (i32.const 0xffff7fff)) (i32.const 32767)) +(assert_return (invoke "as-i16x8_extract_lane_s-operand-last" (i32.const 0x8000)) (i32.const -32768)) +(assert_return (invoke "as-i32x4_extract_lane_s-operand-first" (i32.const 0x7fffffff)) (i32.const 2147483647)) +(assert_return (invoke 
"as-i32x4_extract_lane_s-operand-last" (i32.const 0x80000000)) (i32.const -2147483648)) +(assert_return (invoke "as-f32x4_extract_lane_s-operand-first" (f32.const 1.5)) (f32.const 1.5)) +(assert_return (invoke "as-f32x4_extract_lane_s-operand-last" (f32.const -0.25)) (f32.const -0.25)) +(assert_return (invoke "as-v8x16_swizzle-operands" (i32.const 1) (i32.const -1)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i64x2_extract_lane-operand-last" (i64.const -42)) (i64.const -42)) +(assert_return (invoke "as-i64x2_extract_lane-operand-first" (i64.const 42)) (i64.const 42)) +(assert_return (invoke "as-f64x2_extract_lane-operand-first" (f64.const 1.5)) (f64.const 1.5)) +(assert_return (invoke "as-f64x2_extract_lane-operand-last" (f64.const -0x1p+0)) (f64.const -0x1p+0)) + +(assert_return (invoke "as-i8x16_add_sub-operands" (i32.const 3) (i32.const 2) (i32.const 1)) (v128.const i8x16 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4)) +(assert_return (invoke "as-i16x8_add_sub_mul-operands" (i32.const 257) (i32.const 128) (i32.const 16) (i32.const 16)) (v128.const i16x8 129 129 129 129 129 129 129 129)) +(assert_return (invoke "as-i32x4_add_sub_mul-operands" (i32.const 65535) (i32.const 65537) (i32.const 256) (i32.const 256)) (v128.const i32x4 0x10000 0x10000 0x10000 0x10000)) +(assert_return (invoke "as-i64x2_add_sub_mul-operands" (i64.const 0x7fffffff) (i64.const 0x1_0000_0001) (i64.const 65536) (i64.const 65536)) (v128.const i64x2 0x8000_0000 0x8000_0000)) +(assert_return (invoke "as-f64x2_add_sub_mul-operands" (f64.const 0x1p-1) (f64.const 0.75) (f64.const 0x1p-1) (f64.const 0.5)) (v128.const f64x2 0x1p+0 0x1p+0)) + +(assert_return (invoke "as-i8x16_add_sat_s-operands" (i32.const 0x7f) (i32.const 1)) (v128.const i8x16 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f 0x7f)) +(assert_return (invoke "as-i16x8_add_sat_s-operands" (i32.const 0x7fff) (i32.const 1)) (v128.const i16x8 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 0x7fff 
0x7fff)) +(assert_return (invoke "as-i8x16_sub_sat_u-operands" (i32.const 0x7f) (i32.const 0xff)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i16x8_sub_sat_u-operands" (i32.const 0x7fff) (i32.const 0xffff)) (v128.const i16x8 0 0 0 0 0 0 0 0)) + +(assert_return (invoke "as-i8x16_shr_s-operand" (i32.const 0xf0) (i32.const 3)) (v128.const i8x16 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2)) +(assert_return (invoke "as-i16x8_shr_s-operand" (i32.const 0x100) (i32.const 4)) (v128.const i16x8 16 16 16 16 16 16 16 16)) +(assert_return (invoke "as-i32x4_shr_s-operand" (i32.const -1) (i32.const 16)) (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "as-v128_and-operands" (i32.const 0x11) (i32.const 0xff)) (v128.const i8x16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17)) +(assert_return (invoke "as-v128_or-operands" (i32.const 0) (i32.const 0xffff)) (v128.const i16x8 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff)) +(assert_return (invoke "as-v128_xor-operands" (i32.const 0xf0f0f0f0) (i32.const 0xffffffff)) (v128.const i32x4 0xf0f0f0f 0xf0f0f0f 0xf0f0f0f 0xf0f0f0f)) + +(assert_return (invoke "as-i8x16_all_true-operand" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "as-i16x8_all_true-operand" (i32.const 0xffff)) (i32.const 1)) +(assert_return (invoke "as-i32x4_all_true-operand1" (i32.const 0xf0f0f0f0)) (i32.const 1)) +(assert_return (invoke "as-i32x4_all_true-operand2" (i64.const -1)) (i32.const 1)) + +(assert_return (invoke "as-i8x16_eq-operands" (i32.const 1) (i32.const 2)) (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) +(assert_return (invoke "as-i16x8_eq-operands" (i32.const -1) (i32.const 65535)) (v128.const i16x8 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff 0xffff)) +(assert_return (invoke "as-i32x4_eq-operands1" (i32.const -1) (i32.const 0xffffffff)) (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)) +(assert_return (invoke "as-f32x4_eq-operands" (f32.const +0.0) (f32.const -0.0)) 
(v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)) +(assert_return (invoke "as-i32x4_eq-operands2" (i64.const 1) (i64.const 2)) (v128.const i64x2 0xffffffff00000000 0xffffffff00000000)) +(assert_return (invoke "as-f64x2_eq-operands" (f64.const +0.0) (f64.const -0.0)) (v128.const i64x2 -1 -1)) + +(assert_return (invoke "as-f32x4_abs-operand" (f32.const -1.125)) (v128.const f32x4 1.125 1.125 1.125 1.125)) +(assert_return (invoke "as-f32x4_min-operands" (f32.const 0.25) (f32.const 1e-38)) (v128.const f32x4 1e-38 1e-38 1e-38 1e-38)) +(assert_return (invoke "as-f32x4_div-operands" (f32.const 1.0) (f32.const 8.0)) (v128.const f32x4 0.125 0.125 0.125 0.125)) + +(assert_return (invoke "as-f32x4_convert_s_i32x4-operand" (i32.const 12345)) (v128.const f32x4 12345.0 12345.0 12345.0 12345.0)) +(assert_return (invoke "as-i32x4_trunc_s_f32x4_sat-operand" (f32.const 1.1)) (v128.const i32x4 1 1 1 1)) diff --git a/tests/latch/core/simd_splat_2.wast b/tests/latch/core/simd_splat_2.wast new file mode 100644 index 00000000..819ef5c0 --- /dev/null +++ b/tests/latch/core/simd_splat_2.wast @@ -0,0 +1,119 @@ +(module + ;; Accessing lane + (func (export "as-i8x16_extract_lane_s-operand-first") (param i32) (result i32) + (i8x16.extract_lane_s 0 (i8x16.splat (local.get 0)))) + (func (export "as-i8x16_extract_lane_s-operand-last") (param i32) (result i32) + (i8x16.extract_lane_s 15 (i8x16.splat (local.get 0)))) + (func (export "as-i16x8_extract_lane_s-operand-first") (param i32) (result i32) + (i16x8.extract_lane_s 0 (i16x8.splat (local.get 0)))) + (func (export "as-i16x8_extract_lane_s-operand-last") (param i32) (result i32) + (i16x8.extract_lane_s 7 (i16x8.splat (local.get 0)))) + (func (export "as-i32x4_extract_lane_s-operand-first") (param i32) (result i32) + (i32x4.extract_lane 0 (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_extract_lane_s-operand-last") (param i32) (result i32) + (i32x4.extract_lane 3 (i32x4.splat (local.get 0)))) + (func (export 
"as-f32x4_extract_lane_s-operand-first") (param f32) (result f32) + (f32x4.extract_lane 0 (f32x4.splat (local.get 0)))) + (func (export "as-f32x4_extract_lane_s-operand-last") (param f32) (result f32) + (f32x4.extract_lane 3 (f32x4.splat (local.get 0)))) + (func (export "as-v8x16_swizzle-operands") (param i32) (param i32) (result v128) + (i8x16.swizzle (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i64x2_extract_lane-operand-first") (param i64) (result i64) + (i64x2.extract_lane 0 (i64x2.splat (local.get 0)))) + (func (export "as-i64x2_extract_lane-operand-last") (param i64) (result i64) + (i64x2.extract_lane 1 (i64x2.splat (local.get 0)))) + (func (export "as-f64x2_extract_lane-operand-first") (param f64) (result f64) + (f64x2.extract_lane 0 (f64x2.splat (local.get 0)))) + (func (export "as-f64x2_extract_lane-operand-last") (param f64) (result f64) + (f64x2.extract_lane 1 (f64x2.splat (local.get 0)))) + + ;; Integer arithmetic + (func (export "as-i8x16_add_sub-operands") (param i32 i32 i32) (result v128) + (i8x16.add (i8x16.splat (local.get 0)) + (i8x16.sub (i8x16.splat (local.get 1)) (i8x16.splat (local.get 2))))) + (func (export "as-i16x8_add_sub_mul-operands") (param i32 i32 i32 i32) (result v128) + (i16x8.add (i16x8.splat (local.get 0)) + (i16x8.sub (i16x8.splat (local.get 1)) + (i16x8.mul (i16x8.splat (local.get 2)) (i16x8.splat (local.get 3)))))) + (func (export "as-i32x4_add_sub_mul-operands") (param i32 i32 i32 i32) (result v128) + (i32x4.add (i32x4.splat (local.get 0)) + (i32x4.sub (i32x4.splat (local.get 1)) + (i32x4.mul (i32x4.splat (local.get 2)) (i32x4.splat (local.get 3)))))) + + (func (export "as-i64x2_add_sub_mul-operands") (param i64 i64 i64 i64) (result v128) + (i64x2.add (i64x2.splat (local.get 0)) + (i64x2.sub (i64x2.splat (local.get 1)) + (i64x2.mul (i64x2.splat (local.get 2)) (i64x2.splat (local.get 3)))))) + (func (export "as-f64x2_add_sub_mul-operands") (param f64 f64 f64 f64) (result v128) + (f64x2.add 
(f64x2.splat (local.get 0)) + (f64x2.sub (f64x2.splat (local.get 1)) + (f64x2.mul (f64x2.splat (local.get 2)) (f64x2.splat (local.get 3)))))) + + ;; Saturating integer arithmetic + (func (export "as-i8x16_add_sat_s-operands") (param i32 i32) (result v128) + (i8x16.add_sat_s (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_add_sat_s-operands") (param i32 i32) (result v128) + (i16x8.add_sat_s (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-i8x16_sub_sat_u-operands") (param i32 i32) (result v128) + (i8x16.sub_sat_u (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_sub_sat_u-operands") (param i32 i32) (result v128) + (i16x8.sub_sat_u (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + + ;; Bit shifts + (func (export "as-i8x16_shr_s-operand") (param i32 i32) (result v128) + (i8x16.shr_s (i8x16.splat (local.get 0)) (local.get 1))) + (func (export "as-i16x8_shr_s-operand") (param i32 i32) (result v128) + (i16x8.shr_s (i16x8.splat (local.get 0)) (local.get 1))) + (func (export "as-i32x4_shr_s-operand") (param i32 i32) (result v128) + (i32x4.shr_s (i32x4.splat (local.get 0)) (local.get 1))) + + ;; Bitwise operantions + (func (export "as-v128_and-operands") (param i32 i32) (result v128) + (v128.and (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-v128_or-operands") (param i32 i32) (result v128) + (v128.or (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-v128_xor-operands") (param i32 i32) (result v128) + (v128.xor (i32x4.splat (local.get 0)) (i32x4.splat (local.get 1)))) + + ;; Boolean horizontal reductions + (func (export "as-i8x16_all_true-operand") (param i32) (result i32) + (i8x16.all_true (i8x16.splat (local.get 0)))) + (func (export "as-i16x8_all_true-operand") (param i32) (result i32) + (i16x8.all_true (i16x8.splat (local.get 0)))) + (func (export "as-i32x4_all_true-operand1") (param i32) (result i32) + 
(i32x4.all_true (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_all_true-operand2") (param i64) (result i32) + (i32x4.all_true (i64x2.splat (local.get 0)))) + + ;; Comparisons + (func (export "as-i8x16_eq-operands") (param i32 i32) (result v128) + (i8x16.eq (i8x16.splat (local.get 0)) (i8x16.splat (local.get 1)))) + (func (export "as-i16x8_eq-operands") (param i32 i32) (result v128) + (i16x8.eq (i16x8.splat (local.get 0)) (i16x8.splat (local.get 1)))) + (func (export "as-i32x4_eq-operands1") (param i32 i32) (result v128) + (i32x4.eq (i32x4.splat (local.get 0)) (i32x4.splat (local.get 1)))) + (func (export "as-i32x4_eq-operands2") (param i64 i64) (result v128) + (i32x4.eq (i64x2.splat (local.get 0)) (i64x2.splat (local.get 1)))) + (func (export "as-f32x4_eq-operands") (param f32 f32) (result v128) + (f32x4.eq (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + (func (export "as-f64x2_eq-operands") (param f64 f64) (result v128) + (f64x2.eq (f64x2.splat (local.get 0)) (f64x2.splat (local.get 1)))) + + ;; Floating-point sign bit operations + (func (export "as-f32x4_abs-operand") (param f32) (result v128) + (f32x4.abs (f32x4.splat (local.get 0)))) + + ;; Floating-point min + (func (export "as-f32x4_min-operands") (param f32 f32) (result v128) + (f32x4.min (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + + ;; Floating-point arithmetic + (func (export "as-f32x4_div-operands") (param f32 f32) (result v128) + (f32x4.div (f32x4.splat (local.get 0)) (f32x4.splat (local.get 1)))) + + ;; Conversions + (func (export "as-f32x4_convert_s_i32x4-operand") (param i32) (result v128) + (f32x4.convert_i32x4_s (i32x4.splat (local.get 0)))) + (func (export "as-i32x4_trunc_s_f32x4_sat-operand") (param f32) (result v128) + (i32x4.trunc_sat_f32x4_s (f32x4.splat (local.get 0)))) +) \ No newline at end of file diff --git a/tests/latch/latch-0.3.0.tgz b/tests/latch/latch-0.3.0.tgz index b916137f..9f1913bd 100644 Binary files a/tests/latch/latch-0.3.0.tgz and 
b/tests/latch/latch-0.3.0.tgz differ diff --git a/tests/latch/package-lock.json b/tests/latch/package-lock.json index 707ca38d..f327001d 100644 --- a/tests/latch/package-lock.json +++ b/tests/latch/package-lock.json @@ -276,9 +276,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.5.5", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.5.5.tgz", - "integrity": "sha512-Xjs4y5UPO/CLdzpgR6GirZJx36yScjh73+2NlLlkFRSoQN8B0DpfXPdZGnvVmLRLOsqDpOfTNv7D9trgGhmOIA==", + "version": "22.7.5", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.7.5.tgz", + "integrity": "sha512-jML7s2NAzMWc//QSJ1a3prpk78cOPchGvXJsC3C6R6PSMoooztvRVQEz89gmBTBY1SPMaqo5teB4uNHPdetShQ==", "dev": true, "license": "MIT", "peer": true, @@ -699,7 +699,7 @@ "node_modules/latch": { "version": "0.3.0", "resolved": "file:latch-0.3.0.tgz", - "integrity": "sha512-4ZVbuSUb5lmSvcNWfGDnxrOUoFq7uTL62CvfglTunjPig5r5f0daTWd76CwdylxQobInUH3RVXSx8qIZ1HjTmA==", + "integrity": "sha512-ve2IHiwGMEVRONgAeIQPW8mEkcHJqRC5t7lnPExehk9uLBW6VQdlGV+GxKNdTTANdsnYjz/dsLPfTcVOWLdFrA==", "dev": true, "dependencies": { "ansi-colors": "^4.1.3", @@ -707,9 +707,6 @@ "ora": "^8.0.1", "source-map": "^0.7.4", "ts-node": "^10.5.0" - }, - "bin": { - "latch": "npx ts-node" } }, "node_modules/leven": { diff --git a/tests/latch/src/spec.test.ts b/tests/latch/src/spec.test.ts index ceb4a6c5..1320c228 100644 --- a/tests/latch/src/spec.test.ts +++ b/tests/latch/src/spec.test.ts @@ -68,7 +68,6 @@ spec.tests(tests); framework.run([spec]); // Helper function - function createTest(module: string, asserts: string[]): TestScenario { const steps: Step[] = []; diff --git a/tests/latch/src/util/spec.util.ts b/tests/latch/src/util/spec.util.ts index 07f034a1..8544549b 100644 --- a/tests/latch/src/util/spec.util.ts +++ b/tests/latch/src/util/spec.util.ts @@ -5,6 +5,88 @@ interface Cursor { value: number; } +function float32HexStr(x: number): string { + const ab = new ArrayBuffer(4); + const fb = new 
Float32Array(ab); + fb[0] = x; + const ui8 = new Uint8Array(ab); + let res = ''; + for (let i = 3; i >= 0; i--) { + res += ui8[i].toString(16).padStart(2, '0'); + } + return res; +} + +function float64HexStr(x: number): string { + const ab = new ArrayBuffer(8); + const fb = new Float64Array(ab); + fb[0] = x; + const ui8 = new Uint8Array(ab); + let res = ''; + for (let i = 7; i >= 0; i--) { + res += ui8[i].toString(16).padStart(2, '0'); + } + return res; +} + +function parseWasmFloat32(str: string) { + const strBuf = str.replace(/_/gi, ''); // remove those damned underscores + + const flt = parseHexFloat(strBuf); + const res = float32HexStr(flt); + return res; +} + +function parseWasmFloat64(str: string): string|undefined { + const strBuf = str.replace(/_/gi, ''); // remove those damned underscores + + // just use hexFloat - it works for 64-bit floats + const flt = parseHexFloat(strBuf); + const res = float64HexStr(flt); + return res; +} + +function parseV128(type: string, args: string[]): string | undefined { + const int_lambda = (bit_width: number, mask: bigint): string|undefined => { + const elems = 128 / bit_width; + const pad_len = bit_width / 4; + + if(args.length !== elems) return undefined; + const res = args + .map(str => str.replace(/_/gi, '')) // WASM allows _ in numbers, TS doesn't like those + .map(str => { + let start_idx = 0; + let sign = 1; + if(str.startsWith('-')) { sign = -1; start_idx = 1; } + else if(str.startsWith('+')) { start_idx = 1; } + return BigInt(sign) * BigInt(str.slice(start_idx)) + }) // parse to (big)-int + .map(num => num & mask) // ensure correct bit width + .map(num => num.toString(16).padStart(pad_len, '0')) // convert to hex + .reduce((acc, val) => acc + val, ''); // concat + return res; + }; + + const float_lambda = (elem_count: number, parser: (s: string) => string|undefined): string|undefined => { + if(args.length !== elem_count) return undefined; + const parsed = args.map(str => parser(str)); + if(parsed.some(str => str === 
undefined)) return undefined; + return parsed.reduce((acc, val) => acc + (val as string), ''); + } + + switch(type) { + case 'i8x16': return int_lambda(8, 0x00000000000000ffn); + case 'i16x8': return int_lambda(16, 0x000000000000ffffn); + case 'i32x4': return int_lambda(32, 0x00000000ffffffffn); + case 'i64x2': return int_lambda(64, 0xffffffffffffffffn); + + case 'f32x4': return float_lambda(4, parseWasmFloat32); + case 'f64x2': return float_lambda(2, parseWasmFloat64); + + default: return undefined; + } +} + export function parseResult(input: string): WASM.Value | undefined { let cursor = 0; let delta: number = consume(input, cursor, /\(/d); @@ -22,6 +104,9 @@ export function parseResult(input: string): WASM.Value | undefined { delta = consume(input, cursor, /^[^)]*/d); if (type === WASM.Type.f32 || type === WASM.Type.f64) { value = parseHexFloat(input.slice(cursor, cursor + delta)); + } else if(type === WASM.Type.v128) { + const slice = input.slice(cursor, cursor + delta).split(' ').filter(x => x.trim() !== ''); // [dim, arg1, ...] + value = parseV128(slice[0], slice.slice(1)); } else { value = parseInteger(input.slice(cursor, cursor + delta)); } @@ -49,7 +134,7 @@ export function parseArguments(input: string, index: Cursor): WASM.Value[] { cursor += delta + consume(input, cursor + delta, /^[^)]*const /d); delta = consume(input, cursor, /^[^)]*/d); - let maybe: number | undefined; + let maybe: number | bigint | undefined; if (type === WASM.Type.f32 || type === WASM.Type.f64) { maybe = parseHexFloat(input.slice(cursor, cursor + delta)); } else { @@ -106,6 +191,10 @@ function parseHexFloat(input: string): number { return Infinity; } + if (input.includes('e')) { + return parseFloat(input); + } + const radix: number = input.includes('0x') ? 
16 : 10; let base: string = input, mantissa, exponent = 0; @@ -127,16 +216,12 @@ function parseHexFloat(input: string): number { return mantissa * Math.pow(2, exponent); } -function parseInteger(hex: string, bytes: number = 4): number { - if (!hex.includes('0x')) { - return parseInt(hex); +function parseInteger(hexU: string, bytes: number = 4): bigint { + const hex = hexU.replace(/_/g, ''); + if(hex.startsWith('-')) { + return BigInt(-1) * BigInt(hex.slice(1)); } - const mask = parseInt('0x80' + '00'.repeat(bytes - 1), 16); - let integer = parseInt(hex, 16); - if (integer >= mask) { - integer = integer - mask * 2; - } - return integer; + return BigInt(hex); } export function find(regex: RegExp, input: string) { @@ -145,4 +230,4 @@ export function find(regex: RegExp, input: string) { return ''; } return match[1]; -} \ No newline at end of file +} diff --git a/tutorials/c/.gitignore b/tutorials/c/.gitignore new file mode 100644 index 00000000..3d758890 --- /dev/null +++ b/tutorials/c/.gitignore @@ -0,0 +1 @@ +!Makefile \ No newline at end of file diff --git a/tutorials/c/Makefile b/tutorials/c/Makefile new file mode 100644 index 00000000..87d87599 --- /dev/null +++ b/tutorials/c/Makefile @@ -0,0 +1,9 @@ +all: add.wasm matmul.wasm +.PHONY: all + +%.wasm: %.c Makefile + clang --target=wasm32 -msimd128 -O3 -flto -nostdlib -Wl,--no-entry -Wl,--export-all -o $@ $< + +clean: + rm -f *.wasm +.PHONY: clean \ No newline at end of file diff --git a/tutorials/c/add.c b/tutorials/c/add.c new file mode 100644 index 00000000..46bc7366 --- /dev/null +++ b/tutorials/c/add.c @@ -0,0 +1,18 @@ +// +// Created by jay on 10/10/24. 
+// + +#include "wasm_simd128.h" + +// static unsigned char *memory; // 1 WASM page + +__attribute__((import_module("env"), import_name("print_int"))) +extern void print_int(int value); + +typedef v128_t v128; + +__attribute__((noinline)) +int add(const int a, const int b) { return a * a + b; } + +__attribute__((export_name("main"))) +void _start() { print_int(add(4, 2)); } \ No newline at end of file diff --git a/tutorials/c/matmul.c b/tutorials/c/matmul.c new file mode 100644 index 00000000..9baaf97b --- /dev/null +++ b/tutorials/c/matmul.c @@ -0,0 +1,140 @@ +// +// Created by jay on 10/10/24. +// +#include "stdint.h" +#include "wasm_simd128.h" + +static unsigned char *memory[1024]; + +__attribute__((import_module("env"), import_name("print_int"))) +extern void print_int(int value); + +/** + * Computes a * b + c (lane-wise) + */ +__attribute__((noinline)) +v128_t vmlaq_u16(const v128_t a, const v128_t b, const v128_t c) { + return wasm_i16x8_add(wasm_i16x8_mul(a, b), c); +} + +/** + * Computes (a >> shift) + b (lane-wise) + */ +__attribute__((noinline)) +v128_t vsraq_n_u16(const v128_t a, const v128_t b, const unsigned int shift) { + return wasm_i16x8_add(b, wasm_i16x8_shr(a, shift)); +} + +/** + * Computes the sum of all lanes in a vector + */ +__attribute__((noinline)) +short vaddvq_u16(v128_t v) { +#define SWIZZLE_MASK0 wasm_i8x16_const(0x4, 0x5, 0x6, 0x7, -1, -1, -1, -1, 0xc, 0xd, 0xe, 0xf, -1, -1, -1, -1) +#define SWIZZLE_MASK1 wasm_i8x16_const(0x8, 0x9, 0xa, 0xb, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) + + v = wasm_i32x4_extadd_pairwise_i16x8(v); + v = wasm_i32x4_add(v, wasm_i8x16_swizzle(v, SWIZZLE_MASK0)); + v = wasm_i32x4_add(v, wasm_i8x16_swizzle(v, SWIZZLE_MASK1)); + return wasm_i32x4_extract_lane(v, 0); +} + +__attribute__((noinline)) +void print_v128(const v128_t v) { + print_int(wasm_i16x8_extract_lane(v, 0)); + print_int(wasm_i16x8_extract_lane(v, 1)); + print_int(wasm_i16x8_extract_lane(v, 2)); + print_int(wasm_i16x8_extract_lane(v, 3)); + 
print_int(wasm_i16x8_extract_lane(v, 4)); + print_int(wasm_i16x8_extract_lane(v, 5)); + print_int(wasm_i16x8_extract_lane(v, 6)); + print_int(wasm_i16x8_extract_lane(v, 7)); +} + +__attribute__((noinline)) +v128_t vmovq_n_u16(const unsigned short n) { + return wasm_i16x8_splat(n); +} + +__attribute__((noinline)) +v128_t vld1q_u16(const unsigned short *ptr) { + return wasm_v128_load(ptr); +} + +__attribute__((noinline)) +v128_t vandq_u16(const v128_t a, const v128_t b) { + return wasm_v128_and(a, b); +} + +unsigned short small_dot(const unsigned short w, const unsigned short a) { + return ((w * a) & 0x0f00) >> 8; +} + +unsigned short small_vec_dot(const v128_t w, const v128_t a) { + return vaddvq_u16(wasm_i16x8_shr(vandq_u16(wasm_i16x8_mul(w, a), wasm_i16x8_splat(0x0f00)), 8)); +} + +unsigned short small_vec_dot_ptr(const unsigned short *w, const unsigned short *a) { + v128_t local = vmovq_n_u16(0); + v128_t sum = vmovq_n_u16(0); + const v128_t v_w = vld1q_u16(w); + const v128_t v_a = vld1q_u16(a); + local = vmlaq_u16(v_w, v_a, local); + local = vandq_u16(local, wasm_i16x8_splat(0x0f00)); + sum = vsraq_n_u16(local, sum, 8); + return vaddvq_u16(sum); +} + +__attribute__((noinline)) +int dot(const unsigned short *w, const unsigned short *a, const int k) { + // expect w=1, a=1, k%(depth*iter*8)=0 +#define DEPTH 2 +#define ITER 2 +#define MASK vmovq_n_u16(0x0f00) + // expect k % 32 == 0 + + v128_t sum = vmovq_n_u16(0); + for(int i = 0; i < k; i += 8 * DEPTH * ITER) { + v128_t local = vmovq_n_u16(0); + for(int j = 0; j < ITER; j++) { + v128_t vecw = vld1q_u16(w); + v128_t veca = vld1q_u16(a); + local = vmlaq_u16(vecw, veca, local); + w += 8; + a += 8; + } + local = vandq_u16(local, MASK); + sum = vsraq_n_u16(local, sum, 8); + } + return vaddvq_u16(sum); +} + +__attribute__((export_name("main"))) +void _start() { + const unsigned short packed_w[16] = { + 0x0001, 0x0001, 0x0001, 0x0001, 0x0000, 0x0101, 0x0202, 0x0303, + 0x0302, 0x0100, 0x0201, 0x0001, 0x0201, 0x0201, 
0x0201, 0x0201 + }; + + const unsigned short packed_a[16] = { + 0x0100, 0x0002, 0x0201, 0x0100, 0x0100, 0x0200, 0x0300, 0x0000, + 0x0100, 0x0200, 0x0300, 0x0400, 0x0001, 0x0001, 0x0001, 0x0001 + }; + + print_int(dot(packed_w, packed_a, 32)); // OK! + + // print_int(small_dot(0x0302, 0x0104)); // OK! + + // print_int(small_vec_dot( + // wasm_i16x8_const(0x0001, 0x0203, 0x0001, 0x0203, 0x0302, 0x0100, 0x0302, 0x0100), + // wasm_i16x8_const(0x0001, 0x0001, 0x0001, 0x0001, 0x0201, 0x0201, 0x0201, 0x0201) + // )); // OK! + + // const unsigned short w[16] = { + // 0x0001, 0x0203, 0x0001, 0x0203, 0x0302, 0x0100, 0x0302, 0x0100 + // }; + // const unsigned short a[16] = { + // 0x0001, 0x0001, 0x0001, 0x0001, 0x0201, 0x0201, 0x0201, 0x0201 + // }; + // print_int(small_vec_dot_ptr(w, a)); // OK! +} \ No newline at end of file diff --git a/tutorials/wat/main/simd.wat b/tutorials/wat/main/simd.wat new file mode 100644 index 00000000..8bcc1fd3 --- /dev/null +++ b/tutorials/wat/main/simd.wat @@ -0,0 +1,169 @@ +(module + (import "env" "print_int" (func $print.int (type $i32->void))) + (type $void->void (func)) + (type $i32->void (func (param i32))) + (type $v128->v128->v128->v128 (func (param v128) (param v128) (param v128) (result v128))) + (type $v128->v128->i32->v128 (func (param v128) (param v128) (param i32) (result v128))) + (type $v128->i16 (func (param v128) (result i32))) + (type $v128->void (func (param v128))) + (memory 1) + + (func $vmlaq_u16 (type $v128->v128->v128->v128) + (i16x8.mul (local.get 0) (local.get 1)) + (i16x8.add (local.get 2)) + ) + + (func $vsraq_n_u16 (type $v128->v128->i32->v128) + (i16x8.shr_s (local.get 0) (local.get 2)) + (i16x8.add (local.get 1)) + ) + + (func $print.v128 (type $v128->void) + local.get 0 + i16x8.extract_lane_s 0 + call $print.int + + local.get 0 + i16x8.extract_lane_s 1 + call $print.int + + local.get 0 + i16x8.extract_lane_s 2 + call $print.int + + local.get 0 + i16x8.extract_lane_s 3 + call $print.int + + local.get 0 + 
i16x8.extract_lane_s 4 + call $print.int + + local.get 0 + i16x8.extract_lane_s 5 + call $print.int + + local.get 0 + i16x8.extract_lane_s 6 + call $print.int + + local.get 0 + i16x8.extract_lane_s 7 + call $print.int + ) + + (func $print.v128.i8x16 (type $v128->void) + local.get 0 + i8x16.extract_lane_s 0 + call $print.int + + local.get 0 + i8x16.extract_lane_s 1 + call $print.int + + local.get 0 + i8x16.extract_lane_s 2 + call $print.int + + local.get 0 + i8x16.extract_lane_s 3 + call $print.int + + local.get 0 + i8x16.extract_lane_s 4 + call $print.int + + local.get 0 + i8x16.extract_lane_s 5 + call $print.int + + local.get 0 + i8x16.extract_lane_s 6 + call $print.int + + local.get 0 + i8x16.extract_lane_s 7 + call $print.int + + local.get 0 + i8x16.extract_lane_s 8 + call $print.int + + local.get 0 + i8x16.extract_lane_s 9 + call $print.int + + local.get 0 + i8x16.extract_lane_s 10 + call $print.int + + local.get 0 + i8x16.extract_lane_s 11 + call $print.int + + local.get 0 + i8x16.extract_lane_s 12 + call $print.int + + local.get 0 + i8x16.extract_lane_s 13 + call $print.int + + local.get 0 + i8x16.extract_lane_s 14 + call $print.int + + local.get 0 + i8x16.extract_lane_s 15 + call $print.int + ) + + (func $vaddvq_u16 (type $v128->i16) + local.get 0 + i32x4.extadd_pairwise_i16x8_u + local.tee 0 + i32x4.extract_lane 0 + local.get 0 + i32x4.extract_lane 1 + local.get 0 + i32x4.extract_lane 2 + local.get 0 + i32x4.extract_lane 3 + i32.add + i32.add + i32.add + ) + + (func $vaddvq_u16.swizzle (type $v128->i16) + local.get 0 + i32x4.extadd_pairwise_i16x8_u + local.tee 0 + v128.const i8x16 0x4 0x5 0x6 0x7 0xc 0xd 0xe 0xf 0x8 0x9 0xa 0xb -1 -1 -1 -1 + i8x16.swizzle + i32x4.add + i32x4.extract_lane 0 +) + + (; (func $vaddvq_u16.swizzle (type $v128->i16) ;) + (; local.get 0 ;) + (; i32x4.extadd_pairwise_i16x8_u ;) + (; local.tee 0 ;) + (; v128.const i8x16 0x4 0x5 0x6 0x7 -1 -1 -1 -1 0xc 0xd 0xe 0xf -1 -1 -1 -1 ;) + (; i8x16.swizzle ;) + (; local.get 0 ;) + (; i32x4.add 
;) + (; local.tee 0 ;) + (; v128.const i8x16 0x8 0x9 0xa 0xb -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ;) + (; i8x16.swizzle ;) + (; local.get 0 ;) + (; i32x4.add ;) + (; i32x4.extract_lane 0 ;) + (; ) ;) + + (func $run (type $void->void) + (call $vaddvq_u16.swizzle (v128.const i16x8 1 2 3 4 5 6 7 8)) + call $print.int + ) + + (export "main" (func $run)) +) diff --git a/tutorials/wat/main/simd2.wat b/tutorials/wat/main/simd2.wat new file mode 100644 index 00000000..2657e701 --- /dev/null +++ b/tutorials/wat/main/simd2.wat @@ -0,0 +1,52 @@ +(module + (import "env" "print_int" (func $print.int (type $i32->void))) + (type $void->void (func)) + (type $i32->void (func (param i32))) + (type $void->v128 (func (result v128))) + + (memory 1) + + (func $build (type $void->v128) + (i32.const 0) + i16x8.splat + i32.const 1 + i16x8.replace_lane 0 + i32.const 2 + i16x8.replace_lane 1 + i32.const 3 + i16x8.replace_lane 2 + i32.const 4 + i16x8.replace_lane 3 + i32.const 5 + i16x8.replace_lane 4 + i32.const 6 + i16x8.replace_lane 5 + i32.const 7 + i16x8.replace_lane 6 + i32.const 8 + i16x8.replace_lane 7 + ) + + (func $run (type $void->void) + (i32.const 123456) + i32x4.splat + i32x4.extract_lane 0 + call $print.int + + call $build + i16x8.extract_lane_s 0 + call $print.int + + call $build + i16x8.extract_lane_s 1 + call $print.int + + call $build + i16x8.extract_lane_s 2 + call $print.int + + (v128.store (i32.const 0) (i8x16.splat (i32.const 0))) + ) + + (export "main" (func $run)) +) \ No newline at end of file