From c7bf409f994b47afffb46413b69da81f6fb65fd3 Mon Sep 17 00:00:00 2001
From: Zoltan Herczeg <hzmester@freemail.hu>
Date: Tue, 1 Oct 2024 11:25:10 +0000
Subject: [PATCH] Implement relaxed simd in the interpreter

JIT tests are disabled

Signed-off-by: Zoltan Herczeg <zherczeg.u-szeged@partner.samsung.com>
---
 src/interpreter/ByteCode.h                    | 163 +++++++++----
 src/interpreter/Interpreter.cpp               |  83 ++++++-
 src/parser/WASMParser.cpp                     |  19 +-
 src/shell/Shell.cpp                           |  61 +++--
 src/util/MathOperation.h                      |  16 +-
 .../relaxed-simd/i16x8_relaxed_q15mulr_s.wast |  28 +++
 .../relaxed-simd/i32x4_relaxed_trunc.wast     | 124 ++++++++++
 .../relaxed-simd/i8x16_relaxed_swizzle.wast   |  45 ++++
 .../relaxed-simd/relaxed_dot_product.wast     | 107 +++++++++
 .../relaxed-simd/relaxed_laneselect.wast      | 103 ++++++++
 .../relaxed-simd/relaxed_madd_nmadd.wast      | 224 ++++++++++++++++++
 .../relaxed-simd/relaxed_min_max.wast         | 184 ++++++++++++++
 .../wabt/src/walrus/binary-reader-walrus.cc   |   2 +
 tools/jit_exclude_list.txt                    |   7 +
 14 files changed, 1091 insertions(+), 75 deletions(-)
 create mode 100644 test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast
 create mode 100644 test/extended/relaxed-simd/i32x4_relaxed_trunc.wast
 create mode 100644 test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast
 create mode 100644 test/extended/relaxed-simd/relaxed_dot_product.wast
 create mode 100644 test/extended/relaxed-simd/relaxed_laneselect.wast
 create mode 100644 test/extended/relaxed-simd/relaxed_madd_nmadd.wast
 create mode 100644 test/extended/relaxed-simd/relaxed_min_max.wast

diff --git a/src/interpreter/ByteCode.h b/src/interpreter/ByteCode.h
index 154e63836..acc7f1841 100644
--- a/src/interpreter/ByteCode.h
+++ b/src/interpreter/ByteCode.h
@@ -378,7 +378,7 @@ class FunctionType;
     F(I64X2ExtmulHighI32X4S, (simdExtmulOperation<int32_t, int64_t, false>))   \
     F(I64X2ExtmulLowI32X4U, (simdExtmulOperation<uint32_t, uint64_t, true>))   \
     F(I64X2ExtmulHighI32X4U, (simdExtmulOperation<uint32_t, uint64_t, false>)) \
-    F(I32X4DotI16X8S, (simdDotOperation))                                      \
+    F(I32X4DotI16X8S, (simdDotOperation<int16_t, uint32_t>))                   \
     F(I8X16NarrowI16X8S, (simdNarrowOperation<int16_t, int8_t>))               \
     F(I8X16NarrowI16X8U, (simdNarrowOperation<int16_t, uint8_t>))              \
     F(I16X8NarrowI32X4S, (simdNarrowOperation<int32_t, int16_t>))              \
@@ -588,30 +588,65 @@ class FunctionType;
     F(MemoryAtomicWait64)                 \
     F(AtomicFence)
 
-#define FOR_EACH_BYTECODE(F)                   \
-    FOR_EACH_BYTECODE_OP(F)                    \
-    FOR_EACH_BYTECODE_BINARY_OP(F)             \
-    FOR_EACH_BYTECODE_UNARY_OP(F)              \
-    FOR_EACH_BYTECODE_UNARY_OP_2(F)            \
-    FOR_EACH_BYTECODE_LOAD_OP(F)               \
-    FOR_EACH_BYTECODE_STORE_OP(F)              \
-    FOR_EACH_BYTECODE_SIMD_BINARY_OP(F)        \
-    FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F)  \
-    FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F)     \
-    FOR_EACH_BYTECODE_SIMD_UNARY_OP(F)         \
-    FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \
-    FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F)      \
-    FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F)    \
-    FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F)   \
-    FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F)     \
-    FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F)    \
-    FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F)  \
-    FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F)  \
-    FOR_EACH_BYTECODE_SIMD_ETC_OP(F)           \
-    FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F)        \
-    FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F)       \
-    FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F)         \
-    FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \
+#define FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F)                            \
+    F(I32X4RelaxedTruncF32X4S, (simdTruncSatOperation<float, int32_t>))          \
+    F(I32X4RelaxedTruncF32X4U, (simdTruncSatOperation<float, uint32_t>))         \
+    F(I32X4RelaxedTruncF64X2SZero, (simdTruncSatZeroOperation<double, int32_t>)) \
+    F(I32X4RelaxedTruncF64X2UZero, (simdTruncSatZeroOperation<double, uint32_t>))
+
+#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \
+    F(F32X4RelaxedMin, floatMin, float, float)      \
+    F(F32X4RelaxedMax, floatMax, float, float)      \
+    F(F64X2RelaxedMin, floatMin, double, double)    \
+    F(F64X2RelaxedMax, floatMax, double, double)    \
+    F(I16X8RelaxedQ15mulrS, saturatingRoundingQMul, int16_t, int16_t)
+
+#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F)      \
+    F(I8X16RelaxedSwizzle, (simdSwizzleOperation<uint8_t>)) \
+    F(I16X8DotI8X16I7X16S, (simdDotOperation<int8_t, uint16_t>))
+
+#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F)   \
+    F(F32X4RelaxedMadd, floatMulAdd, float, float)     \
+    F(F32X4RelaxedNmadd, floatNegMulAdd, float, float) \
+    F(F64X2RelaxedMadd, floatMulAdd, double, double)   \
+    F(F64X2RelaxedNmadd, floatNegMulAdd, double, double)
+
+#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
+    F(I32X4DotI8X16I7X16AddS, (simdDotAddOperation))    \
+    F(I8X16RelaxedLaneSelect, (simdBitSelectOperation)) \
+    F(I16X8RelaxedLaneSelect, (simdBitSelectOperation)) \
+    F(I32X4RelaxedLaneSelect, (simdBitSelectOperation)) \
+    F(I64X2RelaxedLaneSelect, (simdBitSelectOperation))
+
+#define FOR_EACH_BYTECODE(F)                        \
+    FOR_EACH_BYTECODE_OP(F)                         \
+    FOR_EACH_BYTECODE_BINARY_OP(F)                  \
+    FOR_EACH_BYTECODE_UNARY_OP(F)                   \
+    FOR_EACH_BYTECODE_UNARY_OP_2(F)                 \
+    FOR_EACH_BYTECODE_LOAD_OP(F)                    \
+    FOR_EACH_BYTECODE_STORE_OP(F)                   \
+    FOR_EACH_BYTECODE_SIMD_BINARY_OP(F)             \
+    FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F)       \
+    FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F)          \
+    FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F)     \
+    FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F)  \
+    FOR_EACH_BYTECODE_SIMD_UNARY_OP(F)              \
+    FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F)      \
+    FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F)   \
+    FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F)    \
+    FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
+    FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F)           \
+    FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F)         \
+    FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F)        \
+    FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F)          \
+    FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F)         \
+    FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F)       \
+    FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F)       \
+    FOR_EACH_BYTECODE_SIMD_ETC_OP(F)                \
+    FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F)             \
+    FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F)            \
+    FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F)              \
+    FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F)      \
     FOR_EACH_BYTECODE_ATOMIC_OTHER(F)
 
 class ByteCode {
@@ -726,6 +761,25 @@ class ByteCodeOffset2Value : public ByteCode {
     uint32_t m_value;
 };
 
+class ByteCodeOffset4 : public ByteCode { // bytecode carrying four stack offsets: three sources followed by one destination
+public:
+    ByteCodeOffset4(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset)
+        : ByteCode(opcode)
+        , m_stackOffsets{ src0Offset, src1Offset, src2Offset, dstOffset } // storage order: src0, src1, src2, dst
+    {
+    }
+
+    const ByteCodeStackOffset* srcOffsets() const { return m_stackOffsets; } // start of the array: entries [0..2] are the sources, entry [3] is the destination
+    ByteCodeStackOffset src0Offset() const { return m_stackOffsets[0]; }
+    ByteCodeStackOffset src1Offset() const { return m_stackOffsets[1]; }
+    ByteCodeStackOffset src2Offset() const { return m_stackOffsets[2]; }
+    ByteCodeStackOffset dstOffset() const { return m_stackOffsets[3]; }
+
+protected:
+    ByteCodeStackOffset m_stackOffsets[4]; // { src0, src1, src2, dst }
+};
+
+
 class ByteCodeOffset4Value : public ByteCode {
 public:
     ByteCodeOffset4Value(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset, uint32_t value)
@@ -923,15 +977,56 @@ class UnaryOperation : public ByteCodeOffset2 {
         DEFINE_UNARY_BYTECODE_DUMP(name)                                   \
     };
 
+// Base class of the bytecodes generated by DEFINE_TERNARY_BYTECODE: three source stack offsets and one destination offset.
+class TernaryOperation : public ByteCodeOffset4 {
+public:
+    TernaryOperation(Opcode code, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset)
+        : ByteCodeOffset4(code, src0Offset, src1Offset, src2Offset, dstOffset)
+    {
+    }
+
+#if !defined(NDEBUG)
+    void dump(size_t pos) // prints nothing here; the classes generated by DEFINE_TERNARY_BYTECODE declare their own dump()
+    {
+    }
+#endif
+};
+
+#if !defined(NDEBUG)
+#define DEFINE_TERNARY_BYTECODE_DUMP(name)                                                                                                                                                                        \
+    void dump(size_t pos)                                                                                                                                                                                         \
+    {                                                                                                                                                                                                             \
+        printf(#name " src1: %" PRIu32 " src2: %" PRIu32 " src3: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]); \
+    }
+#else
+#define DEFINE_TERNARY_BYTECODE_DUMP(name)
+#endif
+
+#define DEFINE_TERNARY_BYTECODE(name, ...)                                                                                                  \
+    class name : public TernaryOperation {                                                                                                  \
+    public:                                                                                                                                 \
+        name(ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset) \
+            : TernaryOperation(Opcode::name##Opcode, src0Offset, src1Offset, src2Offset, dstOffset)                                         \
+        {                                                                                                                                   \
+        }                                                                                                                                   \
+        DEFINE_TERNARY_BYTECODE_DUMP(name)                                                                                                  \
+    };
+
+
 FOR_EACH_BYTECODE_BINARY_OP(DEFINE_BINARY_BYTECODE)
 FOR_EACH_BYTECODE_UNARY_OP(DEFINE_UNARY_BYTECODE)
 FOR_EACH_BYTECODE_UNARY_OP_2(DEFINE_UNARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(DEFINE_BINARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
+FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
+FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_UNARY_OP(DEFINE_UNARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(DEFINE_UNARY_BYTECODE)
 FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
+FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
+FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(DEFINE_TERNARY_BYTECODE)
+FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(DEFINE_TERNARY_BYTECODE)
 
 #define DEFINE_MOVE_BYTECODE(name)                                         \
     class name : public ByteCodeOffset2 {                                  \
@@ -1910,31 +2005,19 @@ FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(DEFINE_RMW_CMPXCHG_BYTECODE)
 #undef DEFINE_RMW_BYTECODE
 
 // FOR_EACH_BYTECODE_SIMD_ETC_OP
-class V128BitSelect : public ByteCode {
+class V128BitSelect : public ByteCodeOffset4 {
 public:
     V128BitSelect(ByteCodeStackOffset lhs, ByteCodeStackOffset rhs, ByteCodeStackOffset c, ByteCodeStackOffset dst)
-        : ByteCode(Opcode::V128BitSelectOpcode)
-        , m_srcOffsets{ lhs, rhs, c }
-        , m_dstOffset(dst)
+        : ByteCodeOffset4(Opcode::V128BitSelectOpcode, lhs, rhs, c, dst)
     {
     }
 
-    const ByteCodeStackOffset* srcOffsets() const
-    {
-        return m_srcOffsets;
-    }
-    ByteCodeStackOffset dstOffset() const { return m_dstOffset; }
-
 #if !defined(NDEBUG)
     void dump(size_t pos)
     {
-        printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_srcOffsets[0], (uint32_t)m_srcOffsets[1], (uint32_t)m_srcOffsets[2], (uint32_t)m_dstOffset);
+        printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]);
     }
 #endif
-
-protected:
-    ByteCodeStackOffset m_srcOffsets[3];
-    ByteCodeStackOffset m_dstOffset;
 };
 
 class V128Load32Zero : public MemoryLoad {
diff --git a/src/interpreter/Interpreter.cpp b/src/interpreter/Interpreter.cpp
index 1caf802e1..b28d23a0a 100644
--- a/src/interpreter/Interpreter.cpp
+++ b/src/interpreter/Interpreter.cpp
@@ -270,6 +270,19 @@ inline static void simdSwizzleOperation(ExecutionState& state, BinaryOperation*
     writeValue<Type>(bp, code->dstOffset(), result);
 }
 
+inline static void simdBitSelectOperation(ExecutionState& state, ByteCodeOffset4* code, uint8_t* bp)
+{
+    using Vec = typename SIMDType<uint64_t>::Type; // the select is purely bitwise, so the operands are handled as 64-bit lanes
+    auto whenSet = readValue<Vec>(bp, code->src0Offset());
+    auto whenClear = readValue<Vec>(bp, code->src1Offset());
+    auto mask = readValue<Vec>(bp, code->src2Offset());
+    Vec selected;
+    for (uint8_t lane = 0; lane < Vec::Lanes; ++lane) {
+        selected[lane] = (whenSet[lane] & mask[lane]) | (whenClear[lane] & ~mask[lane]); // src0 bits where the mask bit is 1, src1 bits where it is 0
+    }
+    writeValue<Vec>(bp, code->dstOffset(), selected);
+}
+
 // FIXME optimize this function
 template <typename P, typename R, bool Low>
 inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
@@ -286,10 +299,11 @@ inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* c
     writeValue<ResultType>(bp, code->dstOffset(), result);
 }
 
+template <typename P, typename R>
 inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
 {
-    using ParamType = typename SIMDType<int16_t>::Type;
-    using ResultType = typename SIMDType<uint32_t>::Type;
+    using ParamType = typename SIMDType<P>::Type;
+    using ResultType = typename SIMDType<R>::Type;
     auto lhs = readValue<ParamType>(bp, code->srcOffset()[0]);
     auto rhs = readValue<ParamType>(bp, code->srcOffset()[1]);
     ResultType result;
@@ -302,6 +316,26 @@ inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code
     writeValue<ResultType>(bp, code->dstOffset(), result);
 }
 
+inline static void simdDotAddOperation(ExecutionState& state, TernaryOperation* code, uint8_t* bp)
+{ // dot product of groups of four int8 lanes from src0/src1, accumulated onto the int32 lanes of src2
+    using ParamType = typename SIMDType<int8_t>::Type;
+    using ResultType = typename SIMDType<int32_t>::Type;
+    auto src0 = readValue<ParamType>(bp, code->src0Offset()); // int8 lanes
+    auto src1 = readValue<ParamType>(bp, code->src1Offset()); // int8 lanes
+    auto src2 = readValue<ResultType>(bp, code->src2Offset()); // int32 accumulator lanes
+    ResultType result;
+    for (uint8_t i = 0; i < ResultType::Lanes; i++) {
+        uint8_t laneIdx = i * 4; // each result lane consumes four consecutive int8 lanes
+        int16_t lo0 = static_cast<int16_t>(src0[laneIdx]) * static_cast<int16_t>(src1[laneIdx]); // a product of two int8 values always fits in int16
+        int16_t hi0 = static_cast<int16_t>(src0[laneIdx + 1]) * static_cast<int16_t>(src1[laneIdx + 1]);
+        int16_t lo1 = static_cast<int16_t>(src0[laneIdx + 2]) * static_cast<int16_t>(src1[laneIdx + 2]);
+        int16_t hi1 = static_cast<int16_t>(src0[laneIdx + 3]) * static_cast<int16_t>(src1[laneIdx + 3]);
+        int32_t tmp = static_cast<int16_t>(lo0 + hi0) + static_cast<int16_t>(lo1 + hi1); // each pair sum is narrowed back to int16 before the two halves are added as int32
+        result[i] = add(state, tmp, src2[i]); // add() is defined outside this hunk -- presumably the interpreter's integer add; verify
+    }
+    writeValue<ResultType>(bp, code->dstOffset(), result);
+}
+
 template <typename P, typename R>
 inline static void simdNarrowOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
 {
@@ -582,6 +616,35 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
         NEXT_INSTRUCTION();                             \
     }
 
+#define SIMD_TERNARY_OPERATION(name, op, paramType, resultType)    \
+    DEFINE_OPCODE(name) /* applies op to each lane triple */       \
+        :                                                          \
+    {                                                              \
+        using ParamType = typename SIMDType<paramType>::Type;      \
+        using ResultType = typename SIMDType<resultType>::Type;    \
+        COMPILE_ASSERT(ParamType::Lanes == ResultType::Lanes, ""); \
+        name* code = (name*)programCounter;                        \
+        auto src0 = readValue<ParamType>(bp, code->src0Offset());  \
+        auto src1 = readValue<ParamType>(bp, code->src1Offset());  \
+        auto src2 = readValue<ParamType>(bp, code->src2Offset());  \
+        ResultType result; /* same lane count as ParamType */      \
+        for (uint8_t i = 0; i < ParamType::Lanes; i++) {           \
+            result[i] = op(state, src0[i], src1[i], src2[i]);      \
+        }                                                          \
+        writeValue<ResultType>(bp, code->dstOffset(), result);     \
+        ADD_PROGRAM_COUNTER(name);                                 \
+        NEXT_INSTRUCTION();                                        \
+    }
+
+#define SIMD_TERNARY_OTHER_OPERATION(name, op)                                    \
+    DEFINE_OPCODE(name)                                                           \
+        :                                                                         \
+    {                                                                             \
+        op(state, (TernaryOperation*)programCounter, bp);                         \
+        ADD_PROGRAM_COUNTER(TernaryOperation); /* same type as the cast above */  \
+        NEXT_INSTRUCTION();                                                       \
+    }
+
 #define MEMORY_LOAD_OPERATION(opcodeName, readType, writeType)        \
     DEFINE_OPCODE(opcodeName)                                         \
         :                                                             \
@@ -880,9 +943,14 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
     FOR_EACH_BYTECODE_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
     FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(SIMD_BINARY_SHIFT_OPERATION)
     FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
+    FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
+    FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
     FOR_EACH_BYTECODE_SIMD_UNARY_OP(SIMD_UNARY_OPERATION)
     FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(SIMD_UNARY_CONVERT_OPERATION)
     FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
+    FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
+    FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(SIMD_TERNARY_OPERATION)
+    FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(SIMD_TERNARY_OTHER_OPERATION)
 
     DEFINE_OPCODE(Jump)
         :
@@ -1088,16 +1156,7 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
     DEFINE_OPCODE(V128BitSelect)
         :
     {
-        using Type = typename SIMDType<uint64_t>::Type;
-        V128BitSelect* code = (V128BitSelect*)programCounter;
-        auto lhs = readValue<Type>(bp, code->srcOffsets()[0]);
-        auto rhs = readValue<Type>(bp, code->srcOffsets()[1]);
-        auto c = readValue<Type>(bp, code->srcOffsets()[2]);
-        Type result;
-        for (uint8_t i = 0; i < Type::Lanes; i++) {
-            result[i] = (lhs[i] & c[i]) | (rhs[i] & ~c[i]);
-        }
-        writeValue<Type>(bp, code->dstOffset(), result);
+        simdBitSelectOperation(state, (ByteCodeOffset4*)programCounter, bp);
         ADD_PROGRAM_COUNTER(V128BitSelect);
         NEXT_INSTRUCTION();
     }
diff --git a/src/parser/WASMParser.cpp b/src/parser/WASMParser.cpp
index 98c731011..7dcd295cb 100644
--- a/src/parser/WASMParser.cpp
+++ b/src/parser/WASMParser.cpp
@@ -1398,15 +1398,23 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
     {
         auto code = static_cast<WASMOpcode>(opcode);
         ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[2]) == peekVMStackValueType());
-        auto c = popVMStack();
+        auto src2 = popVMStack();
         ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[1]) == peekVMStackValueType());
-        auto rhs = popVMStack();
+        auto src1 = popVMStack();
         ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[0]) == peekVMStackValueType());
-        auto lhs = popVMStack();
+        auto src0 = popVMStack();
         auto dst = computeExprResultPosition(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_resultType));
         switch (code) {
+#define GENERATE_TERNARY_CODE_CASE(name, ...)                    \
+    case WASMOpcode::name##Opcode: {                             \
+        pushByteCode(Walrus::name(src0, src1, src2, dst), code); \
+        break;                                                   \
+    }
+            FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(GENERATE_TERNARY_CODE_CASE)
+            FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(GENERATE_TERNARY_CODE_CASE)
+#undef GENERATE_TERNARY_CODE_CASE
         case WASMOpcode::V128BitSelectOpcode:
-            pushByteCode(Walrus::V128BitSelect(lhs, rhs, c, dst), code);
+            pushByteCode(Walrus::V128BitSelect(src0, src1, src2, dst), code);
             break;
         default:
             ASSERT_NOT_REACHED();
@@ -2574,6 +2582,8 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
             FOR_EACH_BYTECODE_SIMD_BINARY_OP(GENERATE_BINARY_CODE_CASE)
             FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(GENERATE_BINARY_CODE_CASE)
             FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(GENERATE_BINARY_CODE_CASE)
+            FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(GENERATE_BINARY_CODE_CASE)
+            FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(GENERATE_BINARY_CODE_CASE)
 #undef GENERATE_BINARY_CODE_CASE
         default:
             ASSERT_NOT_REACHED();
@@ -2594,6 +2604,7 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate {
             FOR_EACH_BYTECODE_SIMD_UNARY_OP(GENERATE_UNARY_CODE_CASE)
             FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(GENERATE_UNARY_CODE_CASE)
             FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(GENERATE_UNARY_CODE_CASE)
+            FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(GENERATE_UNARY_CODE_CASE)
 #undef GENERATE_UNARY_CODE_CASE
         case WASMOpcode::I32ReinterpretF32Opcode:
             pushByteCode(Walrus::I32ReinterpretF32(src, dst), code);
diff --git a/src/shell/Shell.cpp b/src/shell/Shell.cpp
index 0199c8081..60015700c 100644
--- a/src/shell/Shell.cpp
+++ b/src/shell/Shell.cpp
@@ -592,7 +592,7 @@ static void printConstVector(wabt::ConstVector& v)
 }
 
 static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn, wabt::ConstVector expectedResult,
-                                const char* expectedException, bool expectUserException = false)
+                                const char* expectedException, bool expectUserException = false, bool either = false)
 {
     if (fn->functionType()->param().size() != action->args.size()) {
         printf("Error: expected %zu parameter(s) but got %zu.\n", fn->functionType()->param().size(), action->args.size());
@@ -608,7 +608,8 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn
         wabt::ConstVector& expectedResult;
         Walrus::ValueVector& args;
         wabt::InvokeAction* action;
-    } data = { fn, expectedResult, args, action };
+        bool either;
+    } data = { fn, expectedResult, args, action, either };
     Walrus::Trap trap;
     auto trapResult = trap.run([](Walrus::ExecutionState& state, void* d) {
         RunData* data = reinterpret_cast<RunData*>(d);
@@ -616,22 +617,48 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn
         result.resize(data->fn->functionType()->result().size());
         data->fn->call(state, data->args.data(), result.data());
         if (data->expectedResult.size()) {
-            if (data->fn->functionType()->result().size() != data->expectedResult.size()) {
-                printf("Error: %s returned with %zu parameter(s) but expected %zu", data->action->name.data(), data->fn->functionType()->result().size(), data->expectedResult.size());
-                RELEASE_ASSERT_NOT_REACHED();
-            }
-            // compare result
-            for (size_t i = 0; i < result.size(); i++) {
-                if (!equals(result[i], data->expectedResult[i])) {
-                    printf("Assertion failed at %d: ", data->action->loc.line);
-                    printf("%s(", data->action->name.data());
-                    printConstVector(data->action->args);
-                    printf(") expected ");
-                    printConstVector(data->expectedResult);
-                    printf(", but got %s\n", ((std::string)result[i]).c_str());
+            int errorIndex = -1;
+
+            if (data->either) {
+                if (data->fn->functionType()->result().size() != 1) {
+                    printf("Error: %s returned with %zu parameter(s) but expected 1", data->action->name.data(), data->fn->functionType()->result().size());
+                    RELEASE_ASSERT_NOT_REACHED();
+                }
+
+                // compare result
+                for (size_t i = 0; i < data->expectedResult.size(); i++) {
+                    if (equals(result[0], data->expectedResult[i])) {
+                        return;
+                    }
+                }
+
+                errorIndex = 0;
+            } else {
+                if (data->fn->functionType()->result().size() != data->expectedResult.size()) {
+                    printf("Error: %s returned with %zu parameter(s) but expected %zu", data->action->name.data(), data->fn->functionType()->result().size(), data->expectedResult.size());
                     RELEASE_ASSERT_NOT_REACHED();
                 }
+
+                // compare result
+                for (size_t i = 0; i < result.size(); i++) {
+                    if (!equals(result[i], data->expectedResult[i])) {
+                        errorIndex = i;
+                        break;
+                    }
+                }
+
+                if (errorIndex == -1) {
+                    return;
+                }
             }
+
+            printf("Assertion failed at %d: ", data->action->loc.line);
+            printf("%s(", data->action->name.data());
+            printConstVector(data->action->args);
+            printf(") %sexpected ", data->either ? "any " : "");
+            printConstVector(data->expectedResult);
+            printf(", but got %s\n", ((std::string)result[errorIndex]).c_str());
+            RELEASE_ASSERT_NOT_REACHED();
         }
     },
                                &data);
@@ -667,7 +694,7 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn
     } else if (expectedResult.size()) {
         printf("invoke %s(", action->name.data());
         printConstVector(action->args);
-        printf(") expect value(");
+        printf(") expect %svalue(", either ? "either " : "");
         printConstVector(expectedResult);
         printf(") (line: %d) : OK\n", action->loc.line);
     }
@@ -745,7 +772,7 @@ static void executeWAST(Store* store, const std::string& filename, const std::ve
             if (assertReturn->action->type() == wabt::ActionType::Invoke) {
                 auto action = static_cast<wabt::InvokeAction*>(assertReturn->action.get());
                 auto fn = fetchInstance(action->module_var, instanceMap, registeredInstanceMap)->resolveExportFunction(action->name);
-                executeInvokeAction(action, fn, assertReturn->expected->expected, nullptr);
+                executeInvokeAction(action, fn, assertReturn->expected->expected, nullptr, false, assertReturn->expected->type() == wabt::ExpectationType::Either);
             } else if (assertReturn->action->type() == wabt::ActionType::Get) {
                 auto action = static_cast<wabt::GetAction*>(assertReturn->action.get());
                 auto v = fetchInstance(action->module_var, instanceMap, registeredInstanceMap)->resolveExportGlobal(action->name)->value();
diff --git a/src/util/MathOperation.h b/src/util/MathOperation.h
index f01040813..a688ab40b 100644
--- a/src/util/MathOperation.h
+++ b/src/util/MathOperation.h
@@ -319,6 +319,18 @@ ALWAYS_INLINE T floatPMax(ExecutionState& state, T lhs, T rhs)
     return std::max(lhs, rhs);
 }
 
+template <typename T>
+ALWAYS_INLINE T floatMulAdd(ExecutionState& state, T a, T b, T c)
+{
+    return (a * b) + c; // a * b + c written as a separate multiply and add; `state` is not used
+}
+
+template <typename T>
+ALWAYS_INLINE T floatNegMulAdd(ExecutionState& state, T a, T b, T c)
+{
+    return -(a * b) + c; // the product is negated before c is added, i.e. c - a * b; `state` is not used
+}
+
 template <typename R, typename T>
 bool canConvert(T val) { return true; }
 template <>
@@ -591,10 +603,10 @@ T saturatingRoundingQMul(ExecutionState& state, T lhs, T rhs)
 {
     constexpr int size_in_bits = sizeof(T) * 8;
     int round_const = 1 << (size_in_bits - 2);
-    int64_t product = (int64_t)lhs * rhs;
+    int32_t product = (int32_t)lhs * rhs;
     product += round_const;
     product >>= (size_in_bits - 1);
-    return saturate<T, int64_t>(product);
+    return saturate<T, int32_t>(product);
 }
 
 } // namespace Walrus
diff --git a/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast
new file mode 100644
index 000000000..00f901cbc
--- /dev/null
+++ b/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast
@@ -0,0 +1,28 @@
+;; Tests for i16x8.relaxed_q15mulr_s.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))
+
+    (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128)
+          (i16x8.eq
+            (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))
+            (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))))
+)
+
+;; INT16_MIN = -32768
+(assert_return (invoke "i16x8.relaxed_q15mulr_s"
+                       (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0)
+                       (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0))
+               ;; overflows, return either INT16_MIN or INT16_MAX
+               (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0)
+                       (v128.const i16x8 32767 32767 32766 0 0 0 0 0)))
+
+;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
+
+(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp"
+                       (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0)
+                       (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0))
+               ;; whichever overflow result is chosen, both calls must agree, so every lane compares equal
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
diff --git a/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast b/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast
new file mode 100644
index 000000000..cca3ecb95
--- /dev/null
+++ b/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast
@@ -0,0 +1,124 @@
+;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0)))
+    (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0)))
+    (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))
+    (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))
+
+    (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128)
+          (i32x4.eq
+            (i32x4.relaxed_trunc_f32x4_s (local.get 0))
+            (i32x4.relaxed_trunc_f32x4_s (local.get 0))))
+    (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128)
+          (i32x4.eq
+            (i32x4.relaxed_trunc_f32x4_u (local.get 0))
+            (i32x4.relaxed_trunc_f32x4_u (local.get 0))))
+    (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128)
+          (i32x4.eq
+            (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))
+            (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))))
+    (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128)
+          (i32x4.eq
+            (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))
+            (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))))
+)
+
+;; Test some edge cases around min/max to ensure that the instruction either
+;; saturates correctly or returns INT_MIN.
+;;
+;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is
+;; 2147483647 but that is not representable in a `f32` since it requires 31 bits
+;; when a f32 has only 24 bits available. This means that the closest integers
+;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning
+;; that the INT_MAX test case cannot be tested.
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s"
+                       ;;                INT32_MIN     <INT32_MIN        >INT32_MAX
+                       (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0))
+               ;; out of range -> saturate or INT32_MIN
+               (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647)
+                       (v128.const i32x4 -2147483648 -2147483648 2 -2147483648)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s"
+                       (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
+               ;; nans -> 0 or INT32_MIN
+               (either (v128.const i32x4 0 0 0 0)
+                       (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u"
+                       ;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1
+                       (v128.const f32x4 0 -1.0 4294967040.0 4294967296.0))
+               ;; out of range -> saturate or UINT32_MAX
+               (either (v128.const i32x4 0 0 4294967040 0xffffffff)
+                       (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u"
+                       (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
+               ;; nans -> 0 or UINT32_MAX
+               (either (v128.const i32x4 0 0 0 0)
+                       (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero"
+                       (v128.const f64x2 -2147483904.0 2147483904.0))
+               ;; out of range -> saturate or INT32_MIN
+               (either (v128.const i32x4 -2147483648 2147483647 0 0)
+                       (v128.const i32x4 -2147483648 -2147483648 0 0)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero"
+                       (v128.const f64x2 nan -nan))
+               (either (v128.const i32x4 0 0 0 0)
+                       (v128.const i32x4 0x80000000 0x80000000 0 0)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero"
+                       (v128.const f64x2 -1.0 4294967296.0))
+               ;; out of range -> saturate or UINT32_MAX
+               (either (v128.const i32x4 0 0xffffffff 0 0)
+                       (v128.const i32x4 0xffffffff 0xffffffff 0 0)))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero"
+                       (v128.const f64x2 nan -nan))
+               (either (v128.const i32x4 0 0 0 0)  ;; nans -> 0 ...
+                       (v128.const i32x4 0xffffffff 0xffffffff 0 0)))  ;; ... or UINT32_MAX; lanes 2-3 are always zero
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp"
+                       ;; INT32_MIN <INT32_MIN INT32_MAX >INT32_MAX
+                       (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0))
+               ;; out of range -> saturate or INT32_MIN
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp"
+                       (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
+               ;; nans -> 0 or INT32_MIN
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp"
+                       ;; UINT32_MIN UINT32_MIN-1 <UINT32_MAX UINT32_MAX+1
+                       (v128.const f32x4 0 -1.0 4294967040.0 4294967296.0))
+               ;; out of range -> saturate or UINT32_MAX
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp"
+                       (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444))
+               ;; nans -> 0 or UINT32_MAX
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp"
+                       (v128.const f64x2 -2147483904.0 2147483904.0))
+               ;; out of range -> saturate or INT32_MIN
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp"
+                       (v128.const f64x2 nan -nan))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp"
+                       (v128.const f64x2 -1.0 4294967296.0))
+               ;; out of range -> saturate or UINT32_MAX
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp"
+                       (v128.const f64x2 nan -nan))
+               (v128.const i32x4 -1 -1 -1 -1))
diff --git a/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast b/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast
new file mode 100644
index 000000000..f1bcb4552
--- /dev/null
+++ b/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast
@@ -0,0 +1,45 @@
+;; Tests for relaxed i8x16 swizzle.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))
+
+    (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128)
+          (i8x16.eq
+            (i8x16.relaxed_swizzle (local.get 0) (local.get 1))
+            (i8x16.relaxed_swizzle (local.get 0) (local.get 1))))
+)
+
+(assert_return (invoke "i8x16.relaxed_swizzle"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
+               (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
+
+;; out of range, returns 0 or the lane at (index modulo 16) if < 128
+(assert_return (invoke "i8x16.relaxed_swizzle"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
+               (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
+
+;; out of range, returns 0 if >= 128
+(assert_return (invoke "i8x16.relaxed_swizzle"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255))
+               (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)))
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+;; out of range, returns 0 or the lane at (index modulo 16) if < 128
+(assert_return (invoke "i8x16.relaxed_swizzle_cmp"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31))
+               (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))
+
+;; out of range, returns 0 if >= 128
+(assert_return (invoke "i8x16.relaxed_swizzle_cmp"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255))
+               (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))
diff --git a/test/extended/relaxed-simd/relaxed_dot_product.wast b/test/extended/relaxed-simd/relaxed_dot_product.wast
new file mode 100644
index 000000000..48714b87b
--- /dev/null
+++ b/test/extended/relaxed-simd/relaxed_dot_product.wast
@@ -0,0 +1,107 @@
+;; Tests for relaxed dot products.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))
+    (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))
+
+    (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128)
+          (i16x8.eq
+            (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))
+            (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))))
+    (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128)
+          (i32x4.eq  ;; compare per i32 lane, matching the i32x4 result of the dot product
+            (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))
+            (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))))
+)
+
+;; Simple values to ensure things are functional.
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
+               (v128.const i16x8 1 13 41 85 145 221 313 421))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
+                       (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -32512 32258 0 0 0 0 0 0))
+
+;; signed * unsigned   : -128 *  129 * 2 = -33,024 saturated to -32,768
+;; signed * signed     : -128 * -127 * 2 =  32,512
+;; unsigned * unsigned :  128 *  129 * 2 =  33,024
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s"
+                       (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
+               (either
+                 (v128.const i16x8 -32768 0 0 0 0 0 0 0)
+                 (v128.const i16x8  32512 0 0 0 0 0 0 0)
+                 (v128.const i16x8  33024 0 0 0 0 0 0 0)))
+
+;; Simple values to ensure things are functional.
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i32x4 0 1 2 3))
+               ;; intermediate result is [14, 126, 366, 734]
+               (v128.const i32x4 14 127 368 737))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               ;; intermediate result is [-65024, 64516, 0, 0]
+               (v128.const i32x4 -65023 64518 3 4))
+
+;; signed * unsigned   : -128 *  129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI
+;; signed * unsigned with intermediate saturation :
+;;   (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW)
+;;   -32768 + -32768 = -65536 (+ 1)
+;; signed * signed     : -128 * -127 * 4 =  65,024 (+ 1)
+;; unsigned * unsigned :  128 *  129 * 4 =  66,048 (+ 1)
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s"
+                       (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               (either
+                 (v128.const i32x4 -66047 2 3 4)
+                 (v128.const i32x4 -65535 2 3 4)
+                 (v128.const i32x4  65025 2 3 4)
+                 (v128.const i32x4  66049 2 3 4)))
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+;; Test max and min i8 values;
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp"
+                       (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
+;; Test max and min i8 values;
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp"
+                       (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               ;; intermediate result is [-65024, 64516, 0, 0]
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; signed * unsigned   : -128 *  129 * 2 = -33,024 saturated to -32,768
+;; signed * signed     : -128 * -127 * 2 =  32,512
+;; unsigned * unsigned :  128 *  129 * 2 =  33,024
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp"
+                       (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
+;; signed * unsigned   : -128 *  129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI
+;; signed * unsigned with intermediate saturation :
+;;   (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW)
+;;   -32768 + -32768 = -65536 (+ 1)
+;; signed * signed     : -128 * -127 * 4 =  65,024 (+ 1)
+;; unsigned * unsigned :  128 *  129 * 4 =  66,048 (+ 1)
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp"
+                       (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0)
+                       (v128.const i32x4 1 2 3 4))
+               (v128.const i32x4 -1 -1 -1 -1))
diff --git a/test/extended/relaxed-simd/relaxed_laneselect.wast b/test/extended/relaxed-simd/relaxed_laneselect.wast
new file mode 100644
index 000000000..10913816b
--- /dev/null
+++ b/test/extended/relaxed-simd/relaxed_laneselect.wast
@@ -0,0 +1,103 @@
+;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, i32x4.relaxed_laneselect, and i64x2.relaxed_laneselect.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))
+
+    (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
+          (i8x16.eq
+            (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
+            (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
+          (i16x8.eq
+            (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
+            (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
+          (i32x4.eq
+            (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
+            (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128)
+          (i64x2.eq
+            (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))
+            (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))))
+)
+
+(assert_return (invoke "i8x16.relaxed_laneselect"
+                       (v128.const i8x16 0    1  0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 16   17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31)
+                       (v128.const i8x16 0xff 0  0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0))
+               (either (v128.const i8x16 0    17 0x14 0x32 20 21 22 23 24 25 26 27 28 29 30 31)
+                       (v128.const i8x16 0    17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31)))
+
+(assert_return (invoke "i16x8.relaxed_laneselect"
+                       (v128.const i16x8 0      1 0x1234 0x1234 4 5 6 7)
+                       (v128.const i16x8 8      9 0x5678 0x5678 12 13 14 15)
+                       (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0))
+               (either (v128.const i16x8 0      9 0x1278 0x5634 12 13 14 15)
+                       (v128.const i16x8 0      9 0x1234 0x5678 12 13 14 15)))
+
+;; special case for i16x8 to allow pblendvb
+(assert_return (invoke "i16x8.relaxed_laneselect"
+                       (v128.const i16x8 0      1 0x1234 0x1234 4 5 6 7)
+                       (v128.const i16x8 8      9 0x5678 0x5678 12 13 14 15)
+                       (v128.const i16x8 0xffff 0 0xff00 0x0080 0 0 0 0))  ;; 0x0080 is the special case
+               (either (v128.const i16x8 0      9 0x1278 0x5678 12 13 14 15)  ;; bitselect
+                       (v128.const i16x8 0      9 0x1234 0x5678 12 13 14 15)  ;; top bit of i16 lane examined
+                       (v128.const i16x8 0      9 0x1278 0x5634 12 13 14 15)  ;; top bit of each byte
+                       ))
+
+(assert_return (invoke "i32x4.relaxed_laneselect"
+                       (v128.const i32x4 0          1 0x12341234 0x12341234)
+                       (v128.const i32x4 4          5 0x56785678 0x56785678)
+                       (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff))
+               (either (v128.const i32x4 0          5 0x12345678 0x56781234)
+                       (v128.const i32x4 0          5 0x12341234 0x56785678)))
+
+(assert_return (invoke "i64x2.relaxed_laneselect"
+                       (v128.const i64x2 0                  1)
+                       (v128.const i64x2 2                  3)
+                       (v128.const i64x2 0xffffffffffffffff 0))
+               (either (v128.const i64x2 0                  3)
+                       (v128.const i64x2 0                  3)))
+
+(assert_return (invoke "i64x2.relaxed_laneselect"
+                       (v128.const i64x2 0x1234123412341234 0x1234123412341234)
+                       (v128.const i64x2 0x5678567856785678 0x5678567856785678)
+                       (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff))
+               (either (v128.const i64x2 0x1234123456785678 0x5678567812341234)
+                       (v128.const i64x2 0x1234123412341234 0x5678567856785678)))
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+(assert_return (invoke "i8x16.relaxed_laneselect_cmp"
+                       (v128.const i8x16 0    1  0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15)
+                       (v128.const i8x16 16   17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31)
+                       (v128.const i8x16 0xff 0  0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0))
+               (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1))
+
+(assert_return (invoke "i16x8.relaxed_laneselect_cmp"
+                       (v128.const i16x8 0      1 0x1234 0x1234 4 5 6 7)
+                       (v128.const i16x8 8      9 0x5678 0x5678 12 13 14 15)
+                       (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0))
+               (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1))
+
+(assert_return (invoke "i32x4.relaxed_laneselect_cmp"
+                       (v128.const i32x4 0          1 0x12341234 0x12341234)
+                       (v128.const i32x4 4          5 0x56785678 0x56785678)
+                       (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "i64x2.relaxed_laneselect_cmp"
+                       (v128.const i64x2 0                  1)
+                       (v128.const i64x2 2                  3)
+                       (v128.const i64x2 0xffffffffffffffff 0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "i64x2.relaxed_laneselect_cmp"
+                       (v128.const i64x2 0x1234123412341234 0x1234123412341234)
+                       (v128.const i64x2 0x5678567856785678 0x5678567856785678)
+                       (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff))
+               (v128.const i64x2 -1 -1))
diff --git a/test/extended/relaxed-simd/relaxed_madd_nmadd.wast b/test/extended/relaxed-simd/relaxed_madd_nmadd.wast
new file mode 100644
index 000000000..187b71d5a
--- /dev/null
+++ b/test/extended/relaxed-simd/relaxed_madd_nmadd.wast
@@ -0,0 +1,224 @@
+;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module
+    (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))
+    (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))
+
+    (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
+          (f32x4.eq
+            (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
+            (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
+          (f32x4.eq
+            (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
+            (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128)
+          (f64x2.eq
+            (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))
+            (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))))
+    (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128)
+          (f64x2.eq
+            (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))
+            (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))))
+)
+
+
+;; FLT_MAX == 0x1.fffffep+127
+;; FLT_MAX * 2 - FLT_MAX ==
+;;   FLT_MAX (if fma)
+;;   inf     (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f32x4.relaxed_madd"
+                       (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )
+                       (v128.const f32x4 2.0 2.0 2.0 2.0)
+                       (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+               (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127)
+                       (v128.const f32x4 inf inf inf inf)))
+
+;; Special values for float:
+;; x            = 0x1.000004p+0 (1 + 2^-22)
+;; y            = 0x1.0002p+0   (1 + 2^-15)
+;; z            = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
+;;              = -0x1.000204p+0
+;; x.y          = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
+;; x.y+z        = 0 (2 roundings)
+;; fma(x, y, z) = (0x1p-37) 2^-37
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f32x4.relaxed_madd"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd"
+                       (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
+                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37)
+                       (v128.const f32x4 0 0 0 0)))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   inf     (if no fma)
+;; Without fma the product DBL_MAX * 2 is rounded to inf before the subtraction.
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 inf inf)))
+
+;; Special values for double:
+;; x            = 0x1.00000004p+0 (1 + 2^-30)
+;; y            = 0x1.000002p+0   (1 + 2^-23)
+;; z            = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;              = -0x1.00000204p+0
+;; x.y          = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z        = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (either (v128.const f64x2 0x1p-53 0x1p-53)
+                       (v128.const f64x2 0 0)))
+
+;; Check that multiple calls to the relaxed instruction with same inputs returns same results.
+
+;; FLT_MAX == 0x1.fffffep+127
+;; FLT_MAX * 2 - FLT_MAX ==
+;;   FLT_MAX (if fma)
+;;   inf     (if no fma)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f32x4.relaxed_madd_cmp"
+                       (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )
+                       (v128.const f32x4 2.0 2.0 2.0 2.0)
+                       (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; Special values for float:
+;; x            = 0x1.000004p+0 (1 + 2^-22)
+;; y            = 0x1.0002p+0   (1 + 2^-15)
+;; z            = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0)
+;;              = -0x1.000204p+0
+;; x.y          = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit)
+;; x.y+z        = 0 (2 roundings)
+;; fma(x, y, z) = (0x1p-37) 2^-37
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f32x4.relaxed_madd_cmp"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                       (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0)
+                       (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp"
+                       (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0)
+                       (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0)
+                       (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+;; DBL_MAX = 0x1.fffffffffffffp+1023
+;; DBL_MAX * 2 - DBL_MAX ==
+;;   DBL_MAX (if fma)
+;;   +inf    (if no fma: DBL_MAX * 2 overflows
+;;            before the subtraction)
+;; from https://www.vinc17.net/software/fma-tests.c
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023)
+                       (v128.const f64x2 2.0 2.0)
+                       (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023))
+               (v128.const i64x2 -1 -1))
+
+;; Special values for double:
+;; x            = 0x1.00000004p+0 (1 + 2^-30)
+;; y            = 0x1.000002p+0   (1 + 2^-23)
+;; z            = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0)
+;;              = -0x1.00000204p+0
+;; x.y          = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit)
+;; x.y+z        = 0 (2 roundings)
+;; fma(x, y, z) = 0x1p-53
+;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information
+(assert_return (invoke "f64x2.relaxed_madd_cmp"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated x, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
+                       (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0)
+                       (v128.const f64x2 0x1.000002p+0 0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+;; nmadd tests with negated y, same answers are expected.
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp"
+                       (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0)
+                       (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0)
+                       (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0))
+               (v128.const i64x2 -1 -1))
+
+;; Test that the non-deterministic choice of fusing and then rounding or
+;; rounding multiple times in `relaxed_madd` is consistent throughout a
+;; program's execution.
+;;
+;; This property is impossible to test exhaustively, so this is just a simple
+;; smoke test for when the operands to a `relaxed_madd` are known statically
+;; versus when they are dynamically supplied. This should, at least, catch
+;; illegal constant-folding and -propagation by the compiler that leads to
+;; inconsistent rounding behavior at compile time versus at run time.
+;;
+;; FLT_MAX == 0x1.fffffep+127
+;; FLT_MAX * 2 - FLT_MAX ==
+;;   FLT_MAX (if fma)
+;;   +inf    (if no fma: FLT_MAX * 2 overflows before the subtraction)
+;; from https://www.vinc17.net/software/fma-tests.c
+(module
+  (func (export "test-consistent-nondeterminism") (param v128 v128 v128) (result v128)
+    (f32x4.eq  ;; lane-wise float compare of the two results below; an equal lane yields all-ones
+      (f32x4.relaxed_madd (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )  ;; operands known statically
+                          (v128.const f32x4 2.0 2.0 2.0 2.0)
+                          (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+      (f32x4.relaxed_madd (local.get 0)  ;; same instruction, operands supplied by the caller at run time
+                          (local.get 1)
+                          (local.get 2))
+    )
+  )
+)
+(assert_return (invoke "test-consistent-nondeterminism"
+                       (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 )
+                       (v128.const f32x4 2.0 2.0 2.0 2.0)
+                       (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127))
+               (v128.const i32x4 -1 -1 -1 -1))
diff --git a/test/extended/relaxed-simd/relaxed_min_max.wast b/test/extended/relaxed-simd/relaxed_min_max.wast
new file mode 100644
index 000000000..ac3ebb07c
--- /dev/null
+++ b/test/extended/relaxed-simd/relaxed_min_max.wast
@@ -0,0 +1,184 @@
+;; Tests for f32x4.relaxed_min, f32x4.relaxed_max, f64x2.relaxed_min, and f64x2.relaxed_max.
+;; `either` comes from https://github.com/WebAssembly/threads.
+
+(module  ;; one thin wrapper per relaxed min/max instruction, followed by *_cmp self-consistency helpers
+    (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1)))
+    (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1)))
+    (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1)))
+    (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1)))
+
+    (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128)  ;; runs the instruction twice on the same operands
+          (i32x4.eq  ;; integer lane compare of the raw bits, so two identical NaN results still compare equal
+            (f32x4.relaxed_min (local.get 0) (local.get 1))
+            (f32x4.relaxed_min (local.get 0) (local.get 1))))
+    (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128)  ;; same check for relaxed_max
+          (i32x4.eq
+            (f32x4.relaxed_max (local.get 0) (local.get 1))
+            (f32x4.relaxed_max (local.get 0) (local.get 1))))
+    (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128)  ;; f64x2 variant of the min check
+          (i64x2.eq  ;; 64-bit integer lanes, matching the f64x2 lane width
+            (f64x2.relaxed_min (local.get 0) (local.get 1))
+            (f64x2.relaxed_min (local.get 0) (local.get 1))))
+    (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128)  ;; f64x2 variant of the max check
+          (i64x2.eq
+            (f64x2.relaxed_max (local.get 0) (local.get 1))
+            (f64x2.relaxed_max (local.get 0) (local.get 1))))
+)
+
+(assert_return (invoke "f32x4.relaxed_min"
+                       (v128.const f32x4 -nan nan 0 0)
+                       (v128.const f32x4 0 0 -nan nan))
+               (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical)
+                       (v128.const f32x4 nan:canonical nan:canonical 0 0)
+                       (v128.const f32x4 0 0 nan:canonical nan:canonical)
+                       (v128.const f32x4 0 0 0 0)))
+
+(assert_return (invoke "f32x4.relaxed_min"
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
+               (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 -0.0 +0.0 -0.0)))
+
+(assert_return (invoke "f32x4.relaxed_max"
+                       (v128.const f32x4 -nan nan 0 0)
+                       (v128.const f32x4 0 0 -nan nan))
+               (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical)
+                       (v128.const f32x4 nan:canonical nan:canonical 0 0)
+                       (v128.const f32x4 0 0 nan:canonical nan:canonical)
+                       (v128.const f32x4 0 0 0 0)))
+
+(assert_return (invoke "f32x4.relaxed_max"
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
+               (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0)
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 -0.0 +0.0 -0.0)))
+
+(assert_return (invoke "f64x2.relaxed_min"
+                       (v128.const f64x2 -nan nan)
+                       (v128.const f64x2 0 0))
+               (either (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 0 0)))
+
+(assert_return (invoke "f64x2.relaxed_min"
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 -nan nan))
+               (either (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)))
+
+(assert_return (invoke "f64x2.relaxed_min"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0))
+               (either (v128.const f64x2 -0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0)
+                       (v128.const f64x2 -0.0 -0.0)))
+
+(assert_return (invoke "f64x2.relaxed_min"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0))
+               (either (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)))
+
+(assert_return (invoke "f64x2.relaxed_max"
+                       (v128.const f64x2 -nan nan)
+                       (v128.const f64x2 0 0))
+               (either (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 0 0)))
+
+(assert_return (invoke "f64x2.relaxed_max"
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 -nan nan))
+               (either (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 nan:canonical nan:canonical)
+                       (v128.const f64x2 0 0)))
+
+(assert_return (invoke "f64x2.relaxed_max"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0))
+               (either (v128.const f64x2 +0.0 +0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0)
+                       (v128.const f64x2 -0.0 -0.0)))
+
+(assert_return (invoke "f64x2.relaxed_max"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0))
+               (either (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0)))
+
+;; Check that multiple calls to the relaxed instruction with the same inputs return the same results.
+
+(assert_return (invoke "f32x4.relaxed_min_cmp"
+                       (v128.const f32x4 -nan nan 0 0)
+                       (v128.const f32x4 0 0 -nan nan))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "f32x4.relaxed_min_cmp"
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "f32x4.relaxed_max_cmp"
+                       (v128.const f32x4 -nan nan 0 0)
+                       (v128.const f32x4 0 0 -nan nan))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "f32x4.relaxed_max_cmp"
+                       (v128.const f32x4 +0.0 -0.0 +0.0 -0.0)
+                       (v128.const f32x4 -0.0 +0.0 +0.0 -0.0))
+               (v128.const i32x4 -1 -1 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_min_cmp"
+                       (v128.const f64x2 -nan nan)
+                       (v128.const f64x2 0 0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_min_cmp"
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 -nan nan))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_min_cmp"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_min_cmp"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_max_cmp"
+                       (v128.const f64x2 -nan nan)
+                       (v128.const f64x2 0 0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_max_cmp"
+                       (v128.const f64x2 0 0)
+                       (v128.const f64x2 -nan nan))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_max_cmp"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 -0.0 +0.0))
+               (v128.const i64x2 -1 -1))
+
+(assert_return (invoke "f64x2.relaxed_max_cmp"
+                       (v128.const f64x2 +0.0 -0.0)
+                       (v128.const f64x2 +0.0 -0.0))
+               (v128.const i64x2 -1 -1))
diff --git a/third_party/wabt/src/walrus/binary-reader-walrus.cc b/third_party/wabt/src/walrus/binary-reader-walrus.cc
index 7d8e224b8..4a94fd7b4 100644
--- a/third_party/wabt/src/walrus/binary-reader-walrus.cc
+++ b/third_party/wabt/src/walrus/binary-reader-walrus.cc
@@ -84,6 +84,8 @@ static Features getFeatures() {
     features.enable_exceptions();
     // TODO: should use command line flag for this (--enable-threads)
     features.enable_threads();
+    // TODO: should use command line flag for this (--enable-relaxed-simd)
+    features.enable_relaxed_simd();
     return features;
 }
 
diff --git a/tools/jit_exclude_list.txt b/tools/jit_exclude_list.txt
index e69de29bb..3ef9aed46 100644
--- a/tools/jit_exclude_list.txt
+++ b/tools/jit_exclude_list.txt
@@ -0,0 +1,7 @@
+i16x8_relaxed_q15mulr_s.wast
+i32x4_relaxed_trunc.wast
+i8x16_relaxed_swizzle.wast
+relaxed_dot_product.wast
+relaxed_laneselect.wast
+relaxed_madd_nmadd.wast
+relaxed_min_max.wast