From c7bf409f994b47afffb46413b69da81f6fb65fd3 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Tue, 1 Oct 2024 11:25:10 +0000 Subject: [PATCH] Implement relaxed simd in the interpreter JIT tests are disabled Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com --- src/interpreter/ByteCode.h | 163 +++++++++---- src/interpreter/Interpreter.cpp | 83 ++++++- src/parser/WASMParser.cpp | 19 +- src/shell/Shell.cpp | 61 +++-- src/util/MathOperation.h | 16 +- .../relaxed-simd/i16x8_relaxed_q15mulr_s.wast | 28 +++ .../relaxed-simd/i32x4_relaxed_trunc.wast | 124 ++++++++++ .../relaxed-simd/i8x16_relaxed_swizzle.wast | 45 ++++ .../relaxed-simd/relaxed_dot_product.wast | 107 +++++++++ .../relaxed-simd/relaxed_laneselect.wast | 103 ++++++++ .../relaxed-simd/relaxed_madd_nmadd.wast | 224 ++++++++++++++++++ .../relaxed-simd/relaxed_min_max.wast | 184 ++++++++++++++ .../wabt/src/walrus/binary-reader-walrus.cc | 2 + tools/jit_exclude_list.txt | 7 + 14 files changed, 1091 insertions(+), 75 deletions(-) create mode 100644 test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast create mode 100644 test/extended/relaxed-simd/i32x4_relaxed_trunc.wast create mode 100644 test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast create mode 100644 test/extended/relaxed-simd/relaxed_dot_product.wast create mode 100644 test/extended/relaxed-simd/relaxed_laneselect.wast create mode 100644 test/extended/relaxed-simd/relaxed_madd_nmadd.wast create mode 100644 test/extended/relaxed-simd/relaxed_min_max.wast diff --git a/src/interpreter/ByteCode.h b/src/interpreter/ByteCode.h index 154e63836..acc7f1841 100644 --- a/src/interpreter/ByteCode.h +++ b/src/interpreter/ByteCode.h @@ -378,7 +378,7 @@ class FunctionType; F(I64X2ExtmulHighI32X4S, (simdExtmulOperation)) \ F(I64X2ExtmulLowI32X4U, (simdExtmulOperation)) \ F(I64X2ExtmulHighI32X4U, (simdExtmulOperation)) \ - F(I32X4DotI16X8S, (simdDotOperation)) \ + F(I32X4DotI16X8S, (simdDotOperation)) \ F(I8X16NarrowI16X8S, (simdNarrowOperation)) 
\ F(I8X16NarrowI16X8U, (simdNarrowOperation)) \ F(I16X8NarrowI32X4S, (simdNarrowOperation)) \ @@ -588,30 +588,65 @@ class FunctionType; F(MemoryAtomicWait64) \ F(AtomicFence) -#define FOR_EACH_BYTECODE(F) \ - FOR_EACH_BYTECODE_OP(F) \ - FOR_EACH_BYTECODE_BINARY_OP(F) \ - FOR_EACH_BYTECODE_UNARY_OP(F) \ - FOR_EACH_BYTECODE_UNARY_OP_2(F) \ - FOR_EACH_BYTECODE_LOAD_OP(F) \ - FOR_EACH_BYTECODE_STORE_OP(F) \ - FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \ - FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \ - FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \ - FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \ - FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \ - FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \ - FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \ - FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \ - FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \ - FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \ - FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \ - FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \ - FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \ - FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \ - FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \ - FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \ - FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \ +#define FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \ + F(I32X4RelaxedTruncF32X4S, (simdTruncSatOperation)) \ + F(I32X4RelaxedTruncF32X4U, (simdTruncSatOperation)) \ + F(I32X4RelaxedTruncF64X2SZero, (simdTruncSatZeroOperation)) \ + F(I32X4RelaxedTruncF64X2UZero, (simdTruncSatZeroOperation)) + +#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \ + F(F32X4RelaxedMin, floatMin, float, float) \ + F(F32X4RelaxedMax, floatMax, float, float) \ + F(F64X2RelaxedMin, floatMin, double, double) \ + F(F64X2RelaxedMax, floatMax, double, double) \ + F(I16X8RelaxedQ15mulrS, saturatingRoundingQMul, int16_t, int16_t) + +#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \ + F(I8X16RelaxedSwizzle, (simdSwizzleOperation)) \ + F(I16X8DotI8X16I7X16S, (simdDotOperation)) + +#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \ + F(F32X4RelaxedMadd, floatMulAdd, 
float, float) \ + F(F32X4RelaxedNmadd, floatNegMulAdd, float, float) \ + F(F64X2RelaxedMadd, floatMulAdd, double, double) \ + F(F64X2RelaxedNmadd, floatNegMulAdd, double, double) + +#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \ + F(I32X4DotI8X16I7X16AddS, (simdDotAddOperation)) \ + F(I8X16RelaxedLaneSelect, (simdBitSelectOperation)) \ + F(I16X8RelaxedLaneSelect, (simdBitSelectOperation)) \ + F(I32X4RelaxedLaneSelect, (simdBitSelectOperation)) \ + F(I64X2RelaxedLaneSelect, (simdBitSelectOperation)) + +#define FOR_EACH_BYTECODE(F) \ + FOR_EACH_BYTECODE_OP(F) \ + FOR_EACH_BYTECODE_BINARY_OP(F) \ + FOR_EACH_BYTECODE_UNARY_OP(F) \ + FOR_EACH_BYTECODE_UNARY_OP_2(F) \ + FOR_EACH_BYTECODE_LOAD_OP(F) \ + FOR_EACH_BYTECODE_STORE_OP(F) \ + FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \ + FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \ + FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \ + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \ + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \ + FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \ + FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \ + FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \ + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \ + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \ + FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \ + FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \ + FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \ + FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \ + FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \ + FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \ + FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \ + FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \ + FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \ + FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \ + FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \ + FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \ FOR_EACH_BYTECODE_ATOMIC_OTHER(F) class ByteCode { @@ -726,6 +761,25 @@ class ByteCodeOffset2Value : public ByteCode { uint32_t m_value; }; +class ByteCodeOffset4 : public ByteCode { +public: + ByteCodeOffset4(Opcode opcode, ByteCodeStackOffset src0Offset, 
ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset) + : ByteCode(opcode) + , m_stackOffsets{ src0Offset, src1Offset, src2Offset, dstOffset } + { + } + + const ByteCodeStackOffset* srcOffsets() const { return m_stackOffsets; } + ByteCodeStackOffset src0Offset() const { return m_stackOffsets[0]; } + ByteCodeStackOffset src1Offset() const { return m_stackOffsets[1]; } + ByteCodeStackOffset src2Offset() const { return m_stackOffsets[2]; } + ByteCodeStackOffset dstOffset() const { return m_stackOffsets[3]; } + +protected: + ByteCodeStackOffset m_stackOffsets[4]; +}; + + class ByteCodeOffset4Value : public ByteCode { public: ByteCodeOffset4Value(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset, uint32_t value) @@ -923,15 +977,56 @@ class UnaryOperation : public ByteCodeOffset2 { DEFINE_UNARY_BYTECODE_DUMP(name) \ }; +// dummy ByteCode for ternary operation +class TernaryOperation : public ByteCodeOffset4 { +public: + TernaryOperation(Opcode code, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset) + : ByteCodeOffset4(code, src0Offset, src1Offset, src2Offset, dstOffset) + { + } + +#if !defined(NDEBUG) + void dump(size_t pos) + { + } +#endif +}; + +#if !defined(NDEBUG) +#define DEFINE_TERNARY_BYTECODE_DUMP(name) \ + void dump(size_t pos) \ + { \ + printf(#name " src1: %" PRIu32 " src2: %" PRIu32 " src3: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]); \ + } +#else +#define DEFINE_TERNARY_BYTECODE_DUMP(name) +#endif + +#define DEFINE_TERNARY_BYTECODE(name, ...) 
\ + class name : public TernaryOperation { \ + public: \ + name(ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset) \ + : TernaryOperation(Opcode::name##Opcode, src0Offset, src1Offset, src2Offset, dstOffset) \ + { \ + } \ + DEFINE_TERNARY_BYTECODE_DUMP(name) \ + }; + + FOR_EACH_BYTECODE_BINARY_OP(DEFINE_BINARY_BYTECODE) FOR_EACH_BYTECODE_UNARY_OP(DEFINE_UNARY_BYTECODE) FOR_EACH_BYTECODE_UNARY_OP_2(DEFINE_UNARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(DEFINE_BINARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE) +FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE) +FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_UNARY_OP(DEFINE_UNARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(DEFINE_UNARY_BYTECODE) FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE) +FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE) +FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(DEFINE_TERNARY_BYTECODE) +FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(DEFINE_TERNARY_BYTECODE) #define DEFINE_MOVE_BYTECODE(name) \ class name : public ByteCodeOffset2 { \ @@ -1910,31 +2005,19 @@ FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(DEFINE_RMW_CMPXCHG_BYTECODE) #undef DEFINE_RMW_BYTECODE // FOR_EACH_BYTECODE_SIMD_ETC_OP -class V128BitSelect : public ByteCode { +class V128BitSelect : public ByteCodeOffset4 { public: V128BitSelect(ByteCodeStackOffset lhs, ByteCodeStackOffset rhs, ByteCodeStackOffset c, ByteCodeStackOffset dst) - : ByteCode(Opcode::V128BitSelectOpcode) - , m_srcOffsets{ lhs, rhs, c } - , m_dstOffset(dst) + : ByteCodeOffset4(Opcode::V128BitSelectOpcode, lhs, rhs, c, dst) { } - const ByteCodeStackOffset* srcOffsets() const - { - return m_srcOffsets; - } - ByteCodeStackOffset dstOffset() const { return m_dstOffset; } - #if !defined(NDEBUG) void dump(size_t pos) { - 
printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_srcOffsets[0], (uint32_t)m_srcOffsets[1], (uint32_t)m_srcOffsets[2], (uint32_t)m_dstOffset); + printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]); } #endif - -protected: - ByteCodeStackOffset m_srcOffsets[3]; - ByteCodeStackOffset m_dstOffset; }; class V128Load32Zero : public MemoryLoad { diff --git a/src/interpreter/Interpreter.cpp b/src/interpreter/Interpreter.cpp index 1caf802e1..b28d23a0a 100644 --- a/src/interpreter/Interpreter.cpp +++ b/src/interpreter/Interpreter.cpp @@ -270,6 +270,19 @@ inline static void simdSwizzleOperation(ExecutionState& state, BinaryOperation* writeValue(bp, code->dstOffset(), result); } +inline static void simdBitSelectOperation(ExecutionState& state, ByteCodeOffset4* code, uint8_t* bp) +{ + using Type = typename SIMDType::Type; + auto src0 = readValue(bp, code->src0Offset()); + auto src1 = readValue(bp, code->src1Offset()); + auto src2 = readValue(bp, code->src2Offset()); + Type result; + for (uint8_t i = 0; i < Type::Lanes; i++) { + result[i] = (src0[i] & src2[i]) | (src1[i] & ~src2[i]); + } + writeValue(bp, code->dstOffset(), result); +} + // FIXME optimize this function template inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp) @@ -286,10 +299,11 @@ inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* c writeValue(bp, code->dstOffset(), result); } +template inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp) { - using ParamType = typename SIMDType::Type; - using ResultType = typename SIMDType::Type; + using ParamType = typename SIMDType

::Type; + using ResultType = typename SIMDType::Type; auto lhs = readValue(bp, code->srcOffset()[0]); auto rhs = readValue(bp, code->srcOffset()[1]); ResultType result; @@ -302,6 +316,26 @@ inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code writeValue(bp, code->dstOffset(), result); } +inline static void simdDotAddOperation(ExecutionState& state, TernaryOperation* code, uint8_t* bp) +{ + using ParamType = typename SIMDType::Type; + using ResultType = typename SIMDType::Type; + auto src0 = readValue(bp, code->src0Offset()); + auto src1 = readValue(bp, code->src1Offset()); + auto src2 = readValue(bp, code->src2Offset()); + ResultType result; + for (uint8_t i = 0; i < ResultType::Lanes; i++) { + uint8_t laneIdx = i * 4; + int16_t lo0 = static_cast(src0[laneIdx]) * static_cast(src1[laneIdx]); + int16_t hi0 = static_cast(src0[laneIdx + 1]) * static_cast(src1[laneIdx + 1]); + int16_t lo1 = static_cast(src0[laneIdx + 2]) * static_cast(src1[laneIdx + 2]); + int16_t hi1 = static_cast(src0[laneIdx + 3]) * static_cast(src1[laneIdx + 3]); + int32_t tmp = static_cast(lo0 + hi0) + static_cast(lo1 + hi1); + result[i] = add(state, tmp, src2[i]); + } + writeValue(bp, code->dstOffset(), result); +} + template inline static void simdNarrowOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp) { @@ -582,6 +616,35 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state, NEXT_INSTRUCTION(); \ } +#define SIMD_TERNARY_OPERATION(name, op, paramType, resultType) \ + DEFINE_OPCODE(name) \ + : \ + { \ + using ParamType = typename SIMDType::Type; \ + using ResultType = typename SIMDType::Type; \ + COMPILE_ASSERT(ParamType::Lanes == ResultType::Lanes, ""); \ + name* code = (name*)programCounter; \ + auto src0 = readValue(bp, code->src0Offset()); \ + auto src1 = readValue(bp, code->src1Offset()); \ + auto src2 = readValue(bp, code->src2Offset()); \ + ResultType result; \ + for (uint8_t i = 0; i < ParamType::Lanes; i++) { \ + result[i] = 
op(state, src0[i], src1[i], src2[i]); \ + } \ + writeValue(bp, code->dstOffset(), result); \ + ADD_PROGRAM_COUNTER(name); \ + NEXT_INSTRUCTION(); \ + } + +#define SIMD_TERNARY_OTHER_OPERATION(name, op) \ + DEFINE_OPCODE(name) \ + : \ + { \ + op(state, (TernaryOperation*)programCounter, bp); \ + ADD_PROGRAM_COUNTER(TernaryOperation); /* step by the same type the operand was decoded as above */ \ + NEXT_INSTRUCTION(); \ + } + #define MEMORY_LOAD_OPERATION(opcodeName, readType, writeType) \ DEFINE_OPCODE(opcodeName) \ : \ @@ -880,9 +943,14 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state, FOR_EACH_BYTECODE_SIMD_BINARY_OP(SIMD_BINARY_OPERATION) FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(SIMD_BINARY_SHIFT_OPERATION) FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION) + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(SIMD_BINARY_OPERATION) + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION) FOR_EACH_BYTECODE_SIMD_UNARY_OP(SIMD_UNARY_OPERATION) FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(SIMD_UNARY_CONVERT_OPERATION) FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION) + FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION) + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(SIMD_TERNARY_OPERATION) + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(SIMD_TERNARY_OTHER_OPERATION) DEFINE_OPCODE(Jump) : @@ -1088,16 +1156,7 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state, DEFINE_OPCODE(V128BitSelect) : { - using Type = typename SIMDType::Type; - V128BitSelect* code = (V128BitSelect*)programCounter; - auto lhs = readValue(bp, code->srcOffsets()[0]); - auto rhs = readValue(bp, code->srcOffsets()[1]); - auto c = readValue(bp, code->srcOffsets()[2]); - Type result; - for (uint8_t i = 0; i < Type::Lanes; i++) { - result[i] = (lhs[i] & c[i]) | (rhs[i] & ~c[i]); - } - writeValue(bp, code->dstOffset(), result); + simdBitSelectOperation(state, (ByteCodeOffset4*)programCounter, bp); ADD_PROGRAM_COUNTER(V128BitSelect); NEXT_INSTRUCTION(); } diff --git 
a/src/parser/WASMParser.cpp b/src/parser/WASMParser.cpp index 98c731011..7dcd295cb 100644 --- a/src/parser/WASMParser.cpp +++ b/src/parser/WASMParser.cpp @@ -1398,15 +1398,23 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate { { auto code = static_cast(opcode); ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[2]) == peekVMStackValueType()); - auto c = popVMStack(); + auto src2 = popVMStack(); ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[1]) == peekVMStackValueType()); - auto rhs = popVMStack(); + auto src1 = popVMStack(); ASSERT(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_paramTypes[0]) == peekVMStackValueType()); - auto lhs = popVMStack(); + auto src0 = popVMStack(); auto dst = computeExprResultPosition(WASMCodeInfo::codeTypeToValueType(g_wasmCodeInfo[opcode].m_resultType)); switch (code) { +#define GENERATE_TERNARY_CODE_CASE(name, ...) \ + case WASMOpcode::name##Opcode: { \ + pushByteCode(Walrus::name(src0, src1, src2, dst), code); \ + break; \ + } + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(GENERATE_TERNARY_CODE_CASE) + FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(GENERATE_TERNARY_CODE_CASE) +#undef GENERATE_TERNARY_CODE_CASE case WASMOpcode::V128BitSelectOpcode: - pushByteCode(Walrus::V128BitSelect(lhs, rhs, c, dst), code); + pushByteCode(Walrus::V128BitSelect(src0, src1, src2, dst), code); break; default: ASSERT_NOT_REACHED(); @@ -2574,6 +2582,8 @@ class WASMBinaryReader : public wabt::WASMBinaryReaderDelegate { FOR_EACH_BYTECODE_SIMD_BINARY_OP(GENERATE_BINARY_CODE_CASE) FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(GENERATE_BINARY_CODE_CASE) FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(GENERATE_BINARY_CODE_CASE) + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(GENERATE_BINARY_CODE_CASE) + FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(GENERATE_BINARY_CODE_CASE) #undef GENERATE_BINARY_CODE_CASE default: ASSERT_NOT_REACHED(); @@ -2594,6 +2604,7 @@ class WASMBinaryReader : public 
wabt::WASMBinaryReaderDelegate { FOR_EACH_BYTECODE_SIMD_UNARY_OP(GENERATE_UNARY_CODE_CASE) FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(GENERATE_UNARY_CODE_CASE) FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(GENERATE_UNARY_CODE_CASE) + FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(GENERATE_UNARY_CODE_CASE) #undef GENERATE_UNARY_CODE_CASE case WASMOpcode::I32ReinterpretF32Opcode: pushByteCode(Walrus::I32ReinterpretF32(src, dst), code); diff --git a/src/shell/Shell.cpp b/src/shell/Shell.cpp index 0199c8081..60015700c 100644 --- a/src/shell/Shell.cpp +++ b/src/shell/Shell.cpp @@ -592,7 +592,7 @@ static void printConstVector(wabt::ConstVector& v) } static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn, wabt::ConstVector expectedResult, - const char* expectedException, bool expectUserException = false) + const char* expectedException, bool expectUserException = false, bool either = false) { if (fn->functionType()->param().size() != action->args.size()) { printf("Error: expected %zu parameter(s) but got %zu.\n", fn->functionType()->param().size(), action->args.size()); @@ -608,7 +608,8 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn wabt::ConstVector& expectedResult; Walrus::ValueVector& args; wabt::InvokeAction* action; - } data = { fn, expectedResult, args, action }; + bool either; + } data = { fn, expectedResult, args, action, either }; Walrus::Trap trap; auto trapResult = trap.run([](Walrus::ExecutionState& state, void* d) { RunData* data = reinterpret_cast(d); @@ -616,22 +617,48 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn result.resize(data->fn->functionType()->result().size()); data->fn->call(state, data->args.data(), result.data()); if (data->expectedResult.size()) { - if (data->fn->functionType()->result().size() != data->expectedResult.size()) { - printf("Error: %s returned with %zu parameter(s) but expected %zu", data->action->name.data(), 
data->fn->functionType()->result().size(), data->expectedResult.size()); - RELEASE_ASSERT_NOT_REACHED(); - } - // compare result - for (size_t i = 0; i < result.size(); i++) { - if (!equals(result[i], data->expectedResult[i])) { - printf("Assertion failed at %d: ", data->action->loc.line); - printf("%s(", data->action->name.data()); - printConstVector(data->action->args); - printf(") expected "); - printConstVector(data->expectedResult); - printf(", but got %s\n", ((std::string)result[i]).c_str()); + int errorIndex = -1; + + if (data->either) { + if (data->fn->functionType()->result().size() != 1) { + printf("Error: %s returned with %zu parameter(s) but expected 1", data->action->name.data(), data->fn->functionType()->result().size()); + RELEASE_ASSERT_NOT_REACHED(); + } + + // compare result + for (size_t i = 0; i < data->expectedResult.size(); i++) { + if (equals(result[0], data->expectedResult[i])) { + return; + } + } + + errorIndex = 0; + } else { + if (data->fn->functionType()->result().size() != data->expectedResult.size()) { + printf("Error: %s returned with %zu parameter(s) but expected %zu", data->action->name.data(), data->fn->functionType()->result().size(), data->expectedResult.size()); RELEASE_ASSERT_NOT_REACHED(); } + + // compare result + for (size_t i = 0; i < result.size(); i++) { + if (!equals(result[i], data->expectedResult[i])) { + errorIndex = i; + break; + } + } + + if (errorIndex == -1) { + return; + } } + + printf("Assertion failed at %d: ", data->action->loc.line); + printf("%s(", data->action->name.data()); + printConstVector(data->action->args); + printf(") %sexpected ", data->either ? 
"any " : ""); + printConstVector(data->expectedResult); + printf(", but got %s\n", ((std::string)result[errorIndex]).c_str()); + RELEASE_ASSERT_NOT_REACHED(); } }, &data); @@ -667,7 +694,7 @@ static void executeInvokeAction(wabt::InvokeAction* action, Walrus::Function* fn } else if (expectedResult.size()) { printf("invoke %s(", action->name.data()); printConstVector(action->args); - printf(") expect value("); + printf(") expect %svalue(", either ? "either " : ""); printConstVector(expectedResult); printf(") (line: %d) : OK\n", action->loc.line); } @@ -745,7 +772,7 @@ static void executeWAST(Store* store, const std::string& filename, const std::ve if (assertReturn->action->type() == wabt::ActionType::Invoke) { auto action = static_cast(assertReturn->action.get()); auto fn = fetchInstance(action->module_var, instanceMap, registeredInstanceMap)->resolveExportFunction(action->name); - executeInvokeAction(action, fn, assertReturn->expected->expected, nullptr); + executeInvokeAction(action, fn, assertReturn->expected->expected, nullptr, false, assertReturn->expected->type() == wabt::ExpectationType::Either); } else if (assertReturn->action->type() == wabt::ActionType::Get) { auto action = static_cast(assertReturn->action.get()); auto v = fetchInstance(action->module_var, instanceMap, registeredInstanceMap)->resolveExportGlobal(action->name)->value(); diff --git a/src/util/MathOperation.h b/src/util/MathOperation.h index f01040813..a688ab40b 100644 --- a/src/util/MathOperation.h +++ b/src/util/MathOperation.h @@ -319,6 +319,18 @@ ALWAYS_INLINE T floatPMax(ExecutionState& state, T lhs, T rhs) return std::max(lhs, rhs); } +template +ALWAYS_INLINE T floatMulAdd(ExecutionState& state, T a, T b, T c) +{ + return (a * b) + c; +} + +template +ALWAYS_INLINE T floatNegMulAdd(ExecutionState& state, T a, T b, T c) +{ + return -(a * b) + c; +} + template bool canConvert(T val) { return true; } template <> @@ -591,10 +603,10 @@ T saturatingRoundingQMul(ExecutionState& state, T lhs, T 
rhs) { constexpr int size_in_bits = sizeof(T) * 8; int round_const = 1 << (size_in_bits - 2); - int64_t product = (int64_t)lhs * rhs; + int32_t product = (int32_t)lhs * rhs; product += round_const; product >>= (size_in_bits - 1); - return saturate(product); + return saturate(product); } } // namespace Walrus diff --git a/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast b/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast new file mode 100644 index 000000000..00f901cbc --- /dev/null +++ b/test/extended/relaxed-simd/i16x8_relaxed_q15mulr_s.wast @@ -0,0 +1,28 @@ +;; Tests for i16x8.relaxed_q15mulr_s. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_q15mulr_s") (param v128 v128) (result v128) (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1))) + + (func (export "i16x8.relaxed_q15mulr_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)) + (i16x8.relaxed_q15mulr_s (local.get 0) (local.get 1)))) +) + +;; INT16_MIN = -32768 +(assert_return (invoke "i16x8.relaxed_q15mulr_s" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (either (v128.const i16x8 -32768 32767 32766 0 0 0 0 0) + (v128.const i16x8 32767 32767 32766 0 0 0 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +(assert_return (invoke "i16x8.relaxed_q15mulr_s_cmp" + (v128.const i16x8 -32768 -32767 32767 0 0 0 0 0) + (v128.const i16x8 -32768 -32768 32767 0 0 0 0 0)) + ;; overflows, return either INT16_MIN or INT16_MAX + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + diff --git a/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast b/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast new file mode 100644 index 000000000..cca3ecb95 --- /dev/null +++ b/test/extended/relaxed-simd/i32x4_relaxed_trunc.wast @@ -0,0 +1,124 @@ +;; Tests for i32x4.relaxed_trunc_f32x4_s, i32x4.relaxed_trunc_f32x4_u, i32x4.relaxed_trunc_f64x2_s_zero, and i32x4.relaxed_trunc_f64x2_u_zero. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i32x4.relaxed_trunc_f32x4_s") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_s (local.get 0))) + (func (export "i32x4.relaxed_trunc_f32x4_u") (param v128) (result v128) (i32x4.relaxed_trunc_f32x4_u (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero") (param v128) (result v128) (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0))) + + (func (export "i32x4.relaxed_trunc_f32x4_s_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_s (local.get 0)) + (i32x4.relaxed_trunc_f32x4_s (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f32x4_u_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f32x4_u (local.get 0)) + (i32x4.relaxed_trunc_f32x4_u (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_s_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_s_zero (local.get 0)))) + (func (export "i32x4.relaxed_trunc_f64x2_u_zero_cmp") (param v128) (result v128) + (i32x4.eq + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)) + (i32x4.relaxed_trunc_f64x2_u_zero (local.get 0)))) 
+) + +;; Test some edge cases around min/max to ensure that the instruction either +;; saturates correctly or returns INT_MIN. +;; +;; Note, though, that INT_MAX itself is not tested. The value for INT_MAX is +;; 2147483647 but that is not representable in a `f32` since it requires 31 bits +;; when a f32 has only 24 bits available. This means that the closest integers +;; to INT_MAX which can be represented are 2147483520 and 2147483648, meaning +;; that the INT_MAX test case cannot be tested. +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 -2147483648 2 2147483647) + (v128.const i32x4 -2147483648 -2147483648 2 -2147483648))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0x80000000 0x80000000))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + ;; UINT32_MIN UINT32_MIN-1 saturate or UINT32_MAX + (either (v128.const i32x4 0 0 4294967040 0xffffffff) + (v128.const i32x4 0 0xffffffff 4294967040 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0xffffffff 0xffffffff))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (either (v128.const i32x4 -2147483648 2147483647 0 0) + (v128.const i32x4 -2147483648 -2147483648 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0x80000000 0x80000000 0 0))) + +(assert_return (invoke 
"i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (either (v128.const i32x4 0 0xffffffff 0 0) + (v128.const i32x4 0xffffffff 0xffffffff 0 0))) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero" + (v128.const f64x2 nan -nan)) + (either (v128.const i32x4 0 0 0 0) + (v128.const i32x4 0 0 0xffffffff 0xffffffff))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + ;; INT32_MIN INT32_MAX + (v128.const f32x4 -2147483648.0 -2147483904.0 2147483647.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_s_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + ;; UINT32_MIN UINT32_MIN-1 saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f32x4_u_cmp" + (v128.const f32x4 nan -nan nan:0x444444 -nan:0x444444)) + ;; nans -> 0 or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 -2147483904.0 2147483904.0)) + ;; out of range -> saturate or INT32_MIN + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_s_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 -1.0 4294967296.0)) + ;; out of range -> saturate or UINT32_MAX + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_trunc_f64x2_u_zero_cmp" + (v128.const f64x2 nan -nan)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast b/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast new 
file mode 100644 index 000000000..f1bcb4552 --- /dev/null +++ b/test/extended/relaxed-simd/i8x16_relaxed_swizzle.wast @@ -0,0 +1,45 @@ +;; Tests for relaxed i8x16 swizzle. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i8x16.relaxed_swizzle") (param v128 v128) (result v128) (i8x16.relaxed_swizzle (local.get 0) (local.get 1))) + + (func (export "i8x16.relaxed_swizzle_cmp") (param v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)) + (i8x16.relaxed_swizzle (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (either (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 or modulo 15 if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (either (v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; out of range, returns 0 or lane (index modulo 16) if < 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; out of range, returns 0 if >= 128 +(assert_return (invoke "i8x16.relaxed_swizzle_cmp" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 128 129 130 131 132 133 134 135 248 249 250 251 252 253 254 255)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) diff --git a/test/extended/relaxed-simd/relaxed_dot_product.wast b/test/extended/relaxed-simd/relaxed_dot_product.wast new file mode 100644 index 000000000..48714b87b --- /dev/null +++ b/test/extended/relaxed-simd/relaxed_dot_product.wast @@ -0,0 +1,107 @@ +;; Tests for relaxed dot products. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s") (param v128 v128) (result v128) (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s") (param v128 v128 v128) (result v128) (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i16x8.relaxed_dot_i8x16_i7x16_s_cmp") (param v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)) + (i16x8.relaxed_dot_i8x16_i7x16_s (local.get 0) (local.get 1)))) + (func (export "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq ;; compare per 32-bit result lane, matching the i32x4 expectation + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_dot_i8x16_i7x16_add_s (local.get 0) (local.get 1) (local.get 2)))) +) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)) + (v128.const i16x8 1 13 41 85 145 221 313 421)) + +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -32512 32258 0 0 0 0 0 0)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (either + (v128.const i16x8 -32768 0 0 0 0 0 0 0) + (v128.const i16x8 32512 0 0 0 0 0 0 0) + (v128.const i16x8 33024 0 0 0 0 0 0 0))) + +;; Simple values to ensure things are functional. 
+(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i32x4 0 1 2 3)) + ;; intermediate result is [14, 126, 366, 734] + (v128.const i32x4 14 127 368 737)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -65023 64518 3 4)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (either + (v128.const i32x4 -66047 2 3 4) + (v128.const i32x4 -65535 2 3 4) + (v128.const i32x4 65025 2 3 4) + (v128.const i32x4 66049 2 3 4))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +;; Test max and min i8 values; +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 127 127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; Test max and min i8 values; +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i8x16 127 127 127 127 127 127 127 127 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + ;; intermediate result is [-65024, 64516, 0, 0] + (v128.const i32x4 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 2 = -33,024 saturated to -32,768 +;; signed * signed : -128 * -127 * 2 = 32,512 +;; unsigned * unsigned : 128 * 129 * 2 = 33,024 +(assert_return (invoke "i16x8.relaxed_dot_i8x16_i7x16_s_cmp" + (v128.const i8x16 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +;; signed * unsigned : -128 * 129 * 4 = -66,048 (+ 1) VPDPBUSD AVX2-VNNI or AVX512-VNNI +;; signed * unsigned with intermediate saturation : +;; (-128 * 129) + (-128 * 129) = -33024 saturated to -32768 (PMADDUBSW) +;; -32768 + -32768 = -65536 (+ 1) +;; signed * signed : -128 * -127 * 4 = 65,024 (+ 1) +;; unsigned * unsigned : 128 * 129 * 4 = 66,048 (+ 1) +(assert_return (invoke "i32x4.relaxed_dot_i8x16_i7x16_add_s_cmp" + (v128.const i8x16 -128 -128 -128 -128 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i8x16 -127 -127 -127 -127 0 0 0 0 0 0 0 0 0 0 0 0) + (v128.const i32x4 1 2 3 4)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/extended/relaxed-simd/relaxed_laneselect.wast b/test/extended/relaxed-simd/relaxed_laneselect.wast new file mode 100644 index 000000000..10913816b --- /dev/null +++ b/test/extended/relaxed-simd/relaxed_laneselect.wast @@ -0,0 +1,103 @@ +;; Tests for i8x16.relaxed_laneselect, i16x8.relaxed_laneselect, i32x4.relaxed_laneselect, and 
i64x2.relaxed_laneselect. +;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "i8x16.relaxed_laneselect") (param v128 v128 v128) (result v128) (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i16x8.relaxed_laneselect") (param v128 v128 v128) (result v128) (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i32x4.relaxed_laneselect") (param v128 v128 v128) (result v128) (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + (func (export "i64x2.relaxed_laneselect") (param v128 v128 v128) (result v128) (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2))) + + (func (export "i8x16.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i8x16.eq + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i8x16.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i16x8.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i16x8.eq + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i16x8.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i32x4.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i32x4.eq + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i32x4.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) + (func (export "i64x2.relaxed_laneselect_cmp") (param v128 v128 v128) (result v128) + (i64x2.eq + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)) + (i64x2.relaxed_laneselect (local.get 0) (local.get 1) (local.get 2)))) +) + +(assert_return (invoke "i8x16.relaxed_laneselect" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (either (v128.const i8x16 0 17 0x14 0x32 20 21 22 23 24 25 26 27 28 
29 30 31) + (v128.const i8x16 0 17 0x12 0x34 20 21 22 23 24 25 26 27 28 29 30 31))) + +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (either (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15))) + +;; special case for i16x8 to allow pblendvb +(assert_return (invoke "i16x8.relaxed_laneselect" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x0080 0 0 0 0)) ;; 0x0080 is the special case + (either (v128.const i16x8 0 9 0x1278 0x5678 12 13 14 15) ;; bitselect + (v128.const i16x8 0 9 0x1234 0x5678 12 13 14 15) ;; top bit of i16 lane examined + (v128.const i16x8 0 9 0x1278 0x5634 12 13 14 15) ;; top bit of each byte + )) + +(assert_return (invoke "i32x4.relaxed_laneselect" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (either (v128.const i32x4 0 5 0x12345678 0x56781234) + (v128.const i32x4 0 5 0x12341234 0x56785678))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (either (v128.const i64x2 0 3) + (v128.const i64x2 0 3))) + +(assert_return (invoke "i64x2.relaxed_laneselect" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (either (v128.const i64x2 0x1234123456785678 0x5678567812341234) + (v128.const i64x2 0x1234123412341234 0x5678567856785678))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. 
+ +(assert_return (invoke "i8x16.relaxed_laneselect_cmp" + (v128.const i8x16 0 1 0x12 0x12 4 5 6 7 8 9 10 11 12 13 14 15) + (v128.const i8x16 16 17 0x34 0x34 20 21 22 23 24 25 26 27 28 29 30 31) + (v128.const i8x16 0xff 0 0xf0 0x0f 0 0 0 0 0 0 0 0 0 0 0 0)) + (v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i16x8.relaxed_laneselect_cmp" + (v128.const i16x8 0 1 0x1234 0x1234 4 5 6 7) + (v128.const i16x8 8 9 0x5678 0x5678 12 13 14 15) + (v128.const i16x8 0xffff 0 0xff00 0x00ff 0 0 0 0)) + (v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1)) + +(assert_return (invoke "i32x4.relaxed_laneselect_cmp" + (v128.const i32x4 0 1 0x12341234 0x12341234) + (v128.const i32x4 4 5 0x56785678 0x56785678) + (v128.const i32x4 0xffffffff 0 0xffff0000 0x0000ffff)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0 1) + (v128.const i64x2 2 3) + (v128.const i64x2 0xffffffffffffffff 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "i64x2.relaxed_laneselect_cmp" + (v128.const i64x2 0x1234123412341234 0x1234123412341234) + (v128.const i64x2 0x5678567856785678 0x5678567856785678) + (v128.const i64x2 0xffffffff00000000 0x00000000ffffffff)) + (v128.const i64x2 -1 -1)) diff --git a/test/extended/relaxed-simd/relaxed_madd_nmadd.wast b/test/extended/relaxed-simd/relaxed_madd_nmadd.wast new file mode 100644 index 000000000..187b71d5a --- /dev/null +++ b/test/extended/relaxed-simd/relaxed_madd_nmadd.wast @@ -0,0 +1,224 @@ +;; Tests for f32x4.relaxed_madd, f32x4.relaxed_nmadd, f64x2.relaxed_madd, and f64x2.relaxed_nmadd. +;; `either` comes from https://github.com/WebAssembly/threads. 
+ +(module + (func (export "f32x4.relaxed_madd") (param v128 v128 v128) (result v128) (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f32x4.relaxed_nmadd") (param v128 v128 v128) (result v128) (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_nmadd") (param v128 v128 v128) (result v128) (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2))) + (func (export "f64x2.relaxed_madd") (param v128 v128 v128) (result v128) (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2))) + + (func (export "f32x4.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f32x4.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f32x4.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_nmadd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_nmadd (local.get 0) (local.get 1) (local.get 2)))) + (func (export "f64x2.relaxed_madd_cmp") (param v128 v128 v128) (result v128) + (f64x2.eq + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)) + (f64x2.relaxed_madd (local.get 0) (local.get 1) (local.get 2)))) +) + + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; inf (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (either (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127) + (v128.const 
f32x4 inf inf inf inf))) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (either (v128.const f32x4 0x1p-37 0x1p-37 0x1p-37 0x1p-37) + (v128.const f32x4 0 0 0 0))) + +;; DBL_MAX = 0x1.fffffffffffffp+1023 +;; DBL_MAX * 2 - DBL_MAX == +;; DBL_MAX (if fma) +;; inf (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 2.0 2.0) + (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) + (either (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 inf inf))) + +;; Special values for double: +;; x = 0x1.00000004p+0 (1 + 2^-30) +;; y = 0x1.000002p+0 (1 + 2^-23) +;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0) +;; = -0x1.00000204p+0 +;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-53 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f64x2.relaxed_madd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f64x2.relaxed_nmadd" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (either (v128.const f64x2 0x1p-53 0x1p-53) + (v128.const f64x2 0 0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; inf (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; Special values for float: +;; x = 0x1.000004p+0 (1 + 2^-22) +;; y = 0x1.0002p+0 (1 + 2^-15) +;; z = -(1.0 + 0x0.0002p+0 + 0x0.000004p+0) +;; = -0x1.000204p+0 +;; x.y = 1.0 + 0x0.0002p+0 + 0x0.000004p+0 + 0x1p-37 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = (0x1p-37) 2^-37 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f32x4.relaxed_madd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0 -0x1.000004p+0) + (v128.const f32x4 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0 0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f32x4.relaxed_nmadd_cmp" + (v128.const f32x4 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0 0x1.000004p+0) + (v128.const f32x4 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0 -0x1.0002p+0) + (v128.const f32x4 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0 -0x1.000204p+0)) + (v128.const i32x4 -1 -1 -1 -1)) + +;; DBL_MAX = 0x1.fffffffffffffp+1023 +;; DBL_MAX * 2 - DBL_MAX == +;; DBL_MAX (if fma) +;; inf (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +;; from https://www.vinc17.net/software/fma-tests.c +(assert_return (invoke "f64x2.relaxed_madd_cmp" + (v128.const f64x2 0x1.fffffffffffffp+1023 0x1.fffffffffffffp+1023) + (v128.const f64x2 2.0 2.0) + (v128.const f64x2 -0x1.fffffffffffffp+1023 -0x1.fffffffffffffp+1023)) + (v128.const i64x2 -1 -1)) + +;; Special values for double: +;; x = 0x1.00000004p+0 (1 + 2^-30) +;; y = 0x1.000002p+0 (1 + 2^-23) +;; z = -(1.0 + 0x0.000002p+0 + 0x0.00000004p+0) +;; = -0x1.00000204p+0 +;; x.y = 1.0 + 0x0.000002p+0 + 0x0.00000004p+0 + 0x1p-53 (round bit) +;; x.y+z = 0 (2 roundings) +;; fma(x, y, z) = 0x1p-53 +;; from https://accurate-algorithms.readthedocs.io/en/latest/ch09appendix.html#test-system-information +(assert_return (invoke "f64x2.relaxed_madd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) +;; nmadd tests with negated x, same answers are expected. +(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 -0x1.00000004p+0 -0x1.00000004p+0) + (v128.const f64x2 0x1.000002p+0 0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) +;; nmadd tests with negated y, same answers are expected. 
+(assert_return (invoke "f64x2.relaxed_nmadd_cmp" + (v128.const f64x2 0x1.00000004p+0 0x1.00000004p+0) + (v128.const f64x2 -0x1.000002p+0 -0x1.000002p+0) + (v128.const f64x2 -0x1.00000204p+0 -0x1.00000204p+0)) + (v128.const i64x2 -1 -1)) + +;; Test that the non-deterministic choice of fusing and then rounding or +;; rounding multiple times in `relaxed_madd` is consistent throughout a +;; program's execution. +;; +;; This property is impossible to test exhaustively, so this is just a simple +;; smoke test for when the operands to a `relaxed_madd` are known statically +;; versus when they are dynamically supplied. This should, at least, catch +;; illegal constant-folding and -propagation by the compiler that leads to +;; inconsistent rounding behavior at compile time versus at run time. +;; +;; FLT_MAX == 0x1.fffffep+127 +;; FLT_MAX * 2 - FLT_MAX == +;; FLT_MAX (if fma) +;; inf (if no fma) +;; from https://www.vinc17.net/software/fma-tests.c +(module + (func (export "test-consistent-nondeterminism") (param v128 v128 v128) (result v128) + (f32x4.eq + (f32x4.relaxed_madd (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (f32x4.relaxed_madd (local.get 0) + (local.get 1) + (local.get 2)) + ) + ) +) +(assert_return (invoke "test-consistent-nondeterminism" + (v128.const f32x4 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 0x1.fffffep+127 ) + (v128.const f32x4 2.0 2.0 2.0 2.0) + (v128.const f32x4 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127 -0x1.fffffep+127)) + (v128.const i32x4 -1 -1 -1 -1)) diff --git a/test/extended/relaxed-simd/relaxed_min_max.wast b/test/extended/relaxed-simd/relaxed_min_max.wast new file mode 100644 index 000000000..ac3ebb07c --- /dev/null +++ b/test/extended/relaxed-simd/relaxed_min_max.wast @@ -0,0 +1,184 @@ +;; Tests for f32x4.relaxed_min, f32x4.relaxed_max, f64x2.relaxed_min, and f64x2.relaxed_max. 
+;; `either` comes from https://github.com/WebAssembly/threads. + +(module + (func (export "f32x4.relaxed_min") (param v128 v128) (result v128) (f32x4.relaxed_min (local.get 0) (local.get 1))) + (func (export "f32x4.relaxed_max") (param v128 v128) (result v128) (f32x4.relaxed_max (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_min") (param v128 v128) (result v128) (f64x2.relaxed_min (local.get 0) (local.get 1))) + (func (export "f64x2.relaxed_max") (param v128 v128) (result v128) (f64x2.relaxed_max (local.get 0) (local.get 1))) + + (func (export "f32x4.relaxed_min_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_min (local.get 0) (local.get 1)) + (f32x4.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f32x4.relaxed_max_cmp") (param v128 v128) (result v128) + (i32x4.eq + (f32x4.relaxed_max (local.get 0) (local.get 1)) + (f32x4.relaxed_max (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_min_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_min (local.get 0) (local.get 1)) + (f64x2.relaxed_min (local.get 0) (local.get 1)))) + (func (export "f64x2.relaxed_max_cmp") (param v128 v128) (result v128) + (i64x2.eq + (f64x2.relaxed_max (local.get 0) (local.get 1)) + (f64x2.relaxed_max (local.get 0) (local.get 1)))) +) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_min" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 -0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const 
f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (either (v128.const f32x4 nan:canonical nan:canonical nan:canonical nan:canonical) + (v128.const f32x4 nan:canonical nan:canonical 0 0) + (v128.const f32x4 0 0 nan:canonical nan:canonical) + (v128.const f32x4 0 0 0 0))) + +(assert_return (invoke "f32x4.relaxed_max" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (either (v128.const f32x4 +0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 -0.0 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 -0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_min" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (either (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (either (v128.const f64x2 nan:canonical 
nan:canonical) + (v128.const f64x2 0 0) + (v128.const f64x2 nan:canonical nan:canonical) + (v128.const f64x2 0 0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (either (v128.const f64x2 +0.0 +0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0) + (v128.const f64x2 -0.0 -0.0))) + +(assert_return (invoke "f64x2.relaxed_max" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (either (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0))) + +;; Check that multiple calls to the relaxed instruction with same inputs returns same results. + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_min_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 -nan nan 0 0) + (v128.const f32x4 0 0 -nan nan)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f32x4.relaxed_max_cmp" + (v128.const f32x4 +0.0 -0.0 +0.0 -0.0) + (v128.const f32x4 -0.0 +0.0 +0.0 -0.0)) + (v128.const i32x4 -1 -1 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_min_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 -nan nan) + (v128.const f64x2 0 0)) + 
(v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 0 0) + (v128.const f64x2 -nan nan)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 -0.0 +0.0)) + (v128.const i64x2 -1 -1)) + +(assert_return (invoke "f64x2.relaxed_max_cmp" + (v128.const f64x2 +0.0 -0.0) + (v128.const f64x2 +0.0 -0.0)) + (v128.const i64x2 -1 -1)) diff --git a/third_party/wabt/src/walrus/binary-reader-walrus.cc b/third_party/wabt/src/walrus/binary-reader-walrus.cc index 7d8e224b8..4a94fd7b4 100644 --- a/third_party/wabt/src/walrus/binary-reader-walrus.cc +++ b/third_party/wabt/src/walrus/binary-reader-walrus.cc @@ -84,6 +84,8 @@ static Features getFeatures() { features.enable_exceptions(); // TODO: should use command line flag for this (--enable-threads) features.enable_threads(); + // TODO: should use command line flag for this (--enable-relaxed-simd) + features.enable_relaxed_simd(); return features; } diff --git a/tools/jit_exclude_list.txt b/tools/jit_exclude_list.txt index e69de29bb..3ef9aed46 100644 --- a/tools/jit_exclude_list.txt +++ b/tools/jit_exclude_list.txt @@ -0,0 +1,7 @@ +i16x8_relaxed_q15mulr_s.wast +i32x4_relaxed_trunc.wast +i8x16_relaxed_swizzle.wast +relaxed_dot_product.wast +relaxed_laneselect.wast +relaxed_madd_nmadd.wast +relaxed_min_max.wast