Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement relaxed simd in the interpreter #293

Merged
merged 1 commit into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 122 additions & 40 deletions src/interpreter/ByteCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ class FunctionType;
F(I64X2ExtmulHighI32X4S, (simdExtmulOperation<int32_t, int64_t, false>)) \
F(I64X2ExtmulLowI32X4U, (simdExtmulOperation<uint32_t, uint64_t, true>)) \
F(I64X2ExtmulHighI32X4U, (simdExtmulOperation<uint32_t, uint64_t, false>)) \
F(I32X4DotI16X8S, (simdDotOperation)) \
F(I32X4DotI16X8S, (simdDotOperation<int16_t, uint32_t>)) \
F(I8X16NarrowI16X8S, (simdNarrowOperation<int16_t, int8_t>)) \
F(I8X16NarrowI16X8U, (simdNarrowOperation<int16_t, uint8_t>)) \
F(I16X8NarrowI32X4S, (simdNarrowOperation<int32_t, int16_t>)) \
Expand Down Expand Up @@ -588,30 +588,65 @@ class FunctionType;
F(MemoryAtomicWait64) \
F(AtomicFence)

#define FOR_EACH_BYTECODE(F) \
FOR_EACH_BYTECODE_OP(F) \
FOR_EACH_BYTECODE_BINARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP_2(F) \
FOR_EACH_BYTECODE_LOAD_OP(F) \
FOR_EACH_BYTECODE_STORE_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \
// Relaxed-SIMD unary opcodes, listed as F(opcodeName, (handlerFunction)).
// Every entry reuses one of the simdTruncSat*Operation handler templates; no
// relaxed-specific handler is named here.
// The list is expanded with DEFINE_UNARY_BYTECODE (bytecode classes) and with
// SIMD_UNARY_OTHER_OPERATION (interpreter dispatch).
// NOTE(review): the template arguments look like <source lane type, result
// lane type> -- confirm against the handler definitions.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \
F(I32X4RelaxedTruncF32X4S, (simdTruncSatOperation<float, int32_t>)) \
F(I32X4RelaxedTruncF32X4U, (simdTruncSatOperation<float, uint32_t>)) \
F(I32X4RelaxedTruncF64X2SZero, (simdTruncSatZeroOperation<double, int32_t>)) \
F(I32X4RelaxedTruncF64X2UZero, (simdTruncSatZeroOperation<double, uint32_t>))

// Relaxed-SIMD binary opcodes, listed as F(opcodeName, op, type, type).
// The list is expanded with DEFINE_BINARY_BYTECODE (bytecode classes) and with
// SIMD_BINARY_OPERATION (interpreter dispatch).
// NOTE(review): the two trailing arguments are presumably the parameter and
// result lane types, by analogy with SIMD_TERNARY_OPERATION -- confirm against
// SIMD_BINARY_OPERATION.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \
F(F32X4RelaxedMin, floatMin, float, float) \
F(F32X4RelaxedMax, floatMax, float, float) \
F(F64X2RelaxedMin, floatMin, double, double) \
F(F64X2RelaxedMax, floatMax, double, double) \
F(I16X8RelaxedQ15mulrS, saturatingRoundingQMul, int16_t, int16_t)

// Relaxed-SIMD binary opcodes that are implemented by a whole-vector handler
// function, listed as F(opcodeName, (handlerFunction)).
// The list is expanded with DEFINE_BINARY_BYTECODE (bytecode classes) and with
// SIMD_BINARY_OTHER_OPERATION (interpreter dispatch).
#define FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \
F(I8X16RelaxedSwizzle, (simdSwizzleOperation<uint8_t>)) \
F(I16X8DotI8X16I7X16S, (simdDotOperation<int8_t, uint16_t>))

// Relaxed-SIMD ternary opcodes evaluated lane by lane, listed as
// F(opcodeName, op, paramType, resultType).
// The list is expanded with DEFINE_TERNARY_BYTECODE (bytecode classes) and with
// SIMD_TERNARY_OPERATION, which calls op(state, a, b, c) once per lane.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \
F(F32X4RelaxedMadd, floatMulAdd, float, float) \
F(F32X4RelaxedNmadd, floatNegMulAdd, float, float) \
F(F64X2RelaxedMadd, floatMulAdd, double, double) \
F(F64X2RelaxedNmadd, floatNegMulAdd, double, double)

// Relaxed-SIMD ternary opcodes implemented by a whole-vector handler function,
// listed as F(opcodeName, (handlerFunction)).
// The list is expanded with DEFINE_TERNARY_BYTECODE (bytecode classes) and with
// SIMD_TERNARY_OTHER_OPERATION, which calls
// handler(state, (TernaryOperation*)programCounter, bp).
// All four lane-select opcodes map to the same simdBitSelectOperation handler,
// regardless of their lane width.
#define FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
F(I32X4DotI8X16I7X16AddS, (simdDotAddOperation)) \
F(I8X16RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I16X8RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I32X4RelaxedLaneSelect, (simdBitSelectOperation)) \
F(I64X2RelaxedLaneSelect, (simdBitSelectOperation))

// Master list: applies F to every per-category bytecode list, one category
// after another, including the relaxed-SIMD categories.
// NOTE(review): the consumers of this list are not visible here; if any of
// them derive opcode numbering from expansion order, moving a line changes
// that numbering -- confirm before reordering.
#define FOR_EACH_BYTECODE(F) \
FOR_EACH_BYTECODE_OP(F) \
FOR_EACH_BYTECODE_BINARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP(F) \
FOR_EACH_BYTECODE_UNARY_OP_2(F) \
FOR_EACH_BYTECODE_LOAD_OP(F) \
FOR_EACH_BYTECODE_STORE_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(F) \
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OP(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(F) \
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_SPLAT_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_EXTEND_OP(F) \
FOR_EACH_BYTECODE_SIMD_LOAD_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_STORE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_EXTRACT_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_REPLACE_LANE_OP(F) \
FOR_EACH_BYTECODE_SIMD_ETC_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_LOAD_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_STORE_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(F) \
FOR_EACH_BYTECODE_ATOMIC_OTHER(F)

class ByteCode {
Expand Down Expand Up @@ -726,6 +761,24 @@ class ByteCodeOffset2Value : public ByteCode {
uint32_t m_value;
};

// ByteCode that carries four stack offsets: three sources followed by one
// destination, stored together in a single array.
class ByteCodeOffset4 : public ByteCode {
public:
    ByteCodeOffset4(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset)
        : ByteCode(opcode)
        , m_stackOffsets{ src0Offset, src1Offset, src2Offset, dstOffset }
    {
    }

    // Start of the offset array; the three source offsets come first.
    const ByteCodeStackOffset* srcOffsets() const
    {
        return &m_stackOffsets[0];
    }

    // Individual source offsets, in constructor order.
    ByteCodeStackOffset src0Offset() const
    {
        return m_stackOffsets[0];
    }
    ByteCodeStackOffset src1Offset() const
    {
        return m_stackOffsets[1];
    }
    ByteCodeStackOffset src2Offset() const
    {
        return m_stackOffsets[2];
    }

    // Destination offset, kept in the last slot.
    ByteCodeStackOffset dstOffset() const
    {
        return m_stackOffsets[3];
    }

protected:
    // Slots 0..2 hold the sources, slot 3 holds the destination.
    ByteCodeStackOffset m_stackOffsets[4];
};

class ByteCodeOffset4Value : public ByteCode {
public:
ByteCodeOffset4Value(Opcode opcode, ByteCodeStackOffset src0Offset, ByteCodeStackOffset src1Offset, ByteCodeStackOffset src2Offset, ByteCodeStackOffset dstOffset, uint32_t value)
Expand Down Expand Up @@ -923,15 +976,56 @@ class UnaryOperation : public ByteCodeOffset2 {
DEFINE_UNARY_BYTECODE_DUMP(name) \
};

// Placeholder ByteCode shared by all ternary operations: it adds no state of
// its own on top of ByteCodeOffset4 and serves as the common base of the
// classes generated by DEFINE_TERNARY_BYTECODE.
class TernaryOperation : public ByteCodeOffset4 {
public:
    TernaryOperation(Opcode code,
                     ByteCodeStackOffset src0Offset,
                     ByteCodeStackOffset src1Offset,
                     ByteCodeStackOffset src2Offset,
                     ByteCodeStackOffset dstOffset)
        : ByteCodeOffset4(code, src0Offset, src1Offset, src2Offset, dstOffset)
    {
    }

#ifndef NDEBUG
    // Deliberately prints nothing; the generated subclasses provide their own dump.
    void dump(size_t pos)
    {
    }
#endif
};

// Emits a dump() member for a ternary bytecode class. In builds with NDEBUG
// defined it expands to nothing; otherwise it prints the opcode name followed
// by the three source offsets and the destination offset.
#if defined(NDEBUG)
#define DEFINE_TERNARY_BYTECODE_DUMP(name)
#else
#define DEFINE_TERNARY_BYTECODE_DUMP(name)                                                            \
    void dump(size_t pos)                                                                             \
    {                                                                                                 \
        printf(#name " src1: %" PRIu32 " src2: %" PRIu32 " src3: %" PRIu32 " dst: %" PRIu32,          \
               (uint32_t)m_stackOffsets[0],                                                           \
               (uint32_t)m_stackOffsets[1],                                                           \
               (uint32_t)m_stackOffsets[2],                                                           \
               (uint32_t)m_stackOffsets[3]);                                                          \
    }
#endif

// Generates the bytecode class `name` for a ternary opcode. The class derives
// from TernaryOperation, forwards Opcode::<name>Opcode plus the four stack
// offsets to it, and gets its dump() from DEFINE_TERNARY_BYTECODE_DUMP.
// Extra arguments supplied by the opcode list are accepted and ignored.
#define DEFINE_TERNARY_BYTECODE(name, ...)                                            \
    class name : public TernaryOperation {                                            \
    public:                                                                           \
        name(ByteCodeStackOffset src0Offset,                                          \
             ByteCodeStackOffset src1Offset,                                          \
             ByteCodeStackOffset src2Offset,                                          \
             ByteCodeStackOffset dstOffset)                                           \
            : TernaryOperation(Opcode::name##Opcode, src0Offset, src1Offset,          \
                               src2Offset, dstOffset)                                 \
        {                                                                             \
        }                                                                             \
        DEFINE_TERNARY_BYTECODE_DUMP(name)                                            \
    };


// Generate one bytecode class per opcode by expanding each opcode list with
// the class-defining macro that matches its operand count.
FOR_EACH_BYTECODE_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_UNARY_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_UNARY_OP_2(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
// Relaxed-SIMD binary opcodes use the same class generator as the other binary opcodes.
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(DEFINE_BINARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(DEFINE_UNARY_BYTECODE)
// Ternary opcodes get classes derived from TernaryOperation.
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(DEFINE_TERNARY_BYTECODE)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(DEFINE_TERNARY_BYTECODE)

#define DEFINE_MOVE_BYTECODE(name) \
class name : public ByteCodeOffset2 { \
Expand Down Expand Up @@ -1910,31 +2004,19 @@ FOR_EACH_BYTECODE_ATOMIC_RMW_CMPXCHG_OP(DEFINE_RMW_CMPXCHG_BYTECODE)
#undef DEFINE_RMW_BYTECODE

// FOR_EACH_BYTECODE_SIMD_ETC_OP
class V128BitSelect : public ByteCode {
class V128BitSelect : public ByteCodeOffset4 {
public:
V128BitSelect(ByteCodeStackOffset lhs, ByteCodeStackOffset rhs, ByteCodeStackOffset c, ByteCodeStackOffset dst)
: ByteCode(Opcode::V128BitSelectOpcode)
, m_srcOffsets{ lhs, rhs, c }
, m_dstOffset(dst)
: ByteCodeOffset4(Opcode::V128BitSelectOpcode, lhs, rhs, c, dst)
{
}

const ByteCodeStackOffset* srcOffsets() const
{
return m_srcOffsets;
}
ByteCodeStackOffset dstOffset() const { return m_dstOffset; }

#if !defined(NDEBUG)
void dump(size_t pos)
{
printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_srcOffsets[0], (uint32_t)m_srcOffsets[1], (uint32_t)m_srcOffsets[2], (uint32_t)m_dstOffset);
printf("v128.bitselect lhs: %" PRIu32 " rhs: %" PRIu32 " c: %" PRIu32 " dst: %" PRIu32, (uint32_t)m_stackOffsets[0], (uint32_t)m_stackOffsets[1], (uint32_t)m_stackOffsets[2], (uint32_t)m_stackOffsets[3]);
}
#endif

protected:
ByteCodeStackOffset m_srcOffsets[3];
ByteCodeStackOffset m_dstOffset;
};

class V128Load32Zero : public MemoryLoad {
Expand Down
83 changes: 71 additions & 12 deletions src/interpreter/Interpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,19 @@ inline static void simdSwizzleOperation(ExecutionState& state, BinaryOperation*
writeValue<Type>(bp, code->dstOffset(), result);
}

// Bitwise select over a 128-bit value handled as uint64_t lanes: every result
// bit is taken from the first source where the third source (the mask) has a
// 1 bit, and from the second source where the mask has a 0 bit.
inline static void simdBitSelectOperation(ExecutionState& state, ByteCodeOffset4* code, uint8_t* bp)
{
    using Type = typename SIMDType<uint64_t>::Type;

    auto whenSet = readValue<Type>(bp, code->src0Offset());
    auto whenClear = readValue<Type>(bp, code->src1Offset());
    auto mask = readValue<Type>(bp, code->src2Offset());

    Type selected;
    for (uint8_t lane = 0; lane < Type::Lanes; lane++) {
        auto fromFirst = whenSet[lane] & mask[lane];
        auto fromSecond = whenClear[lane] & ~mask[lane];
        selected[lane] = fromFirst | fromSecond;
    }

    writeValue<Type>(bp, code->dstOffset(), selected);
}

// FIXME optimize this function
template <typename P, typename R, bool Low>
inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
Expand All @@ -286,10 +299,11 @@ inline static void simdExtmulOperation(ExecutionState& state, BinaryOperation* c
writeValue<ResultType>(bp, code->dstOffset(), result);
}

template <typename P, typename R>
inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
{
using ParamType = typename SIMDType<int16_t>::Type;
using ResultType = typename SIMDType<uint32_t>::Type;
using ParamType = typename SIMDType<P>::Type;
using ResultType = typename SIMDType<R>::Type;
auto lhs = readValue<ParamType>(bp, code->srcOffset()[0]);
auto rhs = readValue<ParamType>(bp, code->srcOffset()[1]);
ResultType result;
Expand All @@ -302,6 +316,26 @@ inline static void simdDotOperation(ExecutionState& state, BinaryOperation* code
writeValue<ResultType>(bp, code->dstOffset(), result);
}

// Dot product with accumulation: reads two int8_t-lane vectors and one
// int32_t-lane accumulator. For each result lane, the four corresponding
// int8_t pairs are multiplied as int16_t, the products are summed pairwise
// with each pair sum narrowed back to int16_t, the two pair sums are added as
// int32_t, and the accumulator lane is then added through add().
inline static void simdDotAddOperation(ExecutionState& state, TernaryOperation* code, uint8_t* bp)
{
    using ParamType = typename SIMDType<int8_t>::Type;
    using ResultType = typename SIMDType<int32_t>::Type;

    auto lhs = readValue<ParamType>(bp, code->src0Offset());
    auto rhs = readValue<ParamType>(bp, code->src1Offset());
    auto accumulator = readValue<ResultType>(bp, code->src2Offset());

    ResultType output;
    for (uint8_t lane = 0; lane < ResultType::Lanes; lane++) {
        // Each result lane consumes four consecutive input lanes.
        uint8_t base = lane * 4;

        int16_t products[4];
        for (uint8_t k = 0; k < 4; k++) {
            products[k] = static_cast<int16_t>(lhs[base + k]) * static_cast<int16_t>(rhs[base + k]);
        }

        int16_t firstPair = static_cast<int16_t>(products[0] + products[1]);
        int16_t secondPair = static_cast<int16_t>(products[2] + products[3]);
        int32_t dot = firstPair + secondPair;

        output[lane] = add(state, dot, accumulator[lane]);
    }

    writeValue<ResultType>(bp, code->dstOffset(), output);
}

template <typename P, typename R>
inline static void simdNarrowOperation(ExecutionState& state, BinaryOperation* code, uint8_t* bp)
{
Expand Down Expand Up @@ -582,6 +616,35 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
NEXT_INSTRUCTION(); \
}

// Interpreter case for a lane-wise ternary SIMD opcode. Reads three vectors
// with paramType lanes, calls op(state, a, b, c) once per lane, writes the
// vector with resultType lanes to the destination offset, then advances past
// the `name` bytecode. Parameter and result vectors must have equal lane counts.
#define SIMD_TERNARY_OPERATION(name, op, paramType, resultType)                  \
    DEFINE_OPCODE(name)                                                          \
        :                                                                        \
    {                                                                            \
        using ParamType = typename SIMDType<paramType>::Type;                    \
        using ResultType = typename SIMDType<resultType>::Type;                  \
        COMPILE_ASSERT(ParamType::Lanes == ResultType::Lanes, "");               \
        name* code = (name*)programCounter;                                      \
        auto first = readValue<ParamType>(bp, code->src0Offset());               \
        auto second = readValue<ParamType>(bp, code->src1Offset());              \
        auto third = readValue<ParamType>(bp, code->src2Offset());               \
        ResultType lanesOut;                                                     \
        for (uint8_t lane = 0; lane < ParamType::Lanes; lane++) {                \
            lanesOut[lane] = op(state, first[lane], second[lane], third[lane]);  \
        }                                                                        \
        writeValue<ResultType>(bp, code->dstOffset(), lanesOut);                 \
        ADD_PROGRAM_COUNTER(name);                                               \
        NEXT_INSTRUCTION();                                                      \
    }

// Interpreter case for a ternary SIMD opcode implemented by a whole-vector
// handler: calls op(state, TernaryOperation*, bp) on the current instruction
// and then moves on to the next one.
// The program counter is advanced by the same class the instruction is
// decoded as (TernaryOperation), so the step always matches the bytecode that
// was actually consumed.
#define SIMD_TERNARY_OTHER_OPERATION(name, op)            \
    DEFINE_OPCODE(name)                                   \
        :                                                 \
    {                                                     \
        op(state, (TernaryOperation*)programCounter, bp); \
        ADD_PROGRAM_COUNTER(TernaryOperation);            \
        NEXT_INSTRUCTION();                               \
    }

#define MEMORY_LOAD_OPERATION(opcodeName, readType, writeType) \
DEFINE_OPCODE(opcodeName) \
: \
Expand Down Expand Up @@ -880,9 +943,14 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
FOR_EACH_BYTECODE_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
FOR_EACH_BYTECODE_SIMD_BINARY_SHIFT_OP(SIMD_BINARY_SHIFT_OPERATION)
FOR_EACH_BYTECODE_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OP(SIMD_BINARY_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_BINARY_OTHER(SIMD_BINARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_OP(SIMD_UNARY_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_CONVERT_OP(SIMD_UNARY_CONVERT_OPERATION)
FOR_EACH_BYTECODE_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_UNARY_OTHER(SIMD_UNARY_OTHER_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OP(SIMD_TERNARY_OPERATION)
FOR_EACH_BYTECODE_RELAXED_SIMD_TERNARY_OTHER(SIMD_TERNARY_OTHER_OPERATION)

DEFINE_OPCODE(Jump)
:
Expand Down Expand Up @@ -1088,16 +1156,7 @@ ByteCodeStackOffset* Interpreter::interpret(ExecutionState& state,
DEFINE_OPCODE(V128BitSelect)
:
{
using Type = typename SIMDType<uint64_t>::Type;
V128BitSelect* code = (V128BitSelect*)programCounter;
auto lhs = readValue<Type>(bp, code->srcOffsets()[0]);
auto rhs = readValue<Type>(bp, code->srcOffsets()[1]);
auto c = readValue<Type>(bp, code->srcOffsets()[2]);
Type result;
for (uint8_t i = 0; i < Type::Lanes; i++) {
result[i] = (lhs[i] & c[i]) | (rhs[i] & ~c[i]);
}
writeValue<Type>(bp, code->dstOffset(), result);
simdBitSelectOperation(state, (ByteCodeOffset4*)programCounter, bp);
ADD_PROGRAM_COUNTER(V128BitSelect);
NEXT_INSTRUCTION();
}
Expand Down
Loading
Loading