Skip to content

Commit

Permalink
[simd/jit]: Implement more v128 shifting instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
haoyu-zc committed Aug 8, 2023
1 parent 5c4c4c2 commit 72156e8
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 19 deletions.
19 changes: 5 additions & 14 deletions src/engine/x86-64/X86_64Interpreter.v3
Original file line number Diff line number Diff line change
Expand Up @@ -2401,7 +2401,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
for (t in [
(Opcode.I8X16_SHL, masm.emit_i8x16_shl),
(Opcode.I8X16_SHR_S, masm.emit_i8x16_shr_s),
(Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u)
(Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u),
(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
]) {
bindHandler(t.0);
load_v128_xmm0_tmp0();
Expand All @@ -2412,12 +2413,12 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
}
for (t in [
(Opcode.I16X8_SHL, asm.psllw_s_s, 4),
(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
(Opcode.I16X8_SHR_S, asm.psraw_s_s, 4),
(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
(Opcode.I16X8_SHR_U, asm.psrlw_s_s, 4),
(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
(Opcode.I32X4_SHR_U, asm.psrld_s_s, 5),
(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
(Opcode.I64X2_SHR_U, asm.psrlq_s_s, 6)
]) {
bindHandler(t.0);
Expand All @@ -2427,16 +2428,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
]) {
bindHandler(t.0);
load_v128_xmm0_tmp0();
t.1(r_xmm0, r_tmp0, r_xmm1, r_xmm2, r_tmp1);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
}

// V128 lane-wise general binary operations
genSimdBinop(Opcode.I8X16_ADD, asm.paddb_s_s);
Expand Down
2 changes: 1 addition & 1 deletion src/engine/x86-64/X86_64MacroAssembler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -999,7 +999,7 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.xorps_s_s(dst, scratch);
asm.psubq_s_s(dst, scratch);
}
def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr, tmp_shift: X86_64Gpr) {
def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) {
asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp);
asm.psllq_i(xmm_tmp, 63);
// shift modulo 64
Expand Down
26 changes: 22 additions & 4 deletions src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -461,9 +461,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0);
}

def visit_I8X16_SHL() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shl); }
def visit_I8X16_SHR_S() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shr_s); }
def visit_I8X16_SHR_U() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shr_u); }
// i8x16 shifts: each visitor hands the matching macro-assembler emitter to visit_V128_SHFIT1.
def visit_I8X16_SHL() { visit_V128_SHFIT1(mmasm.emit_i8x16_shl); }
def visit_I8X16_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_s); }
def visit_I8X16_SHR_U() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_u); }
def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); }
def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); }
def visit_I8X16_NEG() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i8x16_neg); }
Expand Down Expand Up @@ -491,6 +491,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); }
def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); }

// i16x8 shifts: each visitor calls visit_V128_SHFIT2 with width 4 and the packed-word shift instruction.
def visit_I16X8_SHL() { visit_V128_SHFIT2(4, asm.psllw_s_s); }
def visit_I16X8_SHR_S() { visit_V128_SHFIT2(4, asm.psraw_s_s); }
def visit_I16X8_SHR_U() { visit_V128_SHFIT2(4, asm.psrlw_s_s); }
def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
def visit_I16X8_MUL() { do_op2_x_x(ValueKind.V128, asm.pmullw_s_s); }
Expand Down Expand Up @@ -529,6 +532,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); }
def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); }

// i32x4 shifts: each visitor calls visit_V128_SHFIT2 with width 5 and the packed-doubleword shift instruction.
def visit_I32X4_SHL() { visit_V128_SHFIT2(5, asm.pslld_s_s); }
def visit_I32X4_SHR_S() { visit_V128_SHFIT2(5, asm.psrad_s_s); }
def visit_I32X4_SHR_U() { visit_V128_SHFIT2(5, asm.psrld_s_s); }
def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); }
def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); }
def visit_I32X4_MUL() { do_op2_x_x(ValueKind.V128, asm.pmulld_s_s); }
Expand Down Expand Up @@ -564,6 +570,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); }
def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); }

// i64x2 shifts: SHL and SHR_U call visit_V128_SHFIT2 with width 6 and the packed-quadword
// shift instruction; SHR_S instead goes through visit_V128_SHFIT1 with a macro-assembler emitter.
// NOTE(review): presumably because SSE has no packed 64-bit arithmetic right shift -- confirm.
def visit_I64X2_SHL() { visit_V128_SHFIT2(6, asm.psllq_s_s); }
def visit_I64X2_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i64x2_shr_s); }
def visit_I64X2_SHR_U() { visit_V128_SHFIT2(6, asm.psrlq_s_s); }
def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); }
def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); }
def visit_I64X2_MUL() { do_op2_x_x(ValueKind.V128, mmasm.emit_i64x2_mul(_, _, X(allocTmp(ValueKind.V128)), X(allocTmp(ValueKind.V128)))); }
Expand Down Expand Up @@ -672,7 +681,7 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(sv.kindFlagsMatching(ValueKind.V128, IN_REG), sv.reg, 0);
}

private def visit_I8X16_SHFIT<T>(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) {
private def visit_V128_SHFIT1<T>(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) {
var b = popReg();
var a = popRegToOverwrite();
var gtmp = G(allocTmp(ValueKind.I64));
Expand All @@ -682,6 +691,15 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

// Shared code generation for v128 shifts that are emitted via mmasm.emit_v128_shift
// around a single xmm-by-xmm shift instruction {asm_shift}.
// Call sites in this class pass {width} = 4, 5 and 6 for the 16-, 32- and 64-bit lane shifts.
// NOTE(review): {width} is presumably the number of low shift-count bits that are kept
// (log2 of the lane size in bits) -- confirm against emit_v128_shift.
private def visit_V128_SHFIT2<T>(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) {
	// Popped first (top of the value stack); handed to the emitter as a general-purpose register.
	var count = popReg();
	// Popped second; its register is overwritten in place and reused for the result.
	var vec = popRegToOverwrite();
	// Scratch registers for the emitter: one general-purpose, one xmm.
	var scratchGpr = G(allocTmp(ValueKind.I64));
	var scratchXmm = X(allocTmp(ValueKind.V128));
	mmasm.emit_v128_shift(X(vec.reg), G(count.reg), width, scratchGpr, scratchXmm, asm_shift);
	// Push the result back as a V128 value living in the same register.
	state.push(vec.kindFlagsMatching(ValueKind.V128, IN_REG), vec.reg, 0);
}

// r1 = op(r1)
private def do_op1_r<T>(kind: ValueKind, emit: (X86_64Gpr -> T)) -> bool {
var sv = popRegToOverwrite(), r = G(sv.reg);
Expand Down

0 comments on commit 72156e8

Please sign in to comment.