Skip to content

Commit

Permalink
[simd/jit]: Implement v128 shifting instructions (#109 from haoyu-zc/…
Browse files Browse the repository at this point in the history
…jit-shift)
  • Loading branch information
titzer authored Aug 9, 2023
2 parents 1bbad2d + 72156e8 commit 490d655
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 24 deletions.
28 changes: 7 additions & 21 deletions src/engine/x86-64/X86_64Interpreter.v3
Original file line number Diff line number Diff line change
Expand Up @@ -2400,8 +2400,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {

for (t in [
(Opcode.I8X16_SHL, masm.emit_i8x16_shl),
(Opcode.I8X16_SHR_S, masm.emit_i8x16_shrs),
(Opcode.I8X16_SHR_U, masm.emit_i8x16_shru)
(Opcode.I8X16_SHR_S, masm.emit_i8x16_shr_s),
(Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u),
(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
]) {
bindHandler(t.0);
load_v128_xmm0_tmp0();
Expand All @@ -2412,32 +2413,17 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
}
for (t in [
(Opcode.I16X8_SHL, asm.psllw_s_s, 4),
(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
(Opcode.I16X8_SHR_S, asm.psraw_s_s, 4),
(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
(Opcode.I16X8_SHR_U, asm.psrlw_s_s, 4),
(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
(Opcode.I32X4_SHR_U, asm.psrld_s_s, 5),
(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
(Opcode.I64X2_SHR_U, asm.psrlq_s_s, 6)
]) {
bindHandler(t.0);
load_v128_xmm0_tmp0();
var width = byte.view(t.2);
var mask = (1 << width) - 1;
asm.movq_r_r(r_tmp1, r_tmp0);
asm.and_r_i(r_tmp1, mask);
asm.movq_s_r(r_xmm1, r_tmp1);
t.1(r_xmm0, r_xmm1);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
]) {
bindHandler(t.0);
load_v128_xmm0_tmp0();
t.1(r_xmm0, r_tmp0, r_xmm1, r_xmm2, r_tmp1);
masm.emit_v128_shift(r_xmm0, r_tmp0, byte.view(t.2), r_tmp1, r_xmm1, t.1);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
Expand Down
14 changes: 11 additions & 3 deletions src/engine/x86-64/X86_64MacroAssembler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.movd_s_r(tmp3, tmp1);
asm.psllw_s_s(dst, tmp3);
}
def emit_i8x16_shrs(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
def emit_i8x16_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
// Unpack the bytes into words, do arithmetic shifts, and repack.
asm.punpckhbw_s_s(tmp2, dst);
asm.punpcklbw_s_s(dst, dst);
Expand All @@ -971,7 +971,7 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.psraw_s_s(dst, tmp3);
asm.packsswb_s_s(dst, tmp2);
}
def emit_i8x16_shru(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
def emit_i8x16_shr_u(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
// Unpack the bytes into words, do logical shifts, and repack.
asm.punpckhbw_s_s(tmp2, dst);
asm.punpcklbw_s_s(dst, dst);
Expand All @@ -985,13 +985,21 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.psrlw_s_s(dst, tmp3);
asm.packuswb_s_s(dst, tmp2);
}
// Emits a v128 shift whose count is supplied in a general-purpose register.
// The count in {shift} is copied to {gtmp}, reduced to its low {width} bits
// (i.e. count mod 2^width), moved into {xtmp}, and finally the supplied
// packed-shift emitter {asm_pshfit_s_s} is invoked as (dst, xtmp).
// {gtmp} and {xtmp} are overwritten; {shift} is only read (assuming it does
// not alias {gtmp}).
def emit_v128_shift<T>(dst: X86_64Xmmr, shift: X86_64Gpr, width: byte, gtmp: X86_64Gpr, xtmp: X86_64Xmmr,
		asm_pshfit_s_s: (X86_64Xmmr, X86_64Xmmr) -> T) {
	// Operate on a copy of the count rather than on the caller's register.
	asm.movq_r_r(gtmp, shift);
	// Keep only the low {width} bits of the count.
	var count_mask = (1 << width) - 1;
	asm.and_r_i(gtmp, count_mask);
	// The packed-shift emitter takes its count from an XMM register.
	asm.movq_s_r(xtmp, gtmp);
	asm_pshfit_s_s(dst, xtmp);
}
// Emits code computing the lane-wise absolute value of the two 64-bit lanes in {dst}.
// {scratch} is overwritten with a per-lane sign mask; the result is left in {dst}.
// Uses the identity abs(x) = (x ^ s) - s, where s is all ones if x is negative, else zero.
def emit_i64x2_abs(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
// Duplicate the odd (upper) 32-bit element of each 64-bit lane into both halves of that lane.
asm.movshdup_s_s(scratch, dst);
// Arithmetic shift by 31 smears the sign bit: each lane becomes all ones or all zeros.
asm.psrad_i(scratch, 31);
// Conditionally complement the lanes ...
asm.xorps_s_s(dst, scratch);
// ... and subtract the mask (adds 1 to negative lanes), completing the negation.
asm.psubq_s_s(dst, scratch);
}
def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr, tmp_shift: X86_64Gpr) {
def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) {
asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp);
asm.psllq_i(xmm_tmp, 63);
// shift modulo 64
Expand Down
31 changes: 31 additions & 0 deletions src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0);
}

// i8x16 shifts are delegated to macro-assembler routines through visit_V128_SHFIT1,
// which supplies one general-purpose and two vector temporaries.
def visit_I8X16_SHL() { visit_V128_SHFIT1(mmasm.emit_i8x16_shl); }
def visit_I8X16_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_s); }
def visit_I8X16_SHR_U() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_u); }
// i8x16 add/sub map directly onto the packed-byte add/subtract instructions.
def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); }
def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); }
// i8x16 negate goes through a macro-assembler helper (presumably needing a vector temp, per the helper name).
def visit_I8X16_NEG() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i8x16_neg); }
Expand Down Expand Up @@ -488,6 +491,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// Narrowing i16x8 -> i8x16 maps onto the pack instructions:
// packsswb packs with signed saturation, packuswb with unsigned saturation.
def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); }
def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); }

// i16x8 shifts: the first argument (4) is the number of low bits of the shift count
// that are kept, i.e. the count is taken modulo 16 before the packed word shift.
def visit_I16X8_SHL() { visit_V128_SHFIT2(4, asm.psllw_s_s); }
def visit_I16X8_SHR_S() {visit_V128_SHFIT2(4, asm.psraw_s_s); }
def visit_I16X8_SHR_U() { visit_V128_SHFIT2(4, asm.psrlw_s_s); }
// i16x8 add/sub/mul map directly onto the packed-word instructions.
def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
def visit_I16X8_MUL() { do_op2_x_x(ValueKind.V128, asm.pmullw_s_s); }
Expand Down Expand Up @@ -526,6 +532,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// Widening of the high half of an i8x16 vector to i16x8, signed and unsigned,
// via macro-assembler helpers (the unsigned form goes through the *_xtmp variant of do_op1).
def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); }
def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); }

// i32x4 shifts: the first argument (5) is the number of low bits of the shift count
// that are kept, i.e. the count is taken modulo 32 before the packed doubleword shift.
def visit_I32X4_SHL() { visit_V128_SHFIT2(5, asm.pslld_s_s); }
def visit_I32X4_SHR_S() { visit_V128_SHFIT2(5, asm.psrad_s_s); }
def visit_I32X4_SHR_U() { visit_V128_SHFIT2(5, asm.psrld_s_s); }
// i32x4 add/sub/mul map directly onto the packed-doubleword instructions.
def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); }
def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); }
def visit_I32X4_MUL() { do_op2_x_x(ValueKind.V128, asm.pmulld_s_s); }
Expand Down Expand Up @@ -561,6 +570,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// Widening of the high half of an i16x8 vector to i32x4, signed and unsigned,
// via macro-assembler helpers (the unsigned form goes through the *_xtmp variant of do_op1).
def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); }
def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); }

// i64x2 shifts: SHL and SHR_U keep the low 6 bits of the count (count modulo 64) and use
// the packed quadword shifts. SHR_S instead goes through a macro-assembler routine
// (presumably because there is no packed 64-bit arithmetic right shift to call directly).
def visit_I64X2_SHL() { visit_V128_SHFIT2(6, asm.psllq_s_s); }
def visit_I64X2_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i64x2_shr_s); }
def visit_I64X2_SHR_U() { visit_V128_SHFIT2(6, asm.psrlq_s_s); }
// i64x2 add/sub map directly onto packed-quadword instructions.
def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); }
def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); }
// i64x2 mul uses a macro-assembler routine, partially applied with two freshly allocated vector temporaries.
def visit_I64X2_MUL() { do_op2_x_x(ValueKind.V128, mmasm.emit_i64x2_mul(_, _, X(allocTmp(ValueKind.V128)), X(allocTmp(ValueKind.V128)))); }
Expand Down Expand Up @@ -669,6 +681,25 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(sv.kindFlagsMatching(ValueKind.V128, IN_REG), sv.reg, 0);
}

// Compiles a v128 shift that is implemented by a macro-assembler routine taking
// (dst, shift, gpr-tmp, xmm-tmp, xmm-tmp). The shift count is popped first, then
// the vector operand, whose register receives the result and is pushed back as a V128.
// NOTE(review): "SHFIT" looks like a typo for "SHIFT"; renaming requires updating every caller.
private def visit_V128_SHFIT1<T>(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) {
	var count = popReg();           // top of stack: shift count
	var vec = popRegToOverwrite();  // below it: vector operand, reused as destination
	// Scratch registers required by the macro-assembler routine.
	var scratch_gpr = G(allocTmp(ValueKind.I64));
	var scratch_x0 = X(allocTmp(ValueKind.V128));
	var scratch_x1 = X(allocTmp(ValueKind.V128));
	masm_shift(X(vec.reg), G(count.reg), scratch_gpr, scratch_x0, scratch_x1);
	state.push(vec.kindFlagsMatching(ValueKind.V128, IN_REG), vec.reg, 0);
}

// Compiles a v128 shift that maps onto a single packed-shift instruction {asm_shift}.
// {width} is the number of low bits of the shift count that are kept; the masking and
// the move of the count into a vector register are done by mmasm.emit_v128_shift.
// The shift count is popped first, then the vector operand, whose register receives
// the result and is pushed back as a V128.
// NOTE(review): "SHFIT" looks like a typo for "SHIFT"; renaming requires updating every caller.
private def visit_V128_SHFIT2<T>(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) {
	var count = popReg();           // top of stack: shift count
	var vec = popRegToOverwrite();  // below it: vector operand, reused as destination
	// One general-purpose and one vector scratch register for the masked count.
	var scratch_gpr = G(allocTmp(ValueKind.I64));
	var scratch_xmm = X(allocTmp(ValueKind.V128));
	mmasm.emit_v128_shift(X(vec.reg), G(count.reg), width, scratch_gpr, scratch_xmm, asm_shift);
	state.push(vec.kindFlagsMatching(ValueKind.V128, IN_REG), vec.reg, 0);
}

// r1 = op(r1)
private def do_op1_r<T>(kind: ValueKind, emit: (X86_64Gpr -> T)) -> bool {
var sv = popRegToOverwrite(), r = G(sv.reg);
Expand Down

0 comments on commit 490d655

Please sign in to comment.