diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
index b0692d11..5dbe093d 100644
--- a/src/engine/x86-64/X86_64Interpreter.v3
+++ b/src/engine/x86-64/X86_64Interpreter.v3
@@ -2400,8 +2400,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 
 		for (t in [
 			(Opcode.I8X16_SHL, masm.emit_i8x16_shl),
-			(Opcode.I8X16_SHR_S, masm.emit_i8x16_shrs),
-			(Opcode.I8X16_SHR_U, masm.emit_i8x16_shru)
+			(Opcode.I8X16_SHR_S, masm.emit_i8x16_shr_s),
+			(Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u),
+			(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
 		]) {
 			bindHandler(t.0);
 			load_v128_xmm0_tmp0();
@@ -2412,32 +2413,17 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		}
 		for (t in [
 			(Opcode.I16X8_SHL, asm.psllw_s_s, 4),
-			(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
-			(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
 			(Opcode.I16X8_SHR_S, asm.psraw_s_s, 4),
-			(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
 			(Opcode.I16X8_SHR_U, asm.psrlw_s_s, 4),
+			(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
+			(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
 			(Opcode.I32X4_SHR_U, asm.psrld_s_s, 5),
+			(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
 			(Opcode.I64X2_SHR_U, asm.psrlq_s_s, 6)
 		]) {
 			bindHandler(t.0);
 			load_v128_xmm0_tmp0();
-			var width = byte.view(t.2);
-			var mask = (1 << width) - 1;
-			asm.movq_r_r(r_tmp1, r_tmp0);
-			asm.and_r_i(r_tmp1, mask);
-			asm.movq_s_r(r_xmm1, r_tmp1);
-			t.1(r_xmm0, r_xmm1);
-			asm.movdqu_m_s(vsph[-2].value, r_xmm0);
-			decrementVsp();
-			endHandler();
-		}
-		for (t in [
-			(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
-		]) {
-			bindHandler(t.0);
-			load_v128_xmm0_tmp0();
-			t.1(r_xmm0, r_tmp0, r_xmm1, r_xmm2, r_tmp1);
+			masm.emit_v128_shift(r_xmm0, r_tmp0, byte.view(t.2), r_tmp1, r_xmm1, t.1);
 			asm.movdqu_m_s(vsph[-2].value, r_xmm0);
 			decrementVsp();
 			endHandler();
diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3
index d6a928be..26cb18fa 100644
--- a/src/engine/x86-64/X86_64MacroAssembler.v3
+++ b/src/engine/x86-64/X86_64MacroAssembler.v3
@@ -957,7 +957,7 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.movd_s_r(tmp3, tmp1);
 		asm.psllw_s_s(dst, tmp3);
 	}
-	def emit_i8x16_shrs(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
+	def emit_i8x16_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
 		// Unpack the bytes into words, do arithmetic shifts, and repack.
 		asm.punpckhbw_s_s(tmp2, dst);
 		asm.punpcklbw_s_s(dst, dst);
@@ -971,7 +971,7 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.psraw_s_s(dst, tmp3);
 		asm.packsswb_s_s(dst, tmp2);
 	}
-	def emit_i8x16_shru(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
+	def emit_i8x16_shr_u(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
 		// Unpack the bytes into words, do arithmetic shifts, and repack.
 		asm.punpckhbw_s_s(tmp2, dst);
 		asm.punpcklbw_s_s(dst, dst);
@@ -985,13 +985,21 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.psrlw_s_s(dst, tmp3);
 		asm.packuswb_s_s(dst, tmp2);
 	}
+	def emit_v128_shift(dst: X86_64Xmmr, shift: X86_64Gpr, width: byte, gtmp: X86_64Gpr, xtmp: X86_64Xmmr,
+			asm_pshift_s_s: (X86_64Xmmr, X86_64Xmmr) -> T) {
+		var mask = (1 << width) - 1;
+		asm.movq_r_r(gtmp, shift);
+		asm.and_r_i(gtmp, mask);
+		asm.movq_s_r(xtmp, gtmp);
+		asm_pshift_s_s(dst, xtmp);
+	}
 	def emit_i64x2_abs(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
 		asm.movshdup_s_s(scratch, dst);
 		asm.psrad_i(scratch, 31);
 		asm.xorps_s_s(dst, scratch);
 		asm.psubq_s_s(dst, scratch);
 	}
-	def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr, tmp_shift: X86_64Gpr) {
+	def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) {
 		asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp);
 		asm.psllq_i(xmm_tmp, 63);
 		// shift modulo 64
diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3
index f6d7567d..e81da438 100644
--- a/src/engine/x86-64/X86_64SinglePassCompiler.v3
+++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3
@@ -461,6 +461,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 		state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0);
 	}
 
+	def visit_I8X16_SHL() { visit_V128_SHIFT1(mmasm.emit_i8x16_shl); }
+	def visit_I8X16_SHR_S() { visit_V128_SHIFT1(mmasm.emit_i8x16_shr_s); }
+	def visit_I8X16_SHR_U() { visit_V128_SHIFT1(mmasm.emit_i8x16_shr_u); }
 	def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); }
 	def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); }
 	def visit_I8X16_NEG() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i8x16_neg); }
@@ -488,6 +491,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); }
 	def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); }
 
+	def visit_I16X8_SHL() { visit_V128_SHIFT2(4, asm.psllw_s_s); }
+	def visit_I16X8_SHR_S() { visit_V128_SHIFT2(4, asm.psraw_s_s); }
+	def visit_I16X8_SHR_U() { visit_V128_SHIFT2(4, asm.psrlw_s_s); }
 	def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
 	def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
 	def visit_I16X8_MUL() { do_op2_x_x(ValueKind.V128, asm.pmullw_s_s); }
@@ -526,6 +532,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); }
 	def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); }
 
+	def visit_I32X4_SHL() { visit_V128_SHIFT2(5, asm.pslld_s_s); }
+	def visit_I32X4_SHR_S() { visit_V128_SHIFT2(5, asm.psrad_s_s); }
+	def visit_I32X4_SHR_U() { visit_V128_SHIFT2(5, asm.psrld_s_s); }
 	def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); }
 	def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); }
 	def visit_I32X4_MUL() { do_op2_x_x(ValueKind.V128, asm.pmulld_s_s); }
@@ -561,6 +570,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); }
 	def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); }
 
+	def visit_I64X2_SHL() { visit_V128_SHIFT2(6, asm.psllq_s_s); }
+	def visit_I64X2_SHR_S() { visit_V128_SHIFT1(mmasm.emit_i64x2_shr_s); }
+	def visit_I64X2_SHR_U() { visit_V128_SHIFT2(6, asm.psrlq_s_s); }
 	def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); }
 	def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); }
 	def visit_I64X2_MUL() { do_op2_x_x(ValueKind.V128, mmasm.emit_i64x2_mul(_, _, X(allocTmp(ValueKind.V128)), X(allocTmp(ValueKind.V128)))); }
@@ -669,6 +681,25 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 		state.push(sv.kindFlagsMatching(ValueKind.V128, IN_REG), sv.reg, 0);
 	}
 
+	private def visit_V128_SHIFT1(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) {
+		var b = popReg();
+		var a = popRegToOverwrite();
+		var gtmp = G(allocTmp(ValueKind.I64));
+		var xtmp0 = X(allocTmp(ValueKind.V128));
+		var xtmp1 = X(allocTmp(ValueKind.V128));
+		masm_shift(X(a.reg), G(b.reg), gtmp, xtmp0, xtmp1);
+		state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
+	}
+
+	private def visit_V128_SHIFT2(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) {
+		var b = popReg();
+		var a = popRegToOverwrite();
+		var gtmp = G(allocTmp(ValueKind.I64));
+		var xtmp = X(allocTmp(ValueKind.V128));
+		mmasm.emit_v128_shift(X(a.reg), G(b.reg), width, gtmp, xtmp, asm_shift);
+		state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
+	}
+
 	// r1 = op(r1)
 	private def do_op1_r(kind: ValueKind, emit: (X86_64Gpr -> T)) -> bool {
 		var sv = popRegToOverwrite(), r = G(sv.reg);
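
Background note (supplementary, not part of the patch): emit_v128_shift masks the incoming shift count because Wasm SIMD shifts take the count modulo the lane width, while the x86 packed-shift instructions (PSLLW, PSRLW, PSRAW, PSLLD, and friends) zero or sign-fill the whole lane once the count reaches the lane width. The width argument is log2 of the lane width in bits (4 for i16x8, 5 for i32x4, 6 for i64x2), so (1 << width) - 1 is exactly the modulo mask. I64X2_SHR_S cannot go through this path because SSE2 has no packed 64-bit arithmetic right shift (VPSRAQ only appears with AVX-512); the dedicated emit_i64x2_shr_s helper instead materializes the 0x8000000000000000 sign-bit constant via pcmpeqd followed by psllq 63, presumably to restore the sign bits after a logical shift. The standalone C/SSE2 sketch below (illustrative names, not engine code) shows the masking step for the i16x8 case.

/* C/SSE2 analogue of emit_v128_shift for i16x8.shl; i16x8_shl_sketch and its
 * parameters are assumptions for illustration, not part of the engine. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i i16x8_shl_sketch(__m128i v, uint32_t count) {
	uint32_t masked = count & ((1u << 4) - 1);        /* count modulo the 16-bit lane width */
	__m128i xcount = _mm_cvtsi32_si128((int)masked);  /* packed shifts take the count from an XMM register */
	return _mm_sll_epi16(v, xcount);                  /* shift all eight lanes by the masked count */
}

int main(void) {
	int16_t out[8];
	__m128i r = i16x8_shl_sketch(_mm_set1_epi16(1), 17); /* 17 mod 16 == 1 */
	_mm_storeu_si128((__m128i*)out, r);
	printf("%d\n", out[0]); /* prints 2; without the mask PSLLW sees 17 and every lane becomes 0 */
	return 0;
}

Built with a plain cc -msse2, this prints 2, matching the Wasm semantics; dropping the mask reproduces exactly the divergence the helper exists to prevent.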