From d59b68c501b8343b6984d0199f74d250b8a988b3 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Mon, 7 Aug 2023 18:40:14 -0400 Subject: [PATCH] [simd/jit]: Implement more v128 shifting instructions --- src/engine/x86-64/X86_64Interpreter.v3 | 19 ++++---------- src/engine/x86-64/X86_64MacroAssembler.v3 | 2 +- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 26 ++++++++++++++++--- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 1855cbf3..15223274 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -2389,7 +2389,8 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { for (t in [ (Opcode.I8X16_SHL, masm.emit_i8x16_shl), (Opcode.I8X16_SHR_S, masm.emit_i8x16_shr_s), - (Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u) + (Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u), + (Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s) ]) { bindHandler(t.0); load_v128_xmm0_tmp0(); @@ -2400,12 +2401,12 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { } for (t in [ (Opcode.I16X8_SHL, asm.psllw_s_s, 4), - (Opcode.I32X4_SHL, asm.pslld_s_s, 5), - (Opcode.I64X2_SHL, asm.psllq_s_s, 6), (Opcode.I16X8_SHR_S, asm.psraw_s_s, 4), - (Opcode.I32X4_SHR_S, asm.psrad_s_s, 5), (Opcode.I16X8_SHR_U, asm.psrlw_s_s, 4), + (Opcode.I32X4_SHL, asm.pslld_s_s, 5), + (Opcode.I32X4_SHR_S, asm.psrad_s_s, 5), (Opcode.I32X4_SHR_U, asm.psrld_s_s, 5), + (Opcode.I64X2_SHL, asm.psllq_s_s, 6), (Opcode.I64X2_SHR_U, asm.psrlq_s_s, 6) ]) { bindHandler(t.0); @@ -2415,16 +2416,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { decrementVsp(); endHandler(); } - for (t in [ - (Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s) - ]) { - bindHandler(t.0); - load_v128_xmm0_tmp0(); - t.1(r_xmm0, r_tmp0, r_xmm1, r_xmm2, r_tmp1); - asm.movdqu_m_s(vsph[-2].value, r_xmm0); - decrementVsp(); - endHandler(); - } // V128 lane-wise general binary operations genSimdBinop(Opcode.I8X16_ADD, asm.paddb_s_s); diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 647ab3c7..8e6b70f9 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -995,7 +995,7 @@ class X86_64MacroAssembler extends MacroAssembler { asm.xorps_s_s(dst, scratch); asm.psubq_s_s(dst, scratch); } - def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr, tmp_shift: X86_64Gpr) { + def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) { asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp); asm.psllq_i(xmm_tmp, 63); // shift modulo 64 diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index f0a98236..0767b36f 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -461,9 +461,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0); } - def visit_I8X16_SHL() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shl); } - def visit_I8X16_SHR_S() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shr_s); } - def visit_I8X16_SHR_U() { visit_I8X16_SHFIT(mmasm.emit_i8x16_shr_u); } + def visit_I8X16_SHL() { visit_V128_SHFIT1(mmasm.emit_i8x16_shl); } + def visit_I8X16_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_s); } + def visit_I8X16_SHR_U() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_u); } def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); } def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); } def visit_I8X16_NEG() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i8x16_neg); } @@ -491,6 +491,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); } def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); } + def visit_I16X8_SHL() { visit_V128_SHFIT2(4, asm.psllw_s_s); } + def visit_I16X8_SHR_S() {visit_V128_SHFIT2(4, asm.psraw_s_s); } + def visit_I16X8_SHR_U() { visit_V128_SHFIT2(4, asm.psrlw_s_s); } def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); } def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); } def visit_I16X8_MUL() { do_op2_x_x(ValueKind.V128, asm.pmullw_s_s); } @@ -529,6 +532,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); } def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); } + def visit_I32X4_SHL() { visit_V128_SHFIT2(5, asm.pslld_s_s); } + def visit_I32X4_SHR_S() { visit_V128_SHFIT2(5, asm.psrad_s_s); } + def visit_I32X4_SHR_U() { visit_V128_SHFIT2(5, asm.psrld_s_s); } def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); } def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); } def visit_I32X4_MUL() { do_op2_x_x(ValueKind.V128, asm.pmulld_s_s); } @@ -564,6 +570,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); } def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); } + def visit_I64X2_SHL() { visit_V128_SHFIT2(6, asm.psllq_s_s); } + def visit_I64X2_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i64x2_shr_s); } + def visit_I64X2_SHR_U() { visit_V128_SHFIT2(6, asm.psrlq_s_s); } def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); } def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); } def visit_I64X2_MUL() { do_op2_x_x(ValueKind.V128, mmasm.emit_i64x2_mul(_, _, X(allocTmp(ValueKind.V128)), X(allocTmp(ValueKind.V128)))); } @@ -639,7 +648,7 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0); } - private def visit_I8X16_SHFIT(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) { + private def visit_V128_SHFIT1(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) { var b = popReg(); var a = popRegToOverwrite(); var gtmp = G(allocTmp(ValueKind.I64)); @@ -649,6 +658,15 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0); } + private def visit_V128_SHFIT2(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) { + var b = popReg(); + var a = popRegToOverwrite(); + var gtmp = G(allocTmp(ValueKind.I64)); + var xtmp = X(allocTmp(ValueKind.V128)); + mmasm.emit_v128_shift(X(a.reg), G(b.reg), width, gtmp, xtmp, asm_shift); + state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0); + } + // r1 = op(r1) private def do_op1_r(kind: ValueKind, emit: (X86_64Gpr -> T)) -> bool { var sv = popRegToOverwrite(), r = G(sv.reg);