titzer · titzer · Aug 9, 2023 · Aug 7, 2023 · Aug 7, 2023 · Aug 7, 2023
diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3
@@ -2400,8 +2400,9 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 
 		for (t in [
 			(Opcode.I8X16_SHL, masm.emit_i8x16_shl),
-			(Opcode.I8X16_SHR_S, masm.emit_i8x16_shrs),
-			(Opcode.I8X16_SHR_U, masm.emit_i8x16_shru)
+			(Opcode.I8X16_SHR_S, masm.emit_i8x16_shr_s),
+			(Opcode.I8X16_SHR_U, masm.emit_i8x16_shr_u),
+			(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
 		]) {
 			bindHandler(t.0);
 			load_v128_xmm0_tmp0();
@@ -2412,32 +2413,17 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
 		}
 		for (t in [
 			(Opcode.I16X8_SHL, asm.psllw_s_s, 4),
-			(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
-			(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
 			(Opcode.I16X8_SHR_S, asm.psraw_s_s, 4),
-			(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
 			(Opcode.I16X8_SHR_U, asm.psrlw_s_s, 4),
+			(Opcode.I32X4_SHL, asm.pslld_s_s, 5),
+			(Opcode.I32X4_SHR_S, asm.psrad_s_s, 5),
 			(Opcode.I32X4_SHR_U, asm.psrld_s_s, 5),
+			(Opcode.I64X2_SHL, asm.psllq_s_s, 6),
 			(Opcode.I64X2_SHR_U, asm.psrlq_s_s, 6)
 		]) {
 			bindHandler(t.0);
 			load_v128_xmm0_tmp0();
-			var width = byte.view(t.2);
-			var mask = (1 << width) - 1;
-			asm.movq_r_r(r_tmp1, r_tmp0);
-			asm.and_r_i(r_tmp1, mask);
-			asm.movq_s_r(r_xmm1, r_tmp1);
-			t.1(r_xmm0, r_xmm1);
-			asm.movdqu_m_s(vsph[-2].value, r_xmm0);
-			decrementVsp();
-			endHandler();
-		}
-		for (t in [
-			(Opcode.I64X2_SHR_S, masm.emit_i64x2_shr_s)
-		]) {
-			bindHandler(t.0);
-			load_v128_xmm0_tmp0();
-			t.1(r_xmm0, r_tmp0, r_xmm1, r_xmm2, r_tmp1);
+			masm.emit_v128_shift(r_xmm0, r_tmp0, byte.view(t.2), r_tmp1, r_xmm1, t.1);
 			asm.movdqu_m_s(vsph[-2].value, r_xmm0);
 			decrementVsp();
 			endHandler();

diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3
@@ -957,7 +957,7 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.movd_s_r(tmp3, tmp1);
 		asm.psllw_s_s(dst, tmp3);
 	}
-	def emit_i8x16_shrs(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
+	def emit_i8x16_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
 		// Unpack the bytes into words, do arithmetic shifts, and repack.
 		asm.punpckhbw_s_s(tmp2, dst);
 		asm.punpcklbw_s_s(dst, dst);
@@ -971,7 +971,7 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.psraw_s_s(dst, tmp3);
 		asm.packsswb_s_s(dst, tmp2);
 	}
-	def emit_i8x16_shru(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
+	def emit_i8x16_shr_u(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
 		// Unpack the bytes into words, do arithmetic shifts, and repack.
 		asm.punpckhbw_s_s(tmp2, dst);
 		asm.punpcklbw_s_s(dst, dst);
@@ -985,13 +985,21 @@ class X86_64MacroAssembler extends MacroAssembler {
 		asm.psrlw_s_s(dst, tmp3);
 		asm.packuswb_s_s(dst, tmp2);
 	}
+	def emit_v128_shift<T>(dst: X86_64Xmmr, shift: X86_64Gpr, width: byte, gtmp: X86_64Gpr, xtmp: X86_64Xmmr, // shifts {dst} in place by ({shift} masked to its low {width} bits)
+			asm_pshift_s_s: (X86_64Xmmr, X86_64Xmmr) -> T) { // packed-shift emitter, invoked as (dst, count) with both in xmm registers
+		var mask = (1 << width) - 1; // keeps only the low {width} bits of the count
+		asm.movq_r_r(gtmp, shift); // work on a copy so {shift} itself is not modified
+		asm.and_r_i(gtmp, mask);
+		asm.movq_s_r(xtmp, gtmp); // the emitter takes its count in an xmm register, so move it over
+		asm_pshift_s_s(dst, xtmp);
+	}
 	def emit_i64x2_abs(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
 		asm.movshdup_s_s(scratch, dst);
 		asm.psrad_i(scratch, 31);
 		asm.xorps_s_s(dst, scratch);
 		asm.psubq_s_s(dst, scratch);
 	}
-	def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr, tmp_shift: X86_64Gpr) {
+	def emit_i64x2_shr_s(dst: X86_64Xmmr, shift: X86_64Gpr, tmp_shift: X86_64Gpr, xmm_tmp: X86_64Xmmr, xmm_shift: X86_64Xmmr) {
 		asm.pcmpeqd_s_s(xmm_tmp, xmm_tmp);
 		asm.psllq_i(xmm_tmp, 63);
 		// shift modulo 64

diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3
@@ -461,6 +461,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 		state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0);
 	}
 
+	def visit_I8X16_SHL() { visit_V128_SHFIT1(mmasm.emit_i8x16_shl); } // byte-lane shifts go through macro-assembler routines that need extra temps
+	def visit_I8X16_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_s); } // routine unpacks bytes to words, shifts with psraw, repacks
+	def visit_I8X16_SHR_U() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_u); } // routine shifts with psrlw and repacks with packuswb
 	def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); }
 	def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); }
 	def visit_I8X16_NEG() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i8x16_neg); }
@@ -488,6 +491,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); }
 	def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); }
 
+	def visit_I16X8_SHL() { visit_V128_SHFIT2(4, asm.psllw_s_s); } // width 4: emit_v128_shift masks the count with (1 << 4) - 1 = 15
+	def visit_I16X8_SHR_S() { visit_V128_SHFIT2(4, asm.psraw_s_s); } // same 4-bit count mask, psraw emitter
+	def visit_I16X8_SHR_U() { visit_V128_SHFIT2(4, asm.psrlw_s_s); } // same 4-bit count mask, psrlw emitter
 	def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
 	def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
 	def visit_I16X8_MUL() { do_op2_x_x(ValueKind.V128, asm.pmullw_s_s); }
@@ -526,6 +532,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); }
 	def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); }	
 
+	def visit_I32X4_SHL() { visit_V128_SHFIT2(5, asm.pslld_s_s); } // width 5: emit_v128_shift masks the count with (1 << 5) - 1 = 31
+	def visit_I32X4_SHR_S() { visit_V128_SHFIT2(5, asm.psrad_s_s); } // same 5-bit count mask, psrad emitter
+	def visit_I32X4_SHR_U() { visit_V128_SHFIT2(5, asm.psrld_s_s); } // same 5-bit count mask, psrld emitter
 	def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); }
 	def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); }
 	def visit_I32X4_MUL() { do_op2_x_x(ValueKind.V128, asm.pmulld_s_s); }
@@ -561,6 +570,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 	def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); }
 	def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); }
 
+	def visit_I64X2_SHL() { visit_V128_SHFIT2(6, asm.psllq_s_s); } // width 6: emit_v128_shift masks the count with (1 << 6) - 1 = 63
+	def visit_I64X2_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i64x2_shr_s); } // handled by a macro-assembler routine, so it takes the extra-temps path
+	def visit_I64X2_SHR_U() { visit_V128_SHFIT2(6, asm.psrlq_s_s); } // same 6-bit count mask, psrlq emitter
 	def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); }
 	def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); }
 	def visit_I64X2_MUL() { do_op2_x_x(ValueKind.V128, mmasm.emit_i64x2_mul(_, _, X(allocTmp(ValueKind.V128)), X(allocTmp(ValueKind.V128)))); }
@@ -669,6 +681,25 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
 		state.push(sv.kindFlagsMatching(ValueKind.V128, IN_REG), sv.reg, 0);
 	}
 
+	private def visit_V128_SHFIT1<T>(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) { // v128 shift via a routine called as (dst, count, gpr tmp, xmm tmp, xmm tmp)
+		var b = popReg(); // popped first; passed to {masm_shift} as the shift count (GPR)
+		var a = popRegToOverwrite(); // vector operand; its register doubles as the destination
+		var gtmp = G(allocTmp(ValueKind.I64)); // one general-purpose scratch register
+		var xtmp0 = X(allocTmp(ValueKind.V128)); // two vector scratch registers
+		var xtmp1 = X(allocTmp(ValueKind.V128));
+		masm_shift(X(a.reg), G(b.reg), gtmp, xtmp0, xtmp1);
+		state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0); // result is pushed in a's register
+	} // NOTE(review): "SHFIT" looks like a typo for "SHIFT"; rename together with all callers
+
+	private def visit_V128_SHFIT2<T>(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) { // v128 shift via emit_v128_shift; {width} is the number of count bits kept
+		var b = popReg(); // popped first; passed on as the shift count (GPR)
+		var a = popRegToOverwrite(); // vector operand; its register doubles as the destination
+		var gtmp = G(allocTmp(ValueKind.I64)); // scratch GPR for emit_v128_shift
+		var xtmp = X(allocTmp(ValueKind.V128)); // scratch xmm for emit_v128_shift
+		mmasm.emit_v128_shift(X(a.reg), G(b.reg), width, gtmp, xtmp, asm_shift);
+		state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0); // result is pushed in a's register
+	} // NOTE(review): "SHFIT" looks like a typo for "SHIFT"; rename together with all callers
+
 	// r1 = op(r1)
 	private def do_op1_r<T>(kind: ValueKind, emit: (X86_64Gpr -> T)) -> bool {
 		var sv = popRegToOverwrite(), r = G(sv.reg);