From 35cedfe82a73eadbe6efdd09ad4143f822bdd31b Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 00:50:06 -0400 Subject: [PATCH 1/3] [simd]: Factor out conversion instructions --- src/engine/x86-64/X86_64Interpreter.v3 | 37 ++++++++--------------- src/engine/x86-64/X86_64MacroAssembler.v3 | 10 ++++++ 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 239e041b..a24b279c 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -2574,6 +2574,18 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movdqu_m_s(vsph[-1].value, r_xmm0); endHandler(); } + // Simd type conversions + genSimdUnop(Opcode.F32X4_CONVERT_I32X4_S, asm.cvtdq2ps_s_s); + bindHandler(Opcode.F32X4_CONVERT_I32X4_U); { + asm.movdqu_s_m(r_xmm0, vsph[-1].value); + masm.emit_f32x4_convert_i32x4_u(r_xmm0, r_xmm1); + asm.movdqu_m_s(vsph[-1].value, r_xmm0); + endHandler(); + } + genSimdUnop(Opcode.F64X2_CONVERT_LOW_I32X4_S, asm.cvtdq2pd_s_s); + genSimdUnop(Opcode.F64X2_PROMOTE_LOW_F32X4, asm.cvtps2pd_s_s); + genSimdUnop(Opcode.F32X4_DEMOTE_F64X2_ZERO, asm.cvtpd2ps_s_s); + // Lane-wise unary operations for (t in [ (Opcode.I8X16_ABS, asm.pabsb_s_s), @@ -2582,11 +2594,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { (Opcode.F32X4_SQRT, asm.sqrtps_s_s), (Opcode.F64X2_SQRT, asm.sqrtpd_s_s), - (Opcode.F32X4_CONVERT_I32X4_S, asm.cvtdq2ps_s_s), - (Opcode.F64X2_CONVERT_LOW_I32X4_S, asm.cvtdq2pd_s_s), - (Opcode.F64X2_PROMOTE_LOW_F32X4, asm.cvtps2pd_s_s), - (Opcode.F32X4_DEMOTE_F64X2_ZERO, asm.cvtpd2ps_s_s), - (Opcode.I16X8_EXTEND_LOW_I8X16_S, asm.pmovsxbw_s_s), (Opcode.I16X8_EXTEND_LOW_I8X16_U, asm.pmovzxbw_s_s), (Opcode.I32X4_EXTEND_LOW_I16X8_S, asm.pmovsxwd_s_s), @@ -2647,25 +2654,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movdqu_m_s(vsph[-1].value, r_xmm0); endHandler(); } - 
for (t in [ - (Opcode.F32X4_CONVERT_I32X4_U, asm.cvtdq2ps_s_m) - ]) { - bindHandler(t.0); - asm.movdqu_s_m(r_xmm0, vsph[-1].value); - var dst = r_xmm0; - var scratch = r_xmm1; - asm.pxor_s_s(scratch, scratch); - asm.pblendw_s_s_i(scratch, dst, 0x55); - asm.psubd_s_s(dst, scratch); - asm.cvtdq2ps_s_s(scratch, scratch); - asm.psrld_i(dst, 1); - asm.cvtdq2ps_s_s(dst, dst); - asm.addps_s_s(dst, dst); - asm.addps_s_s(dst, scratch); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); - endHandler(); - } - for (t in [ (Opcode.I32X4_TRUNC_SAT_F32X4_S, masm.emit_i32x4_trunc_sat_f32x4_s(_, r_xmm1)), (Opcode.I32X4_TRUNC_SAT_F32X4_U, masm.emit_i32x4_trunc_sat_f32x4_u(_, r_xmm1, r_xmm2)), @@ -2678,7 +2666,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movdqu_m_s(vsph[-1].value, r_xmm0); endHandler(); } - bindHandler(Opcode.I8X16_SHUFFLE); { var RHS = X86_64Label.new(), LOOP_PRO = X86_64Label.new(), LOOP_EPI = X86_64Label.new(); incrementVsp(); // make room for a local variable dst (the result) diff --git a/src/engine/x86-64/X86_64MacroAssembler.v3 b/src/engine/x86-64/X86_64MacroAssembler.v3 index 2321037c..f2a287e2 100644 --- a/src/engine/x86-64/X86_64MacroAssembler.v3 +++ b/src/engine/x86-64/X86_64MacroAssembler.v3 @@ -1095,6 +1095,16 @@ class X86_64MacroAssembler extends MacroAssembler { asm.psrld_i(dst, 10); asm.andnps_s_s(dst, scratch); } + def emit_f32x4_convert_i32x4_u(dst: X86_64Xmmr, scratch: X86_64Xmmr) { + asm.pxor_s_s(scratch, scratch); + asm.pblendw_s_s_i(scratch, dst, 0x55); + asm.psubd_s_s(dst, scratch); + asm.cvtdq2ps_s_s(scratch, scratch); + asm.psrld_i(dst, 1); + asm.cvtdq2ps_s_s(dst, dst); + asm.addps_s_s(dst, dst); + asm.addps_s_s(dst, scratch); + } def emit_f64x2_min(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) { asm.movaps_s_s(scratch, src); asm.minpd_s_s(scratch, dst); From 4a1b7d05ff452d1659ab927f136fd3aad6168a81 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 01:03:06 -0400 Subject: [PATCH 2/3] 
[simd/jit]: Implement floating point conversion instructions --- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 86364cbd..936cd79a 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -485,6 +485,8 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I8X16_ADD_SAT_U() { do_op2_x_x(ValueKind.V128, asm.paddusb_s_s); } def visit_I8X16_SUB_SAT_S() { do_op2_x_x(ValueKind.V128, asm.psubsb_s_s); } def visit_I8X16_SUB_SAT_U() { do_op2_x_x(ValueKind.V128, asm.psubusb_s_s); } + def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); } + def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); } def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); } def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); } @@ -516,7 +518,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I16X8_EXTMUL_LOW_I8X16_U() { do_op2_x_x(ValueKind.V128, mmasm.emit_i16x8_extmul_low(_, _, X(allocTmp(ValueKind.V128)), false)); } def visit_I16X8_EXTMUL_HIGH_I8X16_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_extmul_high_s); } def visit_I16X8_EXTMUL_HIGH_I8X16_U() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_extmul_high_u); } - def visit_I16X8_Q15MULRSAT_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_q15mulrsat_s); } // todo: factor out this routine + def visit_I16X8_Q15MULRSAT_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_q15mulrsat_s); } + def visit_I16X8_NARROW_I32X4_S() { do_op2_x_x(ValueKind.V128, asm.packssdw_s_s); } + def visit_I16X8_NARROW_I32X4_U() { do_op2_x_x(ValueKind.V128, asm.packusdw_s_s); } def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); } def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, 
asm.psubd_s_s); } @@ -582,6 +586,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_F32X4_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEG_INF)); } def visit_F32X4_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_ZERO)); } def visit_F32X4_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEAREST)); } + def visit_F32X4_CONVERT_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.cvtdq2ps_s_s); } + def visit_F32X4_CONVERT_I32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_f32x4_convert_i32x4_u); } + def visit_F32X4_DEMOTE_F64X2_ZERO() { do_op1_x_x(ValueKind.V128, asm.cvtpd2ps_s_s); } def visit_F64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.addpd_s_s); } def visit_F64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.subpd_s_s); } @@ -604,6 +611,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_F64X2_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEG_INF)); } def visit_F64X2_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_ZERO)); } def visit_F64X2_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEAREST)); } + def visit_F64X2_CONVERT_LOW_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.cvtdq2pd_s_s); } + def visit_F64X2_CONVERT_LOW_I32X4_U() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_f64x2_convert_low_i32x4_u); } + def visit_F64X2_PROMOTE_LOW_F32X4() { do_op1_x_x(ValueKind.V128, asm.cvtps2pd_s_s); } def visit_V128_BITSELECT() { var c = popReg(); From ccaf72678a01fa159abc14da06ff26d2fefc3e12 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 01:33:44 -0400 Subject: [PATCH 3/3] [simd/jit]: Implement int to int extensions --- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index 936cd79a..a7bc9857 
100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -521,6 +521,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I16X8_Q15MULRSAT_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_q15mulrsat_s); } def visit_I16X8_NARROW_I32X4_S() { do_op2_x_x(ValueKind.V128, asm.packssdw_s_s); } def visit_I16X8_NARROW_I32X4_U() { do_op2_x_x(ValueKind.V128, asm.packusdw_s_s); } + def visit_I16X8_EXTEND_LOW_I8X16_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxbw_s_s); } + def visit_I16X8_EXTEND_LOW_I8X16_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxbw_s_s); } + def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); } + def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); } def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); } def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); } @@ -552,6 +556,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I32X4_TRUNC_SAT_F32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f32x4_u(_, _, X(allocTmp(ValueKind.V128)))); } def visit_I32X4_TRUNC_SAT_F64X2_S_ZERO() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f64x2_s_zero(_, _, _, X(allocTmp(ValueKind.V128)))); } def visit_I32X4_TRUNC_SAT_F64X2_U_ZERO() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f64x2_u_zero(_, _, _, X(allocTmp(ValueKind.V128)))); } + def visit_I32X4_EXTEND_LOW_I16X8_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxwd_s_s); } + def visit_I32X4_EXTEND_LOW_I16X8_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxwd_s_s); } + def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); } + def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); } def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, 
asm.paddq_s_s); } def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); } @@ -564,6 +572,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_I64X2_GE_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_ge_s); } def visit_I64X2_LE_S() { do_c_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_ge_s); } def visit_I64X2_ABS() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_abs); } + def visit_I64X2_EXTEND_LOW_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxdq_s_s); } + def visit_I64X2_EXTEND_LOW_I32X4_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxdq_s_s); } + def visit_I64X2_EXTEND_HIGH_I32X4_S() { do_op1_x(ValueKind.V128, mmasm.emit_i64x2_s_convert_i32x4_high); } + def visit_I64X2_EXTEND_HIGH_I32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_u_convert_i32x4_high); } def visit_F32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.addps_s_s); } def visit_F32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.subps_s_s); }