From 7fb1f7db10ccda77ccc7c3e2e0b7b6f99630c26f Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 00:08:48 -0400 Subject: [PATCH 1/3] [simd]: Factor out rounding instructions --- src/engine/x86-64/X86_64Interpreter.v3 | 33 +++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index e11f63f3..3a4d18a6 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -2600,22 +2600,16 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { asm.movdqu_m_s(vsph[-1].value, r_xmm0); endHandler(); } - for (t in [ - (Opcode.F32X4_CEIL, X86_64Rounding.TO_POS_INF, asm.roundps_s_m), - (Opcode.F32X4_FLOOR, X86_64Rounding.TO_NEG_INF, asm.roundps_s_m), - (Opcode.F32X4_TRUNC, X86_64Rounding.TO_ZERO, asm.roundps_s_m), - (Opcode.F32X4_NEAREST, X86_64Rounding.TO_NEAREST, asm.roundps_s_m), - - (Opcode.F64X2_CEIL, X86_64Rounding.TO_POS_INF, asm.roundpd_s_m), - (Opcode.F64X2_FLOOR, X86_64Rounding.TO_NEG_INF, asm.roundpd_s_m), - (Opcode.F64X2_TRUNC, X86_64Rounding.TO_ZERO, asm.roundpd_s_m), - (Opcode.F64X2_NEAREST, X86_64Rounding.TO_NEAREST, asm.roundpd_s_m) - ]) { - bindHandler(t.0); - t.2(r_xmm0, vsph[-1].value, t.1); - asm.movdqu_m_s(vsph[-1].value, r_xmm0); - endHandler(); - } + + genSimdUnop(Opcode.F32X4_CEIL, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_POS_INF)); + genSimdUnop(Opcode.F32X4_FLOOR, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_NEG_INF)); + genSimdUnop(Opcode.F32X4_TRUNC, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_ZERO)); + genSimdUnop(Opcode.F32X4_NEAREST, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_NEAREST)); + genSimdUnop(Opcode.F64X2_CEIL, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_POS_INF)); + genSimdUnop(Opcode.F64X2_FLOOR, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_NEG_INF)); + genSimdUnop(Opcode.F64X2_TRUNC, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_ZERO)); + 
genSimdUnop(Opcode.F64X2_NEAREST, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_NEAREST)); + // Unary operations that need masks for (t in [ (Opcode.F32X4_NEG, masm.emit_v128_negps), @@ -2839,6 +2833,13 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { decrementVsp(); endHandler(); } + def genSimdUnop<T>(opcode: Opcode, f: (X86_64Xmmr) -> T) { + bindHandler(opcode); + asm.movdqu_s_m(r_xmm0, vsph[-1].value); + f(r_xmm0); + asm.movdqu_m_s(vsph[-1].value, r_xmm0); + endHandler(); + } def bindHandler(opcode: Opcode) { if (FastIntTuning.handlerAlignment > 1) w.align(FastIntTuning.handlerAlignment); var pos = w.atEnd().pos; From 60810fc05e111341057b7453f24cb0af33cdf3af Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 00:17:03 -0400 Subject: [PATCH 2/3] [simd/jit]: Implement floating point roundings --- src/engine/x86-64/X86_64SinglePassCompiler.v3 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/engine/x86-64/X86_64SinglePassCompiler.v3 b/src/engine/x86-64/X86_64SinglePassCompiler.v3 index eaa54284..8c2d932a 100644 --- a/src/engine/x86-64/X86_64SinglePassCompiler.v3 +++ b/src/engine/x86-64/X86_64SinglePassCompiler.v3 @@ -574,6 +574,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_F32X4_MIN() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_f32x4_min); } def visit_F32X4_MAX() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_f32x4_max); } def visit_F32X4_ABS() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_v128_absps); } + def visit_F32X4_CEIL() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_POS_INF)); } + def visit_F32X4_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEG_INF)); } + def visit_F32X4_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_ZERO)); } + def visit_F32X4_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEAREST)); } def visit_F64X2_ADD() { 
do_op2_x_x(ValueKind.V128, asm.addpd_s_s); } def visit_F64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.subpd_s_s); } @@ -592,7 +596,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler { def visit_F64X2_MIN() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_f64x2_min); } def visit_F64X2_MAX() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_f64x2_max); } def visit_F64X2_ABS() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_v128_abspd); } - + def visit_F64X2_CEIL() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_POS_INF)); } + def visit_F64X2_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEG_INF)); } + def visit_F64X2_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_ZERO)); } + def visit_F64X2_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEAREST)); } def visit_V128_BITSELECT() { var c = popReg(); From e9cc14bff7a06fb53d17b92d8bc2ce29ad55e7a4 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 4 Aug 2023 00:36:22 -0400 Subject: [PATCH 3/3] [simd]: Factor out genSimdUnop() --- src/engine/x86-64/X86_64Interpreter.v3 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/engine/x86-64/X86_64Interpreter.v3 b/src/engine/x86-64/X86_64Interpreter.v3 index 3a4d18a6..a4374a06 100644 --- a/src/engine/x86-64/X86_64Interpreter.v3 +++ b/src/engine/x86-64/X86_64Interpreter.v3 @@ -2601,14 +2601,14 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { endHandler(); } - genSimdUnop(Opcode.F32X4_CEIL, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_POS_INF)); - genSimdUnop(Opcode.F32X4_FLOOR, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_NEG_INF)); - genSimdUnop(Opcode.F32X4_TRUNC, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_ZERO)); - genSimdUnop(Opcode.F32X4_NEAREST, asm.roundps_s_s(_, r_xmm0, X86_64Rounding.TO_NEAREST)); - genSimdUnop(Opcode.F64X2_CEIL, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_POS_INF)); - 
genSimdUnop(Opcode.F64X2_FLOOR, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_NEG_INF)); - genSimdUnop(Opcode.F64X2_TRUNC, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_ZERO)); - genSimdUnop(Opcode.F64X2_NEAREST, asm.roundpd_s_s(_, r_xmm0, X86_64Rounding.TO_NEAREST)); + genSimdUnop(Opcode.F32X4_CEIL, asm.roundps_s_s(_, _, X86_64Rounding.TO_POS_INF)); + genSimdUnop(Opcode.F32X4_FLOOR, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEG_INF)); + genSimdUnop(Opcode.F32X4_TRUNC, asm.roundps_s_s(_, _, X86_64Rounding.TO_ZERO)); + genSimdUnop(Opcode.F32X4_NEAREST, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEAREST)); + genSimdUnop(Opcode.F64X2_CEIL, asm.roundpd_s_s(_, _, X86_64Rounding.TO_POS_INF)); + genSimdUnop(Opcode.F64X2_FLOOR, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEG_INF)); + genSimdUnop(Opcode.F64X2_TRUNC, asm.roundpd_s_s(_, _, X86_64Rounding.TO_ZERO)); + genSimdUnop(Opcode.F64X2_NEAREST, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEAREST)); // Unary operations that need masks for (t in [ @@ -2833,10 +2833,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) { decrementVsp(); endHandler(); } - def genSimdUnop<T>(opcode: Opcode, f: (X86_64Xmmr) -> T) { + def genSimdUnop<T>(opcode: Opcode, f: (X86_64Xmmr, X86_64Xmmr) -> T) { bindHandler(opcode); asm.movdqu_s_m(r_xmm0, vsph[-1].value); - f(r_xmm0); + f(r_xmm0, r_xmm0); asm.movdqu_m_s(vsph[-1].value, r_xmm0); endHandler(); }