Skip to content

Commit

Permalink
[simd/jit]: Implement simd type conversion instructions (#107 from ha…
Browse files Browse the repository at this point in the history
…oyu-zc/jit-conv)
  • Loading branch information
titzer authored Aug 5, 2023
2 parents 35c6c1c + ccaf726 commit f81aad6
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 26 deletions.
37 changes: 12 additions & 25 deletions src/engine/x86-64/X86_64Interpreter.v3
Original file line number Diff line number Diff line change
Expand Up @@ -2574,6 +2574,18 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
asm.movdqu_m_s(vsph[-1].value, r_xmm0);
endHandler();
}
// Simd type conversions
// The signed i32x4 -> f32x4 conversion is registered through genSimdUnop with a single
// assembler routine (cvtdq2ps).
genSimdUnop(Opcode.F32X4_CONVERT_I32X4_S, asm.cvtdq2ps_s_s);
// The unsigned variant is emitted by a macro-assembler routine that needs a scratch
// register, so it gets an explicit handler instead of going through genSimdUnop.
bindHandler(Opcode.F32X4_CONVERT_I32X4_U); {
// Load the operand from vsph[-1] (presumably the top value-stack slot -- confirm) into xmm0.
asm.movdqu_s_m(r_xmm0, vsph[-1].value);
// Convert in xmm0, with xmm1 passed as the scratch register.
masm.emit_f32x4_convert_i32x4_u(r_xmm0, r_xmm1);
// Write the result back over the same slot the operand was read from.
asm.movdqu_m_s(vsph[-1].value, r_xmm0);
endHandler();
}
// The remaining conversions are each registered with a single assembler routine.
genSimdUnop(Opcode.F64X2_CONVERT_LOW_I32X4_S, asm.cvtdq2pd_s_s);
genSimdUnop(Opcode.F64X2_PROMOTE_LOW_F32X4, asm.cvtps2pd_s_s);
genSimdUnop(Opcode.F32X4_DEMOTE_F64X2_ZERO, asm.cvtpd2ps_s_s);

// Lane-wise unary operations
for (t in [
(Opcode.I8X16_ABS, asm.pabsb_s_s),
Expand All @@ -2582,11 +2594,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
(Opcode.F32X4_SQRT, asm.sqrtps_s_s),
(Opcode.F64X2_SQRT, asm.sqrtpd_s_s),

(Opcode.F32X4_CONVERT_I32X4_S, asm.cvtdq2ps_s_s),
(Opcode.F64X2_CONVERT_LOW_I32X4_S, asm.cvtdq2pd_s_s),
(Opcode.F64X2_PROMOTE_LOW_F32X4, asm.cvtps2pd_s_s),
(Opcode.F32X4_DEMOTE_F64X2_ZERO, asm.cvtpd2ps_s_s),

(Opcode.I16X8_EXTEND_LOW_I8X16_S, asm.pmovsxbw_s_s),
(Opcode.I16X8_EXTEND_LOW_I8X16_U, asm.pmovzxbw_s_s),
(Opcode.I32X4_EXTEND_LOW_I16X8_S, asm.pmovsxwd_s_s),
Expand Down Expand Up @@ -2647,25 +2654,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
asm.movdqu_m_s(vsph[-1].value, r_xmm0);
endHandler();
}
for (t in [
(Opcode.F32X4_CONVERT_I32X4_U, asm.cvtdq2ps_s_m)
]) {
bindHandler(t.0);
asm.movdqu_s_m(r_xmm0, vsph[-1].value);
var dst = r_xmm0;
var scratch = r_xmm1;
asm.pxor_s_s(scratch, scratch);
asm.pblendw_s_s_i(scratch, dst, 0x55);
asm.psubd_s_s(dst, scratch);
asm.cvtdq2ps_s_s(scratch, scratch);
asm.psrld_i(dst, 1);
asm.cvtdq2ps_s_s(dst, dst);
asm.addps_s_s(dst, dst);
asm.addps_s_s(dst, scratch);
asm.movdqu_m_s(vsph[-1].value, r_xmm0);
endHandler();
}

for (t in [
(Opcode.I32X4_TRUNC_SAT_F32X4_S, masm.emit_i32x4_trunc_sat_f32x4_s(_, r_xmm1)),
(Opcode.I32X4_TRUNC_SAT_F32X4_U, masm.emit_i32x4_trunc_sat_f32x4_u(_, r_xmm1, r_xmm2)),
Expand All @@ -2678,7 +2666,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
asm.movdqu_m_s(vsph[-1].value, r_xmm0);
endHandler();
}

bindHandler(Opcode.I8X16_SHUFFLE); {
var RHS = X86_64Label.new(), LOOP_PRO = X86_64Label.new(), LOOP_EPI = X86_64Label.new();
incrementVsp(); // make room for a local variable dst (the result)
Expand Down
10 changes: 10 additions & 0 deletions src/engine/x86-64/X86_64MacroAssembler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,16 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.psrld_i(dst, 10);
asm.andnps_s_s(dst, scratch);
}
// Emits the instruction sequence for f32x4.convert_i32x4_u: each 32-bit lane of {dst},
// taken as an unsigned integer, is converted to a float in place. {scratch} is overwritten.
// cvtdq2ps treats its input as signed, so each lane is split into its low 16 bits and the
// remaining high part; the two parts are converted separately and then added.
// (Comments below assume the assembler's (destination, source) operand order.)
def emit_f32x4_convert_i32x4_u(dst: X86_64Xmmr, scratch: X86_64Xmmr) {
asm.pxor_s_s(scratch, scratch); // scratch = 0
asm.pblendw_s_s_i(scratch, dst, 0x55); // mask 0x55 selects the even 16-bit words: scratch = low 16 bits of each lane
asm.psubd_s_s(dst, scratch); // dst = each lane with its low 16 bits cleared (the high part)
asm.cvtdq2ps_s_s(scratch, scratch); // convert the low parts; values below 2^16 convert exactly
asm.psrld_i(dst, 1); // halve the high part so its top bit is clear and it reads as non-negative
asm.cvtdq2ps_s_s(dst, dst); // convert the halved high part
asm.addps_s_s(dst, dst); // double it to undo the shift
asm.addps_s_s(dst, scratch); // high + low; this final add is the step where rounding can occur
}
def emit_f64x2_min(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
asm.movaps_s_s(scratch, src);
asm.minpd_s_s(scratch, dst);
Expand Down
24 changes: 23 additions & 1 deletion src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,8 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// i8x16 saturating add/sub: each visitor passes one packed-integer assembler routine to do_op2_x_x
// (presumably the generic "two XMM operands" binop emitter -- defined elsewhere, confirm).
def visit_I8X16_ADD_SAT_U() { do_op2_x_x(ValueKind.V128, asm.paddusb_s_s); }
def visit_I8X16_SUB_SAT_S() { do_op2_x_x(ValueKind.V128, asm.psubsb_s_s); }
def visit_I8X16_SUB_SAT_U() { do_op2_x_x(ValueKind.V128, asm.psubusb_s_s); }
// i8x16.narrow_i16x8_{s,u}: emitted with the pack-words-to-bytes instructions
// (packsswb saturates as signed, packuswb as unsigned).
def visit_I8X16_NARROW_I16X8_S() { do_op2_x_x(ValueKind.V128, asm.packsswb_s_s); }
def visit_I8X16_NARROW_I16X8_U() { do_op2_x_x(ValueKind.V128, asm.packuswb_s_s); }

// i16x8 wrapping add/sub.
def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
Expand Down Expand Up @@ -516,7 +518,13 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// i16x8 extended multiplies. The low/unsigned form binds an explicitly allocated V128 temp and
// a `false` flag into the macro-assembler routine; the high forms use do_op2_x_x_xtmp
// (presumably supplies an XMM temp as the third argument -- confirm against its definition).
def visit_I16X8_EXTMUL_LOW_I8X16_U() { do_op2_x_x(ValueKind.V128, mmasm.emit_i16x8_extmul_low(_, _, X(allocTmp(ValueKind.V128)), false)); }
def visit_I16X8_EXTMUL_HIGH_I8X16_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_extmul_high_s); }
def visit_I16X8_EXTMUL_HIGH_I8X16_U() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_extmul_high_u); }
// NOTE(review): visit_I16X8_Q15MULRSAT_S appears twice here; the two lines differ only in the
// trailing todo comment. Presumably only one definition exists in the real file -- confirm.
def visit_I16X8_Q15MULRSAT_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_q15mulrsat_s); } // todo: factor out this routine
def visit_I16X8_Q15MULRSAT_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_q15mulrsat_s); }
// i16x8.narrow_i32x4_{s,u}: emitted with the pack-dwords-to-words instructions
// (packssdw saturates as signed, packusdw as unsigned).
def visit_I16X8_NARROW_I32X4_S() { do_op2_x_x(ValueKind.V128, asm.packssdw_s_s); }
def visit_I16X8_NARROW_I32X4_U() { do_op2_x_x(ValueKind.V128, asm.packusdw_s_s); }
// i16x8.extend_{low,high}_i8x16_{s,u}: the low forms use a single sign-/zero-extending move
// (pmovsxbw / pmovzxbw); the high forms call macro-assembler routines, and only the unsigned
// high form goes through the *_xtmp helper (presumably because it needs a scratch register).
def visit_I16X8_EXTEND_LOW_I8X16_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxbw_s_s); }
def visit_I16X8_EXTEND_LOW_I8X16_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxbw_s_s); }
def visit_I16X8_EXTEND_HIGH_I8X16_S() { do_op1_x(ValueKind.V128, mmasm.emit_i16x8_s_convert_i8x16_high); }
def visit_I16X8_EXTEND_HIGH_I8X16_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i16x8_u_convert_i8x16_high); }

// i32x4 wrapping add/sub.
def visit_I32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.paddd_s_s); }
def visit_I32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.psubd_s_s); }
Expand Down Expand Up @@ -548,6 +556,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// i32x4 saturating truncations from floats: each binds one extra explicitly allocated V128 temp
// into the macro-assembler routine; the f64x2 forms go through do_op1_x_gtmp_xtmp
// (presumably supplies a general-purpose temp and an XMM temp -- confirm against its definition).
def visit_I32X4_TRUNC_SAT_F32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f32x4_u(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I32X4_TRUNC_SAT_F64X2_S_ZERO() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f64x2_s_zero(_, _, _, X(allocTmp(ValueKind.V128)))); }
def visit_I32X4_TRUNC_SAT_F64X2_U_ZERO() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_i32x4_trunc_sat_f64x2_u_zero(_, _, _, X(allocTmp(ValueKind.V128)))); }
// i32x4.extend_{low,high}_i16x8_{s,u}: the low forms use a single sign-/zero-extending move
// (pmovsxwd / pmovzxwd); the high forms call macro-assembler routines, and only the unsigned
// high form goes through the *_xtmp helper (presumably because it needs a scratch register).
def visit_I32X4_EXTEND_LOW_I16X8_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxwd_s_s); }
def visit_I32X4_EXTEND_LOW_I16X8_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxwd_s_s); }
def visit_I32X4_EXTEND_HIGH_I16X8_S() { do_op1_x(ValueKind.V128, mmasm.emit_i32x4_s_convert_i16x8_high); }
def visit_I32X4_EXTEND_HIGH_I16X8_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i32x4_u_convert_i16x8_high); }

// i64x2 wrapping add/sub.
def visit_I64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.paddq_s_s); }
def visit_I64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.psubq_s_s); }
Expand All @@ -560,6 +572,10 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// i64x2 signed comparisons and abs. LE_S reuses the GE_S routine through do_c_op2_x_x_xtmp
// (presumably the operand-swapping variant of the helper -- confirm against its definition).
def visit_I64X2_GE_S() { do_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_ge_s); }
def visit_I64X2_LE_S() { do_c_op2_x_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_ge_s); }
def visit_I64X2_ABS() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_abs); }
// i64x2.extend_{low,high}_i32x4_{s,u}: the low forms use a single sign-/zero-extending move
// (pmovsxdq / pmovzxdq); the high forms call macro-assembler routines, and only the unsigned
// high form goes through the *_xtmp helper (presumably because it needs a scratch register).
def visit_I64X2_EXTEND_LOW_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.pmovsxdq_s_s); }
def visit_I64X2_EXTEND_LOW_I32X4_U() { do_op1_x_x(ValueKind.V128, asm.pmovzxdq_s_s); }
def visit_I64X2_EXTEND_HIGH_I32X4_S() { do_op1_x(ValueKind.V128, mmasm.emit_i64x2_s_convert_i32x4_high); }
def visit_I64X2_EXTEND_HIGH_I32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_i64x2_u_convert_i32x4_high); }

// f32x4 add/sub.
def visit_F32X4_ADD() { do_op2_x_x(ValueKind.V128, asm.addps_s_s); }
def visit_F32X4_SUB() { do_op2_x_x(ValueKind.V128, asm.subps_s_s); }
Expand All @@ -582,6 +598,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// f32x4 rounding: all use roundps, with the rounding mode bound as the third argument.
def visit_F32X4_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEG_INF)); }
def visit_F32X4_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_ZERO)); }
def visit_F32X4_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundps_s_s(_, _, X86_64Rounding.TO_NEAREST)); }
// Conversions producing f32x4: the signed integer convert and the f64x2 demote are single
// instructions (cvtdq2ps / cvtpd2ps); the unsigned convert calls a macro-assembler routine
// through do_op1_x_xtmp (presumably supplies the XMM scratch register it takes -- confirm).
def visit_F32X4_CONVERT_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.cvtdq2ps_s_s); }
def visit_F32X4_CONVERT_I32X4_U() { do_op1_x_xtmp(ValueKind.V128, mmasm.emit_f32x4_convert_i32x4_u); }
def visit_F32X4_DEMOTE_F64X2_ZERO() { do_op1_x_x(ValueKind.V128, asm.cvtpd2ps_s_s); }

// f64x2 add/sub.
def visit_F64X2_ADD() { do_op2_x_x(ValueKind.V128, asm.addpd_s_s); }
def visit_F64X2_SUB() { do_op2_x_x(ValueKind.V128, asm.subpd_s_s); }
Expand All @@ -604,6 +623,9 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// f64x2 rounding: all use roundpd, with the rounding mode bound as the third argument.
def visit_F64X2_FLOOR() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEG_INF)); }
def visit_F64X2_TRUNC() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_ZERO)); }
def visit_F64X2_NEAREST() { do_op1_x_x(ValueKind.V128, asm.roundpd_s_s(_, _, X86_64Rounding.TO_NEAREST)); }
// Conversions producing f64x2: the signed integer convert and the f32x4 promote are single
// instructions (cvtdq2pd / cvtps2pd); the unsigned convert calls a macro-assembler routine
// through do_op1_x_gtmp_xtmp (presumably supplies a general-purpose temp and an XMM temp -- confirm).
def visit_F64X2_CONVERT_LOW_I32X4_S() { do_op1_x_x(ValueKind.V128, asm.cvtdq2pd_s_s); }
def visit_F64X2_CONVERT_LOW_I32X4_U() { do_op1_x_gtmp_xtmp(ValueKind.V128, mmasm.emit_f64x2_convert_low_i32x4_u); }
def visit_F64X2_PROMOTE_LOW_F32X4() { do_op1_x_x(ValueKind.V128, asm.cvtps2pd_s_s); }

def visit_V128_BITSELECT() {
var c = popReg();
Expand Down

0 comments on commit f81aad6

Please sign in to comment.