Skip to content

Commit

Permalink
[simd/jit]: Implement v128 boolean instructions (#115 from haoyu-zc/j…
Browse files Browse the repository at this point in the history
…it-boolean)
  • Loading branch information
titzer authored Aug 17, 2023
2 parents 121d0ee + 935c4f0 commit 888ef4c
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 15 deletions.
22 changes: 7 additions & 15 deletions src/engine/x86-64/X86_64Interpreter.v3
Original file line number Diff line number Diff line change
Expand Up @@ -2339,26 +2339,20 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
}
bindHandler(Opcode.V128_ANYTRUE); {
asm.movdqu_s_m(r_xmm0, vsph[-1].value);
asm.q.xor_r_r(r_tmp0, r_tmp0);
asm.ptest_s_s(r_xmm0, r_xmm0);
asm.set_r(C.NZ, r_tmp0);
masm.emit_v128_anytrue(r_tmp0, r_xmm0);
asm.movd_m_r(vsph[-1].value, r_tmp0);
genTagUpdate(BpTypeCode.I32.code);
endHandler();
}
for (t in [
(Opcode.I8X16_ALLTRUE, asm.pcmpeqb_s_s),
(Opcode.I16X8_ALLTRUE, asm.pcmpeqw_s_s),
(Opcode.I32X4_ALLTRUE, asm.pcmpeqd_s_s),
(Opcode.I64X2_ALLTRUE, asm.pcmpeqq_s_s)
(Opcode.I8X16_ALLTRUE, masm.emit_i8x16_alltrue),
(Opcode.I16X8_ALLTRUE, masm.emit_i16x8_alltrue),
(Opcode.I32X4_ALLTRUE, masm.emit_i32x4_alltrue),
(Opcode.I64X2_ALLTRUE, masm.emit_i64x2_alltrue)
]) {
bindHandler(t.0);
asm.movdqu_s_m(r_xmm0, vsph[-1].value);
asm.q.xor_r_r(r_tmp0, r_tmp0);
asm.pxor_s_s(r_xmm1, r_xmm1);
t.1(r_xmm1, r_xmm0);
asm.ptest_s_s(r_xmm1, r_xmm1);
asm.set_r(C.Z, r_tmp0);
t.1(r_tmp0, r_xmm0, r_xmm1);
asm.movd_m_r(vsph[-1].value, r_tmp0);
genTagUpdate(BpTypeCode.I32.code);
endHandler();
Expand All @@ -2377,9 +2371,7 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
}
bindHandler(Opcode.I16X8_BITMASK); {
asm.movdqu_s_m(r_xmm0, vsph[-1].value);
asm.packsswb_s_s(r_xmm0, r_xmm0);
asm.pmovmskb_r_s(r_tmp0, r_xmm0);
asm.shr_r_i(r_tmp0, 8);
masm.emit_i16x8_bitmask(r_tmp0, r_xmm0);
asm.movd_m_r(vsph[-1].value, r_tmp0);
genTagUpdate(BpTypeCode.I32.code);
endHandler();
Expand Down
22 changes: 22 additions & 0 deletions src/engine/x86-64/X86_64MacroAssembler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -1067,6 +1067,28 @@ class X86_64MacroAssembler extends MacroAssembler {
asm.pmuludq_s_s(dst, scratch);
}
}
// Emits code that leaves 1 in {dst} if any bit of the 128-bit value in {src} is set,
// and 0 otherwise. {src} is only read.
def emit_v128_anytrue(dst: X86_64Gpr, src: X86_64Xmmr) {
// Clear {dst} up front: xor modifies the flags, so it must come before the ptest
// whose flags the final set instruction consumes.
asm.q.xor_r_r(dst, dst);
// ptest of a register with itself sets ZF exactly when the register is all zeroes.
asm.ptest_s_s(src, src);
// NZ <=> at least one bit of {src} was set.
asm.set_r(C.NZ, dst);
}
// Shared code sequence for the lane-wise all_true operations: leaves 1 in {dst} if no
// lane of {src} is zero, and 0 otherwise.
//   dst:  general-purpose register receiving the result
//   src:  vector operand; it is only passed as the second argument to {pcmp} and not otherwise touched here
//   tmp:  vector scratch register, overwritten
//   pcmp: emitter for the lane-wise equality compare of the desired lane width
//         (the callers in this class pass asm.pcmpeq{b,w,d,q}_s_s)
private def emit_v128_alltrue<T>(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
// Clear {dst} first; xor modifies the flags, so it must precede the ptest below.
asm.q.xor_r_r(dst, dst);
// tmp = 0, then compare tmp with src lane by lane: with an equality compare, a lane of
// {tmp} becomes all ones exactly where the corresponding lane of {src} is zero.
asm.pxor_s_s(tmp, tmp);
pcmp(tmp, src);
// ZF is set iff {tmp} is entirely zero, i.e. no lane of {src} compared equal to zero.
asm.ptest_s_s(tmp, tmp);
asm.set_r(C.Z, dst);
}
// Lane-width specific all_true emitters. Each one delegates to {emit_v128_alltrue},
// choosing the packed equality compare that matches its lane size; {tmp} is a vector
// scratch register that the shared sequence overwrites.
def emit_i8x16_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) {
	// 16 lanes of 8 bits
	emit_v128_alltrue(dst, src, tmp, asm.pcmpeqb_s_s);
}
def emit_i16x8_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) {
	// 8 lanes of 16 bits
	emit_v128_alltrue(dst, src, tmp, asm.pcmpeqw_s_s);
}
def emit_i32x4_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) {
	// 4 lanes of 32 bits
	emit_v128_alltrue(dst, src, tmp, asm.pcmpeqd_s_s);
}
def emit_i64x2_alltrue(dst: X86_64Gpr, src: X86_64Xmmr, tmp: X86_64Xmmr) {
	// 2 lanes of 64 bits
	emit_v128_alltrue(dst, src, tmp, asm.pcmpeqq_s_s);
}

// Emits code that gathers the sign bit of each of the eight 16-bit lanes of {src} into
// the low 8 bits of {dst}.
// NOTE: {src} is OVERWRITTEN (the pack below is done in place), so callers must pass a
// register whose contents they no longer need.
def emit_i16x8_bitmask(dst: X86_64Gpr, src: X86_64Xmmr) {
// Narrow the eight words to bytes with signed saturation (which preserves each sign);
// packing {src} with itself yields the same eight bytes in both halves of the register.
asm.packsswb_s_s(src, src);
// One mask bit per byte: 16 bits, where the upper 8 duplicate the lower 8.
asm.pmovmskb_r_s(dst, src);
// Drop the duplicate half so that only the 8 lane bits remain.
asm.shr_r_i(dst, 8);
}
def emit_i8x16_popcnt(dst: X86_64Xmmr, tmp1: X86_64Gpr, xtmp1: X86_64Xmmr, xtmp2: X86_64Xmmr, mask: X86_64Xmmr) {
// load masks
load_v128_mask(xtmp1, mask_i8x16_splat_0x0f, tmp1);
Expand Down
18 changes: 18 additions & 0 deletions src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,16 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(b.kindFlagsMatching(ValueKind.V128, IN_REG), b.reg, 0);
}

// v128 boolean reductions: each takes a V128 operand and produces an I32 result via
// do_op1_r_x, using the corresponding macro-assembler sequence.
def visit_V128_ANYTRUE() { do_op1_r_x(ValueKind.I32, mmasm.emit_v128_anytrue); }
// The all_true sequences need a vector scratch register; it is obtained with allocTmp
// and bound as the third argument of the emitter.
// NOTE(review): the allocTmp call appears in the argument expression, so it presumably
// runs before do_op1_r_x pops the operand -- confirm this ordering is intended.
def visit_I8X16_ALLTRUE() { do_op1_r_x(ValueKind.I32, mmasm.emit_i8x16_alltrue(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I16X8_ALLTRUE() { do_op1_r_x(ValueKind.I32, mmasm.emit_i16x8_alltrue(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I32X4_ALLTRUE() { do_op1_r_x(ValueKind.I32, mmasm.emit_i32x4_alltrue(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I64X2_ALLTRUE() { do_op1_r_x(ValueKind.I32, mmasm.emit_i64x2_alltrue(_, _, X(allocTmp(ValueKind.V128)))); }
// Bitmask extraction: each takes a V128 operand and produces an I32 whose low bits are
// the per-lane sign bits. The 8-, 32- and 64-bit lane forms map directly onto a single
// move-mask instruction that only reads its vector operand.
def visit_I8X16_BITMASK() { do_op1_r_x(ValueKind.I32, asm.pmovmskb_r_s); }
def visit_I16X8_BITMASK() {
	// There is no 16-bit move-mask instruction; mmasm.emit_i16x8_bitmask packs its source
	// register in place before extracting the mask, which destroys the operand register.
	// do_op1_r_x pops with popReg() (contract: r1 = op(r2), source left intact), so it
	// must not be used here: pop with popRegToOverwrite() instead, as the in-place SIMD
	// helpers do, so that the clobbered register is one this instruction may overwrite.
	var sv = popRegToOverwrite(), src = X(sv.reg);
	var dst = allocRegTos(ValueKind.I32);
	mmasm.emit_i16x8_bitmask(G(dst), src);
	state.push(sv.kindFlagsMatching(ValueKind.I32, IN_REG), dst, 0);
}
def visit_I32X4_BITMASK() { do_op1_r_x(ValueKind.I32, asm.movmskps_r_s); }
def visit_I64X2_BITMASK() { do_op1_r_x(ValueKind.I32, asm.movmskpd_r_s); }

def visit_I8X16_SHL() { visit_V128_SHFIT1(mmasm.emit_i8x16_shl); }
def visit_I8X16_SHR_S() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_s); }
def visit_I8X16_SHR_U() { visit_V128_SHFIT1(mmasm.emit_i8x16_shr_u); }
Expand Down Expand Up @@ -933,6 +943,14 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(sv.kindFlagsMatching(kind, IN_REG), d, 0);
return true;
}
// Unary operation from a vector-register operand to a general-purpose-register
// result: r1 = op(r2).
// Pops the operand into a register, allocates a register of {kind} for the new top of
// stack, lets {emit} generate the instruction(s) as emit(dst, src), and pushes the
// result. The operand is popped with popReg(), so {emit} is expected to leave the
// source register unchanged. Always returns true.
private def do_op1_r_x<T>(kind: ValueKind, emit: (X86_64Gpr, X86_64Xmmr) -> T) -> bool {
	var operand = popReg();
	var src = X(operand.reg);
	var dst = allocRegTos(kind);
	emit(G(dst), src);
	var flags = operand.kindFlagsMatching(kind, IN_REG);
	state.push(flags, dst, 0);
	return true;
}
// r1 = op(r1), SIMD unop
private def do_op1_x<T>(kind: ValueKind, emit: (X86_64Xmmr) -> T) -> bool {
var sv = popRegToOverwrite(), r = X(sv.reg);
Expand Down

0 comments on commit 888ef4c

Please sign in to comment.