Skip to content

Commit

Permalink
[simd/jit]: Implement v128 replace_lane and extract_lane instructions (#110 from haoyu-zc/jit-replacelane)

Browse files Browse the repository at this point in the history
  • Loading branch information
titzer authored Aug 14, 2023
2 parents 5f926fd + c7381e8 commit eb81a6b
Show file tree
Hide file tree
Showing 5 changed files with 1,741 additions and 13 deletions.
102 changes: 89 additions & 13 deletions src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,60 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
def visit_V128_LOAD_32_LANE(imm: MemArg, lane: byte) { visit_V128_LOAD_LANE(imm, lane, asm.q.movd_r_m, asm.pinsrd_s_r_i); }
def visit_V128_LOAD_64_LANE(imm: MemArg, lane: byte) { visit_V128_LOAD_LANE(imm, lane, asm.q.movq_r_m, asm.pinsrq_s_r_i); }

// Integer v128 replace_lane: insert the popped scalar into the selected lane of
// the popped vector via the width-matched SSE4.1 pinsr{b,w,d,q} instruction.
def visit_I8X16_REPLACELANE(lane: byte) { visit_V128_REPLACE_LANE(lane, asm.pinsrb_s_r_i); }
def visit_I16X8_REPLACELANE(lane: byte) { visit_V128_REPLACE_LANE(lane, asm.pinsrw_s_r_i); }
def visit_I32X4_REPLACELANE(lane: byte) { visit_V128_REPLACE_LANE(lane, asm.pinsrd_s_r_i); }
def visit_I64X2_REPLACELANE(lane: byte) { visit_V128_REPLACE_LANE(lane, asm.pinsrq_s_r_i); }
// f32x4.replace_lane: replace the selected 32-bit lane of a v128 with an f32.
def visit_F32X4_REPLACELANE(lane: byte) {
var b = popReg(); // replacement scalar
var a = popRegToOverwrite(); // target vector; result is produced in place
// INSERTPS imm8 bits [5:4] select the destination lane (source lane is 0);
// masking with 0x30 keeps only that field for lane in 0..3.
asm.insertps_s_s_i(X(a.reg), X(b.reg), byte.view((lane << 4) & 0x30));
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}
// f64x2.replace_lane: replace one 64-bit lane of a v128 with an f64.
def visit_F64X2_REPLACELANE(lane: byte) {
var b = popReg(); // replacement scalar
var a = popRegToOverwrite(); // target vector; result is produced in place
if (lane == 0) {
asm.movsd_s_s(X(a.reg), X(b.reg)); // overwrite low qword, high qword preserved
} else {
asm.movlhps_s_s(X(a.reg), X(b.reg)); // copy b's low qword into a's high qword
}
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

// Integer v128 extract_lane: pextr the selected lane into a gpr. The _S
// variants additionally sign-extend the narrow result (movbsx/movwsx);
// the unsigned/full-width variants rely on pextr's zero-extension.
def visit_I8X16_EXTRACTLANE_S(lane: byte) { visit_V128_EXTRACT_LANE_S(ValueKind.I32, lane, asm.pextrb_r_s_i, asm.q.movbsx_r_r); }
def visit_I8X16_EXTRACTLANE_U(lane: byte) { visit_V128_EXTRACT_LANE(ValueKind.I32, lane, asm.pextrb_r_s_i); }
def visit_I16X8_EXTRACTLANE_S(lane: byte) { visit_V128_EXTRACT_LANE_S(ValueKind.I32, lane, asm.pextrw_r_s_i, asm.q.movwsx_r_r); }
def visit_I16X8_EXTRACTLANE_U(lane: byte) { visit_V128_EXTRACT_LANE(ValueKind.I32, lane, asm.pextrw_r_s_i); }
def visit_I32X4_EXTRACTLANE(lane: byte) { visit_V128_EXTRACT_LANE(ValueKind.I32, lane, asm.pextrd_r_s_i); }
def visit_I64X2_EXTRACTLANE(lane: byte) { visit_V128_EXTRACT_LANE(ValueKind.I64, lane, asm.pextrq_r_s_i); }
// f32x4.extract_lane: move the selected 32-bit lane into the low lane of a
// freshly allocated xmm destination, using the cheapest instruction per lane.
def visit_F32X4_EXTRACTLANE(lane: byte) {
var a = popReg(), r = X(a.reg); // source vector
var kind = ValueKind.F32;
var d = allocRegTos(kind); // destination register for the scalar result
if (lane == 0) {
asm.movss_s_s(X(d), r); // lane 0 is already the low lane
} else if (lane == 1) {
asm.movshdup_s_s(X(d), r); // duplicate odd lanes: d[0] = r[1]
} else if (lane == 2) {
asm.movhlps_s_s(X(d), r); // high qword to low: d[0] = r[2]
} else {
asm.pshufd_s_s_i(X(d), r, lane); // lane == 3: imm 3 selects r[3] into d[0]
}
state.push(SpcConsts.kindToFlags(kind) | IN_REG, d, 0);
}
// f64x2.extract_lane: move the selected 64-bit lane into the low lane of a
// freshly allocated xmm destination.
def visit_F64X2_EXTRACTLANE(lane: byte) {
var a = popReg(), r = X(a.reg); // source vector
var kind = ValueKind.F64;
var d = allocRegTos(kind); // destination register for the scalar result
if (lane == 0) {
asm.movsd_s_s(X(d), r); // low qword already holds the result
} else {
asm.movhlps_s_s(X(d), r); // high qword to low: d[0] = r[1]
}
state.push(SpcConsts.kindToFlags(kind) | IN_REG, d, 0);
}

def visit_V128_BITSELECT() {
var c = popReg();
var b = popReg();
Expand All @@ -653,6 +707,25 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

// Shared helper for v128 shifts emitted via a macro-assembler routine that
// needs one gpr scratch and two xmm scratch registers. The vector operand is
// shifted in place and pushed back.
// NOTE(review): "SHFIT" looks like a typo for "SHIFT"; renaming would touch
// every call site, so it is left as-is here.
private def visit_V128_SHFIT1<T>(masm_shift: (X86_64Xmmr, X86_64Gpr, X86_64Gpr, X86_64Xmmr, X86_64Xmmr) -> T) {
var b = popReg(); // shift amount (gpr)
var a = popRegToOverwrite(); // vector operand; result produced in place
var gtmp = G(allocTmp(ValueKind.I64)); // gpr scratch
var xtmp0 = X(allocTmp(ValueKind.V128)); // xmm scratch 0
var xtmp1 = X(allocTmp(ValueKind.V128)); // xmm scratch 1
masm_shift(X(a.reg), G(b.reg), gtmp, xtmp0, xtmp1);
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

// Shared helper for v128 shifts routed through mmasm.emit_v128_shift, which
// parameterizes on the lane width and a single xmm-by-xmm shift instruction.
// NOTE(review): "SHFIT" looks like a typo for "SHIFT"; kept for call-site
// compatibility.
private def visit_V128_SHFIT2<T>(width: byte, asm_shift: (X86_64Xmmr, X86_64Xmmr) -> T) {
var b = popReg(); // shift amount (gpr)
var a = popRegToOverwrite(); // vector operand; result produced in place
var gtmp = G(allocTmp(ValueKind.I64)); // gpr scratch
var xtmp = X(allocTmp(ValueKind.V128)); // xmm scratch
mmasm.emit_v128_shift(X(a.reg), G(b.reg), width, gtmp, xtmp, asm_shift);
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

private def visit_V128_LOAD_LANE<T>(imm: MemArg, lane: byte, asm_mov_r_m: (X86_64Gpr, X86_64Addr) -> T, asm_pins_s_r_i: (X86_64Xmmr, X86_64Gpr, byte) -> T) {
var sv = popRegToOverwrite(), r = X(sv.reg);
var base_reg = regs.mem0_base;
Expand Down Expand Up @@ -680,24 +753,27 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
asm_pins_s_r_i(r, G(lane_val), lane);
state.push(sv.kindFlagsMatching(ValueKind.V128, IN_REG), sv.reg, 0);
}

// Shared helper for integer replace_lane: pop the replacement scalar (gpr) and
// the target vector, insert the scalar into the selected lane with the given
// pinsr instruction, and push the vector back.
private def visit_V128_REPLACE_LANE<T>(lane: byte, asm_pins_s_r_i: (X86_64Xmmr, X86_64Gpr, byte) -> T) {
var b = popReg(); // replacement scalar (gpr)
var a = popRegToOverwrite(); // target vector; result produced in place
asm_pins_s_r_i(X(a.reg), G(b.reg), lane);
state.push(a.kindFlagsMatching(ValueKind.V128, IN_REG), a.reg, 0);
}

// Shared helper for unsigned/full-width integer extract_lane: pop the vector,
// pextr the selected lane into a freshly allocated gpr of the result kind, and
// push the scalar. (pextr zero-extends narrow lanes; sign-extending variants
// go through visit_V128_EXTRACT_LANE_S.)
private def visit_V128_EXTRACT_LANE<T>(kind: ValueKind, lane: byte, asm_pext_r_s_i: (X86_64Gpr, X86_64Xmmr, byte) -> T) {
var a = popReg(); // source vector
var d = allocRegTos(kind); // destination gpr for the scalar result
asm_pext_r_s_i(G(d), X(a.reg), lane);
state.push(SpcConsts.kindToFlags(kind) | IN_REG, d, 0);
}

// Shared helper for sign-extending integer extract_lane: pextr the selected
// lane into the destination gpr, then sign-extend it in place with the given
// move (movbsx/movwsx) before pushing the scalar result.
private def visit_V128_EXTRACT_LANE_S<T>(kind: ValueKind, lane: byte, asm_pext_r_s_i: (X86_64Gpr, X86_64Xmmr, byte) -> T, asm_movext_s_s: (X86_64Gpr, X86_64Gpr) -> T) {
var a = popReg(); // source vector
var d = allocRegTos(kind); // destination gpr for the scalar result
asm_pext_r_s_i(G(d), X(a.reg), lane);
asm_movext_s_s(G(d), G(d)); // sign-extend the narrow lane value in place
state.push(SpcConsts.kindToFlags(kind) | IN_REG, d, 0);
}

// r1 = op(r1)
Expand Down
Loading

0 comments on commit eb81a6b

Please sign in to comment.