Skip to content

Commit

Permalink
[simd/v3i]: Refactor to use loops for computations with >8 lanes
Browse files Browse the repository at this point in the history
  • Loading branch information
haoyu-zc committed Oct 15, 2023
1 parent 4cf612f commit b578ee4
Showing 1 changed file with 34 additions and 33 deletions.
67 changes: 34 additions & 33 deletions src/engine/V3Eval.v3
Original file line number Diff line number Diff line change
Expand Up @@ -432,43 +432,44 @@ component V3Eval {
return ((u64.view(r1) << 32) | r0, (u64.view(r3) << 32) | r2);
}
// Performs an 8-lane binop: applies {f} to each pair of corresponding 16-bit lanes of {a}
// and {b} and packs the eight 16-bit results into the returned pair of u64 halves.
// Within each u64 half, lane {i} (0 <= i < 4) occupies bits [16*i, 16*i + 15]; every
// result is written back to the same half and bit position its operands were read from.
private def do_vv_v_x8(a: (u64, u64), b: (u64, u64), f: (u16, u16) -> u16) -> (u64, u64) {
	var result0: u64 = 0; // accumulates the lanes taken from a.0 / b.0
	var result1: u64 = 0; // accumulates the lanes taken from a.1 / b.1

	for (i < 4) {
		var shift_amount: byte = byte.!(i * 16); // bit offset of lane {i} inside a half

		// {u16.view} keeps only the low 16 bits of the shifted value, so no explicit
		// mask is required to isolate the lane.
		var res = f(u16.view(a.0 >> shift_amount), u16.view(b.0 >> shift_amount));
		result0 |= (u64.view(res) << shift_amount);

		res = f(u16.view(a.1 >> shift_amount), u16.view(b.1 >> shift_amount));
		result1 |= (u64.view(res) << shift_amount);
	}

	return (result0, result1);
}
// Performs a 16-lane binop: applies {f} to each pair of corresponding 8-bit lanes of {a}
// and {b} and packs the sixteen 8-bit results into the returned pair of u64 halves.
// Within each u64 half, lane {i} (0 <= i < 8) occupies bits [8*i, 8*i + 7]; every
// result is written back to the same half and bit position its operands were read from.
private def do_vv_v_x16(a: (u64, u64), b: (u64, u64), f: (u8, u8) -> u8) -> (u64, u64) {
	var result0: u64 = 0; // accumulates the lanes taken from a.0 / b.0
	var result1: u64 = 0; // accumulates the lanes taken from a.1 / b.1

	for (i < 8) {
		var shift_amount: byte = byte.!(i * 8); // bit offset of lane {i} inside a half

		// {u8.view} keeps only the low 8 bits of the shifted value, so no explicit
		// mask is required to isolate the lane.
		var res = f(u8.view(a.0 >> shift_amount), u8.view(b.0 >> shift_amount));
		result0 |= (u64.view(res) << shift_amount);

		res = f(u8.view(a.1 >> shift_amount), u8.view(b.1 >> shift_amount));
		result1 |= (u64.view(res) << shift_amount);
	}

	return (result0, result1);
}
private def canonf(a: float) -> float {
return if(a == a, a, float.nan);
Expand Down

0 comments on commit b578ee4

Please sign in to comment.