From 97077611177069714bd9b6ad141237e3d6c4c74b Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Sun, 15 Oct 2023 04:57:13 -0400 Subject: [PATCH 1/5] [simd/v3i]: Implement i32x4 and i16x8 basic arithmetic instructions --- src/engine/V3Eval.v3 | 43 ++++++++++++++++++++++++++++++++++ src/engine/v3/V3Interpreter.v3 | 8 +++++++ 2 files changed, 51 insertions(+) diff --git a/src/engine/V3Eval.v3 b/src/engine/V3Eval.v3 index 1e3f9990..56617d1b 100644 --- a/src/engine/V3Eval.v3 +++ b/src/engine/V3Eval.v3 @@ -301,6 +301,30 @@ component V3Eval { var not_b = V128_NOT(b); return V128_AND(a, not_b); } + def I32X4_ADD(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x4(a, b, u32.+); + } + def I32X4_SUB(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x4(a, b, u32.-); + } + def I32X4_MUL(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x4(a, b, u32.*); + } + def I32X4_NEG(a: (u64, u64)) -> (u64, u64) { + return do_vv_v_x4((0, 0), a, u32.-); + } + def I16X8_ADD(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x8(a, b, u16.+); + } + def I16X8_SUB(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x8(a, b, u16.-); + } + def I16X8_MUL(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x8(a, b, u16.*); + } + def I16X8_NEG(a: (u64, u64)) -> (u64, u64) { + return do_vv_v_x8((0, 0), a, u16.-); + } // ---- rounding and conversion ---------------------------------------- def I32_WRAP_I64 = u32.view; @@ -379,6 +403,25 @@ component V3Eval { var r1 = f(a.1, b.1); return (r0, r1); } + private def do_vv_v_x4(a: (u64, u64), b: (u64, u64), f: (u32, u32) -> u32) -> (u64, u64) { // Performs a 4-lane binop + var r0 = f(u32.view(a.0), u32.view(b.0)); + var r1 = f(u32.view(a.0 >> 32), u32.view(b.0 >> 32)); + var r2 = f(u32.view(a.1), u32.view(b.1)); + var r3 = f(u32.view(a.1 >> 32), u32.view(b.1 >> 32)); + return ((u64.view(r1) << 32) | r0, (u64.view(r3) << 32) | r2); + } + private def do_vv_v_x8(a: (u64, u64), b: (u64, u64), f: 
(u16, u16) -> u16) -> (u64, u64) { // Performs an 8-lane binop + var r0 = f(u16.view(a.0), u16.view(b.0)); + var r1 = f(u16.view(a.0 >> 16), u16.view(b.0 >> 16)); + var r2 = f(u16.view(a.0 >> 32), u16.view(b.0 >> 32)); + var r3 = f(u16.view(a.0 >> 48), u16.view(b.0 >> 48)); + var r4 = f(u16.view(a.1), u16.view(b.1)); + var r5 = f(u16.view(a.1 >> 16), u16.view(b.1 >> 16)); + var r6 = f(u16.view(a.1 >> 32), u16.view(b.1 >> 32)); + var r7 = f(u16.view(a.1 >> 48), u16.view(b.1 >> 48)); + return ((u64.view(r3) << 48) | (u64.view(r2) << 32) | (u64.view(r1) << 16) | r0, + (u64.view(r7) << 48) | (u64.view(r6) << 32) | (u64.view(r5) << 16) | r4); + } private def canonf(a: float) -> float { return if(a == a, a, float.nan); } diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index 6f2ffb8e..36236e17 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -916,6 +916,14 @@ component V3Interpreter { V128_XOR => do_vv_v(V3Eval.V128_XOR); V128_BITSELECT => do_vvv_v(V3Eval.V128_BITSELECT); V128_ANDNOT => do_vv_v(V3Eval.V128_ANDNOT); + I32X4_ADD => do_vv_v(V3Eval.I32X4_ADD); + I32X4_SUB => do_vv_v(V3Eval.I32X4_SUB); + I32X4_MUL => do_vv_v(V3Eval.I32X4_MUL); + I32X4_NEG => do_v_v(V3Eval.I32X4_NEG); + I16X8_ADD => do_vv_v(V3Eval.I16X8_ADD); + I16X8_SUB => do_vv_v(V3Eval.I16X8_SUB); + I16X8_MUL => do_vv_v(V3Eval.I16X8_MUL); + I16X8_NEG => do_v_v(V3Eval.I16X8_NEG); INVALID => trap(TrapReason.INVALID_OPCODE); CRASH_EXEC => System.error("WizengError", "crash-exec opcode executed"); CRASH_COMPILER => System.error("WizengError", "crash-compiler opcode executed"); From a6cbf5dfbe5a47d69c1c70f0fb02ba96a773421b Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Sun, 15 Oct 2023 04:52:07 -0400 Subject: [PATCH 2/5] [simd/v3i]: Implement i8x16 basic arithmetic instructions --- src/engine/V3Eval.v3 | 36 ++++++++++++++++++++++++++++++++++ src/engine/v3/V3Interpreter.v3 | 3 +++ 2 files changed, 39 insertions(+) diff --git a/src/engine/V3Eval.v3 
b/src/engine/V3Eval.v3 index 56617d1b..49b83494 100644 --- a/src/engine/V3Eval.v3 +++ b/src/engine/V3Eval.v3 @@ -325,6 +325,15 @@ component V3Eval { def I16X8_NEG(a: (u64, u64)) -> (u64, u64) { return do_vv_v_x8((0, 0), a, u16.-); } + def I8X16_ADD(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x16(a, b, u8.+); + } + def I8X16_SUB(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x16(a, b, u8.-); + } + def I8X16_NEG(a: (u64, u64)) -> (u64, u64) { + return do_vv_v_x16((0, 0), a, u8.-); + } // ---- rounding and conversion ---------------------------------------- def I32_WRAP_I64 = u32.view; @@ -422,6 +431,33 @@ component V3Eval { return ((u64.view(r3) << 48) | (u64.view(r2) << 32) | (u64.view(r1) << 16) | r0, (u64.view(r7) << 48) | (u64.view(r6) << 32) | (u64.view(r5) << 16) | r4); } + private def do_vv_v_x16(a: (u64, u64), b: (u64, u64), f: (u8, u8) -> u8) -> (u64, u64) { // Performs a 16-lane binop + var r0 = f(u8.view(a.0), u8.view(b.0)); + var r1 = f(u8.view(a.0 >> 8), u8.view(b.0 >> 8)); + var r2 = f(u8.view(a.0 >> 16), u8.view(b.0 >> 16)); + var r3 = f(u8.view(a.0 >> 24), u8.view(b.0 >> 24)); + var r4 = f(u8.view(a.0 >> 32), u8.view(b.0 >> 32)); + var r5 = f(u8.view(a.0 >> 40), u8.view(b.0 >> 40)); + var r6 = f(u8.view(a.0 >> 48), u8.view(b.0 >> 48)); + var r7 = f(u8.view(a.0 >> 56), u8.view(b.0 >> 56)); + + var r8 = f(u8.view(a.1), u8.view(b.1)); + var r9 = f(u8.view(a.1 >> 8), u8.view(b.1 >> 8)); + var r10 = f(u8.view(a.1 >> 16), u8.view(b.1 >> 16)); + var r11 = f(u8.view(a.1 >> 24), u8.view(b.1 >> 24)); + var r12 = f(u8.view(a.1 >> 32), u8.view(b.1 >> 32)); + var r13 = f(u8.view(a.1 >> 40), u8.view(b.1 >> 40)); + var r14 = f(u8.view(a.1 >> 48), u8.view(b.1 >> 48)); + var r15 = f(u8.view(a.1 >> 56), u8.view(b.1 >> 56)); + + return ( + (u64.view(r7) << 56) | (u64.view(r6) << 48) | (u64.view(r5) << 40) | (u64.view(r4) << 32) | + (u64.view(r3) << 24) | (u64.view(r2) << 16) | (u64.view(r1) << 8) | r0, + + (u64.view(r15) << 56) | 
(u64.view(r14) << 48) | (u64.view(r13) << 40) | (u64.view(r12) << 32) | + (u64.view(r11) << 24) | (u64.view(r10) << 16) | (u64.view(r9) << 8) | r8 + ); + } private def canonf(a: float) -> float { return if(a == a, a, float.nan); } diff --git a/src/engine/v3/V3Interpreter.v3 b/src/engine/v3/V3Interpreter.v3 index 36236e17..3f325989 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -924,6 +924,9 @@ component V3Interpreter { I16X8_SUB => do_vv_v(V3Eval.I16X8_SUB); I16X8_MUL => do_vv_v(V3Eval.I16X8_MUL); I16X8_NEG => do_v_v(V3Eval.I16X8_NEG); + I8X16_ADD => do_vv_v(V3Eval.I8X16_ADD); + I8X16_SUB => do_vv_v(V3Eval.I8X16_SUB); + I8X16_NEG => do_v_v(V3Eval.I8X16_NEG); INVALID => trap(TrapReason.INVALID_OPCODE); CRASH_EXEC => System.error("WizengError", "crash-exec opcode executed"); CRASH_COMPILER => System.error("WizengError", "crash-compiler opcode executed"); From 4cf612f69e515c17f530b51e4ccc1fc26de54d71 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Sun, 15 Oct 2023 07:23:54 -0400 Subject: [PATCH 3/5] [simd/v3i]: Implement i64x2 basic arithmetic instructions --- src/engine/V3Eval.v3 | 12 ++++++++++++ src/engine/v3/V3Interpreter.v3 | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/src/engine/V3Eval.v3 b/src/engine/V3Eval.v3 index 49b83494..844bd7da 100644 --- a/src/engine/V3Eval.v3 +++ b/src/engine/V3Eval.v3 @@ -301,6 +301,18 @@ component V3Eval { var not_b = V128_NOT(b); return V128_AND(a, not_b); } + def I64X2_ADD(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x2(a, b, u64.+); + } + def I64X2_SUB(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x2(a, b, u64.-); + } + def I64X2_MUL(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { + return do_vv_v_x2(a, b, u64.*); + } + def I64X2_NEG(a: (u64, u64)) -> (u64, u64) { + return do_vv_v_x2((0, 0), a, u64.-); + } def I32X4_ADD(a: (u64, u64), b: (u64, u64)) -> (u64, u64) { return do_vv_v_x4(a, b, u32.+); } diff --git a/src/engine/v3/V3Interpreter.v3 
b/src/engine/v3/V3Interpreter.v3 index 3f325989..c7d8dc37 100644 --- a/src/engine/v3/V3Interpreter.v3 +++ b/src/engine/v3/V3Interpreter.v3 @@ -916,6 +916,10 @@ component V3Interpreter { V128_XOR => do_vv_v(V3Eval.V128_XOR); V128_BITSELECT => do_vvv_v(V3Eval.V128_BITSELECT); V128_ANDNOT => do_vv_v(V3Eval.V128_ANDNOT); + I64X2_ADD => do_vv_v(V3Eval.I64X2_ADD); + I64X2_SUB => do_vv_v(V3Eval.I64X2_SUB); + I64X2_MUL => do_vv_v(V3Eval.I64X2_MUL); + I64X2_NEG => do_v_v(V3Eval.I64X2_NEG); I32X4_ADD => do_vv_v(V3Eval.I32X4_ADD); I32X4_SUB => do_vv_v(V3Eval.I32X4_SUB); I32X4_MUL => do_vv_v(V3Eval.I32X4_MUL); From b578ee4736783063f302621538f0f062a41cdeb2 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Sun, 15 Oct 2023 09:57:59 -0400 Subject: [PATCH 4/5] [simd/v3i]: Refactor to use loops for computations with >=8 lanes --- src/engine/V3Eval.v3 | 67 ++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/engine/V3Eval.v3 b/src/engine/V3Eval.v3 index 844bd7da..15c43e51 100644 --- a/src/engine/V3Eval.v3 +++ b/src/engine/V3Eval.v3 @@ -432,43 +432,44 @@ component V3Eval { return ((u64.view(r1) << 32) | r0, (u64.view(r3) << 32) | r2); } private def do_vv_v_x8(a: (u64, u64), b: (u64, u64), f: (u16, u16) -> u16) -> (u64, u64) { // Performs an 8-lane binop - var r0 = f(u16.view(a.0), u16.view(b.0)); - var r1 = f(u16.view(a.0 >> 16), u16.view(b.0 >> 16)); - var r2 = f(u16.view(a.0 >> 32), u16.view(b.0 >> 32)); - var r3 = f(u16.view(a.0 >> 48), u16.view(b.0 >> 48)); - var r4 = f(u16.view(a.1), u16.view(b.1)); - var r5 = f(u16.view(a.1 >> 16), u16.view(b.1 >> 16)); - var r6 = f(u16.view(a.1 >> 32), u16.view(b.1 >> 32)); - var r7 = f(u16.view(a.1 >> 48), u16.view(b.1 >> 48)); - return ((u64.view(r3) << 48) | (u64.view(r2) << 32) | (u64.view(r1) << 16) | r0, - (u64.view(r7) << 48) | (u64.view(r6) << 32) | (u64.view(r5) << 16) | r4); + var result0: u64 = 0; + var result1: u64 = 0; + + for (i < 4) { + var shift_amount: byte = byte.!(i * 
16); + + var r_a = u16.view((a.0 >> shift_amount) & 0xFFFF); + var r_b = u16.view((b.0 >> shift_amount) & 0xFFFF); + var res = f(r_a, r_b); + result0 |= (u64.view(res) << shift_amount); + + r_a = u16.view((a.1 >> shift_amount) & 0xFFFF); + r_b = u16.view((b.1 >> shift_amount) & 0xFFFF); + res = f(r_a, r_b); + result1 |= (u64.view(res) << shift_amount); + } + + return (result0, result1); } private def do_vv_v_x16(a: (u64, u64), b: (u64, u64), f: (u8, u8) -> u8) -> (u64, u64) { // Performs a 16-lane binop - var r0 = f(u8.view(a.0), u8.view(b.0)); - var r1 = f(u8.view(a.0 >> 8), u8.view(b.0 >> 8)); - var r2 = f(u8.view(a.0 >> 16), u8.view(b.0 >> 16)); - var r3 = f(u8.view(a.0 >> 24), u8.view(b.0 >> 24)); - var r4 = f(u8.view(a.0 >> 32), u8.view(b.0 >> 32)); - var r5 = f(u8.view(a.0 >> 40), u8.view(b.0 >> 40)); - var r6 = f(u8.view(a.0 >> 48), u8.view(b.0 >> 48)); - var r7 = f(u8.view(a.0 >> 56), u8.view(b.0 >> 56)); - - var r8 = f(u8.view(a.1), u8.view(b.1)); - var r9 = f(u8.view(a.1 >> 8), u8.view(b.1 >> 8)); - var r10 = f(u8.view(a.1 >> 16), u8.view(b.1 >> 16)); - var r11 = f(u8.view(a.1 >> 24), u8.view(b.1 >> 24)); - var r12 = f(u8.view(a.1 >> 32), u8.view(b.1 >> 32)); - var r13 = f(u8.view(a.1 >> 40), u8.view(b.1 >> 40)); - var r14 = f(u8.view(a.1 >> 48), u8.view(b.1 >> 48)); - var r15 = f(u8.view(a.1 >> 56), u8.view(b.1 >> 56)); + var result0: u64 = 0; + var result1: u64 = 0; - return ( - (u64.view(r7) << 56) | (u64.view(r6) << 48) | (u64.view(r5) << 40) | (u64.view(r4) << 32) | - (u64.view(r3) << 24) | (u64.view(r2) << 16) | (u64.view(r1) << 8) | r0, + for (i < 8) { + var shift_amount: byte = byte.!(i * 8); + + var r_a = u8.view((a.0 >> shift_amount) & 0xFF); + var r_b = u8.view((b.0 >> shift_amount) & 0xFF); + var res = f(r_a, r_b); + result0 |= (u64.view(res) << shift_amount); + + r_a = u8.view((a.1 >> shift_amount) & 0xFF); + r_b = u8.view((b.1 >> shift_amount) & 0xFF); + res = f(r_a, r_b); + result1 |= (u64.view(res) << shift_amount); + } - (u64.view(r15) << 
56) | (u64.view(r14) << 48) | (u64.view(r13) << 40) | (u64.view(r12) << 32) | - (u64.view(r11) << 24) | (u64.view(r10) << 16) | (u64.view(r9) << 8) | r8 - ); + return (result0, result1); } private def canonf(a: float) -> float { return if(a == a, a, float.nan); From 9f04cf340acecfb4bc8f20c43bc692888ce5b0ba Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Sun, 15 Oct 2023 11:11:55 -0400 Subject: [PATCH 5/5] [simd/v3i]: Remove unnecessary locals --- src/engine/V3Eval.v3 | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/engine/V3Eval.v3 b/src/engine/V3Eval.v3 index 15c43e51..a9b34758 100644 --- a/src/engine/V3Eval.v3 +++ b/src/engine/V3Eval.v3 @@ -432,44 +432,40 @@ component V3Eval { return ((u64.view(r1) << 32) | r0, (u64.view(r3) << 32) | r2); } private def do_vv_v_x8(a: (u64, u64), b: (u64, u64), f: (u16, u16) -> u16) -> (u64, u64) { // Performs an 8-lane binop - var result0: u64 = 0; - var result1: u64 = 0; + var low: u64 = 0; + var high: u64 = 0; - for (i < 4) { - var shift_amount: byte = byte.!(i * 16); - - var r_a = u16.view((a.0 >> shift_amount) & 0xFFFF); - var r_b = u16.view((b.0 >> shift_amount) & 0xFFFF); + for (shift: byte = 0; shift < 64; shift += 16) { + var r_a = u16.view((a.0 >> shift) & 0xFFFF); + var r_b = u16.view((b.0 >> shift) & 0xFFFF); var res = f(r_a, r_b); - result0 |= (u64.view(res) << shift_amount); + low |= (u64.view(res) << shift); - r_a = u16.view((a.1 >> shift_amount) & 0xFFFF); - r_b = u16.view((b.1 >> shift_amount) & 0xFFFF); + r_a = u16.view((a.1 >> shift) & 0xFFFF); + r_b = u16.view((b.1 >> shift) & 0xFFFF); res = f(r_a, r_b); - result1 |= (u64.view(res) << shift_amount); + high |= (u64.view(res) << shift); } - return (result0, result1); + return (low, high); } private def do_vv_v_x16(a: (u64, u64), b: (u64, u64), f: (u8, u8) -> u8) -> (u64, u64) { // Performs a 16-lane binop - var result0: u64 = 0; - var result1: u64 = 0; + var low: u64 = 0; + var high: u64 = 0; - 
for (i < 8) { - var shift_amount: byte = byte.!(i * 8); - - var r_a = u8.view((a.0 >> shift_amount) & 0xFF); - var r_b = u8.view((b.0 >> shift_amount) & 0xFF); + for (shift: byte = 0; shift < 64; shift += 8) { + var r_a = u8.view((a.0 >> shift) & 0xFF); + var r_b = u8.view((b.0 >> shift) & 0xFF); var res = f(r_a, r_b); - result0 |= (u64.view(res) << shift_amount); + low |= (u64.view(res) << shift); - r_a = u8.view((a.1 >> shift_amount) & 0xFF); - r_b = u8.view((b.1 >> shift_amount) & 0xFF); + r_a = u8.view((a.1 >> shift) & 0xFF); + r_b = u8.view((b.1 >> shift) & 0xFF); res = f(r_a, r_b); - result1 |= (u64.view(res) << shift_amount); + high |= (u64.view(res) << shift); } - return (result0, result1); + return (low, high); } private def canonf(a: float) -> float { return if(a == a, a, float.nan);