tenstorrent · rdjogoTT · Mar 20, 2024 · Mar 13, 2024 · Mar 18, 2024 · Mar 18, 2024
diff --git a/common/inc/ckernel_sfpu.h b/common/inc/ckernel_sfpu.h
diff --git a/common/inc/sfpu/ckernel_sfpu_abs.h b/common/inc/sfpu/ckernel_sfpu_abs.h
@@ -0,0 +1,33 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Replaces each value in the destination register with its absolute value.
+//
+// Template parameters:
+//   APPROXIMATION_MODE - not referenced by this kernel.
+//   ITERATIONS         - not referenced by this kernel; the trip count is `iterations`.
+//
+// Parameters:
+//   iterations - number of loop passes; each pass handles dst_reg[0] and then advances dst_reg.
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _calculate_abs_(const int iterations)
+{
+    for (int pass = 0; pass < iterations; ++pass)
+    {
+        // Read through an explicit vFloat so the floating-point abs overload is selected.
+        vFloat input = dst_reg[0];
+        vFloat magnitude = sfpi::abs(input);
+        dst_reg[0] = magnitude;
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_add_int32.h b/common/inc/sfpu/ckernel_sfpu_add_int32.h
@@ -0,0 +1,41 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Adds two int32 operands held in the destination register and writes the int32 sum back.
+//
+// Template parameters:
+//   APPROXIMATION_MODE - not referenced by this kernel.
+//   ITERATIONS         - not referenced by this kernel; the trip count is the runtime
+//                        `iterations` argument.
+//
+// Parameters:
+//   iterations - number of loop passes; each pass handles one dest position and then
+//                advances dst_reg.
+//   dst_offset - position of operand B relative to operand A; it is multiplied by 64 to
+//                form the address passed to SFPLOAD.
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _add_int32_(const int iterations, const uint dst_offset) {
+    // Operand A is input1 (int32)
+    // Operand B is input2 (int32)
+    // Output is int32
+    #pragma GCC unroll 8
+    // Honour the caller-supplied trip count (previously the loop bound was the template
+    // constant, which left the `iterations` argument silently unused).
+    for (int d = 0; d < iterations; d++) {
+        // operand A - int32, loaded into LREG_0
+        TTI_SFPLOAD(0, 12, 3, 0);
+        // operand B - int32, loaded into LREG_1; the address depends on a runtime value,
+        // hence the non-immediate TT_ form of the macro
+        TT_SFPLOAD(1, 12, 3, dst_offset * 64);
+        TTI_SFPIADD(0, 1, 0, 4);
+        // MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result
+        TTI_NOP;
+        // LREG_0 -> dest as int32
+        TTI_SFPSTORE(0, 12, 3, 0);
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h b/common/inc/sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Runs an SFP_STOCH_RND instruction over each value in the destination register and stores
+// the result back in place. Going by the function name this is an fp32 -> fp16a cast.
+//
+// Template parameters:
+//   APPROXIMATION_MODE - not referenced by this kernel.
+//   ITERATIONS         - not referenced by this kernel; the trip count is `iterations`.
+//
+// Parameters:
+//   iterations - number of loop passes; each pass handles one dest position and then
+//                advances dst_reg.
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _cast_fp32_to_fp16a_(const int iterations)
+{
+    #pragma GCC unroll 8
+    for (int pass = 0; pass < iterations; ++pass)
+    {
+        // Raw-instruction sequence; presumably stands in for the SFPI form
+        //     vFloat val = dst_reg[0];
+        //     dst_reg[0] = float_to_fp16a(val, 0);
+        TTI_SFPLOAD(0, 0, 3, 0);
+        TTI_SFP_STOCH_RND(0, 0, 0, 0, 0, 8);
+        TTI_SFPSTORE(0, 1, 3, 0);
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_clamp.h b/common/inc/sfpu/ckernel_sfpu_clamp.h
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Clamps each value in the destination register against a lower and an upper bound and then
+// adds an offset to the clamped value.
+//
+// Template parameters:
+//   APPROXIMATION_MODE - not referenced by this kernel.
+//   ITERATIONS         - not referenced by this kernel; the trip count is `iterations`.
+//
+// Parameters:
+//   iterations - number of loop passes; each pass handles dst_reg[0] and then advances dst_reg.
+//   param0     - lower bound (min), decoded with s2vFloat16 as fp16a.
+//   param1     - upper bound (max), decoded with s2vFloat16 as fp16a.
+//   param2     - offset added after clamping, decoded with s2vFloat16b.
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _calculate_clamp_(const int iterations, uint param0, uint param1, uint param2)
+{
+    // The bound format is fixed to fp16a (it is not derived from the parameters).
+    s2vFloat16::Format bound_format = s2vFloat16::fp16a;
+
+    // Bounds used for the comparisons.
+    vFloat lower = s2vFloat16(param0, bound_format);
+    vFloat upper = s2vFloat16(param1, bound_format);
+
+    #pragma GCC unroll 0
+    for (int pass = 0; pass < iterations; ++pass)
+    {
+        vFloat value = dst_reg[0];
+
+        // Lanes below the lower bound take param0; lanes at or above the upper bound take param1.
+        v_if (value < lower) {
+            value = s2vFloat16(param0, bound_format);
+        } v_elseif (value >= upper) {
+            value = s2vFloat16(param1, bound_format);
+        }
+        v_endif;
+
+        // The offset is applied to every lane, clamped or not.
+        dst_reg[0] = value + s2vFloat16b(param2); // 12 bits
+
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_comp.h b/common/inc/sfpu/ckernel_sfpu_comp.h
@@ -0,0 +1,98 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+#include "ckernel_sfpu_is_fp16_zero.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Stores `init` into flag1, and also into flag2 when `check` is true.
+// flag2 is left untouched when `check` is false.
+sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& flag2, float init)
+{
+    if (check) {
+        flag2 = init;
+    }
+    flag1 = init;
+}
+
+// Overwrites each value in the destination register with a per-lane comparison result
+// (0.0f or 1.0f).
+//
+// Template parameters:
+//   APPROXIMATION_MODE      - not referenced in this function.
+//   invert_output           - swaps which of 1.0f / 0.0f is produced when the check is true.
+//   check_zero              - true: the check is _sfpu_is_fp16_zero_(v, exponent_size_8);
+//                             false: the check is v < 0.0F.
+//   second_check            - true: flag2 is populated too and combined with flag1 below;
+//                             false: the result is flag1 alone.
+//   is_less_than_equal_zero - with second_check, selects bitwise OR (true) or AND (false)
+//                             of the two flags.
+//   ITERATIONS              - not referenced in this function; the trip count is `iterations`.
+//
+// Parameters:
+//   iterations      - number of loop passes; each pass handles dst_reg[0] and then advances dst_reg.
+//   exponent_size_8 - forwarded to _sfpu_is_fp16_zero_ (only used when check_zero is true).
+template <bool APPROXIMATION_MODE, bool invert_output, bool check_zero, bool second_check, bool is_less_than_equal_zero, int ITERATIONS>
+inline void _calculate_comp_(const int iterations, uint exponent_size_8)
+{
+
+    // output_0 and output_1 hold the outputs used when a zero or negative check is true/false.
+    // False = 0.0 = kCONST_0 (5/8-bit exponent format)
+    // True  = 1.0 = kCONST_1_FP16B (8-bit exponent format)
+    // SFPU uses 8-bit exponent in operations so loading these constants in 8-bit exponent format.
+    // Although a command flag can tell SFPU to re-bias a 5-bit exponent to 8-bit, we are loading 8-bit
+    // exponent and telling SFPU to not add any bias to these constants.
+    // output_0 is written where the check is true, output_1 where it is false.
+    constexpr float output_0 = invert_output ? 0.0f : 1.0f;
+    constexpr float output_1 = invert_output ? 1.0f : 0.0f;
+
+    for (int d = 0; d < iterations; d++)
+    {
+        vFloat v = dst_reg[0];
+        // flag2 is only assigned (and only read) when second_check is true.
+        vFloat flag1, flag2;
+        if constexpr(check_zero)
+        {
+            // Zero check: lanes where v is zero get output_0, all others output_1.
+            v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
+            } v_else {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
+            }
+            v_endif;
+        }
+        else
+        {
+            // Negative check: lanes where v < 0 get output_0, all others output_1.
+            v_if (v < 0.0F) {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
+            } v_else {
+                _calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
+            }
+            v_endif;
+        }
+
+        // NOTE(review): _calculate_comp_init_flag_ assigns flag1 and flag2 the same value from
+        // the single check above, so as written (flag1 | flag2) and (flag1 & flag2) both equal
+        // flag1. The comments below describe two distinct checks (< 0 and == 0) - confirm
+        // whether a second predicate was intended here.
+        vFloat result;
+        if constexpr (second_check)
+        {
+            // less_than_equal_zero
+            // flag1 = 0x3F80(1.0) if DST < 0 else 0
+            // flag2 = 0x3F80(1.0) if DST == 0 else 0
+            // Do a bitwise Or (flag1 | flag2) to get <= condition.
+            // DST < 0 OR DST == 0 => DST is Less than or Equal to zero.
+            // Result will be either 0x0000(0.0) or 0x3F80(1.0)
+            if constexpr (is_less_than_equal_zero) {
+                result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) | reinterpret<vUInt>(flag2));
+            }
+            else
+            {
+                // greater_than_zero
+                // flag1 = 0x3F80(1.0) if DST >= 0 else 0
+                // flag2 = 0x3F80(1.0) if DST != 0 else 0
+                // Do a bitwise And (flag1 & flag2) to get > condition.
+                // DST >= 0 AND DST != 0 => DST is Greater than zero
+                // Result will be either 0x0000(0.0) or 0x3F80(1.0)
+                result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) & reinterpret<vUInt>(flag2));
+            }
+        } else {
+            result = flag1;
+        }
+
+        dst_reg[0] = result;
+
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_dropout.h b/common/inc/sfpu/ckernel_sfpu_dropout.h
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel_defs.h"
+#include "ckernel.h"
+#include "noc_nonblocking_api.h"
+
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Applies dropout to the destination register: every value is multiplied by `scale`, and lanes
+// whose current pseudo-random value is below `prob` are then set to zero. The generator state
+// is read from LReg3 on entry, advanced once per pass, and written back to LReg3 on exit.
+//
+// Template parameters:
+//   APPROXIMATION_MODE - not referenced by this kernel.
+//   ITERATIONS         - not referenced by this kernel; the trip count is `iterations`.
+//
+// Parameters:
+//   iterations - number of loop passes; each pass handles dst_reg[0] and then advances dst_reg.
+//   prob       - integer threshold compared against the generator state.
+//   scale      - multiplier applied to every value, decoded with s2vFloat16b.
+template <bool APPROXIMATION_MODE, int ITERATIONS>
+inline void _calculate_dropout_(const int iterations, uint prob, uint scale)
+{
+    FWLOG1("calculate_dropout() -- prob:%x", prob);
+    FWLOG1("calculate_dropout() -- scale:%x", scale);
+
+    // Generator state carried across calls in LReg3.
+    vUInt rng_state = l_reg[LRegs::LReg3];
+
+    #pragma GCC unroll 0
+    for (int pass = 0; pass < iterations; ++pass) {
+        // Scale first, so surviving lanes hold the scaled value.
+        dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);
+
+        // Zero the lanes selected for dropping.
+        v_if (rng_state < prob) {
+            dst_reg[0] = vConst0;
+        }
+        v_endif;
+
+        // 16-bit PRNG step: capture the bits selected by vConstIntPrgm1 before shifting,
+        // shift the state right by one, then XOR in vConstIntPrgm0 on lanes where the
+        // captured bits were non-zero.
+        vUInt select_mask = vConstIntPrgm1;
+        vUInt shifted_out = select_mask & rng_state;
+        rng_state = rng_state >> 1;
+        v_if (shifted_out != 0) {
+            vUInt feedback = vConstIntPrgm0;
+            rng_state ^= feedback;
+        }
+        v_endif;
+
+        dst_reg++;
+    }
+
+    // Persist the advanced state for the next call.
+    l_reg[LRegs::LReg3] = rng_state;
+}
+
+// Seeds the dropout generator state held in LReg3.
+//
+// The caller-supplied 16-bit seed `p2` is mixed with values read from this core's NOC node-id
+// register and, per lane, with vConstTileId - presumably so that each core and lane starts
+// from a different state.
+inline void _init_dropout_seed_(uint16_t p2){
+    FWLOG1("calculate_dropout() -- input seed:%x", p2);
+
+    // Read the NOC node-id register and split it into two fields (named as x / y coordinates).
+    uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID);
+
+    uint16_t my_x = noc_id_reg & NOC_NODE_ID_MASK;
+    uint16_t my_y = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK;
+
+    // Per-core seed: input seed XOR (x shifted left by y), truncated to 16 bits by the assignment.
+    uint16_t per_tensix_input_seed = p2 ^ (my_x << my_y);
+
+    FWLOG1("calculate_dropout() -- calculated seed:%x", per_tensix_input_seed);
+
+    // NOTE(review): the value read from LReg3 here is overwritten below before it is used;
+    // presumably the read exists to associate `result` with LReg3 - confirm.
+    vInt result = l_reg[LRegs::LReg3];
+
+    // Per-lane component: vConstTileId shifted left by 10.
+    vInt tmp = vConstTileId << 10;
+    vInt ptis = per_tensix_input_seed;
+    // ~(a & b) & (a | b) is bitwise XOR of a and b, written with AND / OR / NOT only.
+    result = ~(tmp & ptis) & (tmp | ptis);
+
+    l_reg[LRegs::LReg3] = result;
+}
+
+// Prepares the programmable constants read by _calculate_dropout_ and seeds the generator.
+//
+// Parameters:
+//   seed - passed to _init_dropout_seed_, whose parameter is uint16_t (so the value is narrowed).
+inline void _init_dropout_(const uint seed)
+{
+    // XORed into the generator state by _calculate_dropout_ when the extracted bit is set.
+    vConstIntPrgm0 = 0xB400;
+    // Single-bit mask (0b1) used to extract the LSB of the generator state.
+    vConstIntPrgm1 = 0x1;
+
+    _init_dropout_seed_(seed);
+}
+
+} // namespace sfpu
+} // namespace ckernel