Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

split ops in ckernel_sfpu into individual files #11

Merged
merged 3 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,929 changes: 35 additions & 1,894 deletions common/inc/ckernel_sfpu.h

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_abs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _calculate_abs_(const int iterations)
{
// SFPU microcode
for (int d = 0; d < iterations; d++)
{
vFloat v = dst_reg[0];
dst_reg[0] = sfpi::abs(v);
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
41 changes: 41 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_add_int32.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// Adds two int32 operands per loop iteration using raw SFPU instruction macros
// and stores the int32 sum back, advancing dst_reg once per iteration.
//
// dst_offset : scaled by 64 and passed as the last argument of the operand-B
//              load (presumably the offset of operand B relative to operand A
//              in the destination register — TODO confirm units).
// iterations : NOTE(review): never read in this body; the loop bound is the
//              ITERATIONS template parameter. Confirm this is intended, since
//              sibling kernels in this change loop on the runtime argument.
//
// APPROXIMATION_MODE is not referenced in this body.
template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _add_int32_(const int iterations, const uint dst_offset) {
// Operand A is input1 (int32)
// Operand B is input2 (int32)
// Output is int32
#pragma GCC unroll 8
for (int d = 0; d < ITERATIONS; d++) {
// operand A - int32
TTI_SFPLOAD(0, 12, 3, 0);
// operand B - int32
// Issued with TT_ rather than TTI_, presumably because the last argument
// depends on the runtime dst_offset — TODO confirm.
TT_SFPLOAD(1, 12, 3, dst_offset * 64);
// Integer add of the two loaded operands.
TTI_SFPIADD(0, 1, 0, 4);
// MAD has a 2-cycle pipeline latency so we need one cycle latency until next instr can consume the result
// NOTE(review): the instruction issued above is SFPIADD, not MAD — confirm
// that this NOP is still required for the integer add.
TTI_NOP;
// LREG_0 -> dest as int32
TTI_SFPSTORE(0, 12, 3, 0);
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
36 changes: 36 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// Per its name, casts fp32 values to fp16a. Each iteration issues a raw
// load -> SFP_STOCH_RND -> store sequence and then advances dst_reg.
//
// iterations : number of loop iterations (one dst_reg advance each).
//
// APPROXIMATION_MODE and ITERATIONS are not referenced in this body.
template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _cast_fp32_to_fp16a_(const int iterations)
{
#pragma GCC unroll 8
for (int d = 0; d < iterations; d++)
{
// SFPI-level form of this operation, kept for reference; the raw
// instruction sequence below is what is actually issued.
//vFloat val = dst_reg[0];
//dst_reg[0] = float_to_fp16a(val, 0);
// NOTE(review): the load and store use different second arguments (0 vs 1),
// presumably selecting the source and destination data formats — TODO confirm
// against the instruction reference.
TTI_SFPLOAD(0, 0, 3, 0);
// Rounding step of the conversion; the meaning of the final argument (8) is
// not visible here — TODO confirm.
TTI_SFP_STOCH_RND(0,0,0,0,0,8);
TTI_SFPSTORE(0,1,3,0);
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
52 changes: 52 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_clamp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _calculate_clamp_(const int iterations, uint param0, uint param1, uint param2)
{
// All params are in FP16 format
// param0 = min
// param1 = max

//uint format = (param0 >> 16)&0x1;
s2vFloat16::Format format = s2vFloat16::fp16a;

// SFPU microcode
vFloat min = s2vFloat16(param0, format);
vFloat max = s2vFloat16(param1, format);
#pragma GCC unroll 0
for (int d = 0; d < iterations; d++)
{
vFloat val = dst_reg[0];

v_if (val < min) {
val = s2vFloat16(param0, format);
} v_elseif (val >= max) {
val = s2vFloat16(param1, format);
}
v_endif;

dst_reg[0] = val + s2vFloat16b(param2); // 12 bits

dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
98 changes: 98 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_comp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"
#include "ckernel_sfpu_is_fp16_zero.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// Writes `init` into flag1, and also into flag2 when `check` is true.
// When `check` is false, flag2 is left untouched.
sfpi_inline void _calculate_comp_init_flag_(bool check, vFloat& flag1, vFloat& flag2, float init)
{
    flag1 = init;

    // Only the first flag is needed unless a second check was requested.
    if (!check) {
        return;
    }

    flag2 = init;
}

template <bool APPROXIMATION_MODE, bool invert_output, bool check_zero, bool second_check, bool is_less_than_equal_zero, int ITERATIONS>
inline void _calculate_comp_(const int iterations, uint exponent_size_8)
{

// output_0 and output_1 hold the outputs use use when a zero or negative check is true/false.
// False = 0.0 = kCONST_0 (5/8-bit exponent format)
// True = 1.0 = kCONST_1_FP16B (8-bit exponent format)
// SFPU uses 8-bit exponent in operations so loading these constants in 8-bit exponent format.
// Although a command flag can tell SFPU to re-bias a 5-bit exponent to 8-bit, we are loading 8-bit
// exponent and telling SFPU to not add any bias to these constants.
constexpr float output_0 = invert_output ? 0.0f : 1.0f;
constexpr float output_1 = invert_output ? 1.0f : 0.0f;

for (int d = 0; d < iterations; d++)
{
vFloat v = dst_reg[0];
vFloat flag1, flag2;
if constexpr(check_zero)
{
v_if (_sfpu_is_fp16_zero_(v, exponent_size_8)) {
_calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
} v_else {
_calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
}
v_endif;
}
else
{
v_if (v < 0.0F) {
_calculate_comp_init_flag_(second_check, flag1, flag2, output_0);
} v_else {
_calculate_comp_init_flag_(second_check, flag1, flag2, output_1);
}
v_endif;
}

vFloat result;
if constexpr (second_check)
{
// less_than_equal_zero
// flag1 = 0x3F80(1.0) if DST < 0 else 0
// flag2 = 0x3F80(1.0) if DST == 0 else 0
// Do a bitwise Or (flag1 | flag2) to get <= condition.
// flag1 < 0 OR flag2 == 0 => DST is Less than or Equal to zero.
// Result will be either 0x0000(0.0) or 0x3F80(1.0)
if constexpr (is_less_than_equal_zero) {
result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) | reinterpret<vUInt>(flag2));
}
else
{
// greater_than_zero
// flag1 = 0x3F80(1.0) if DST >= 0 else 0
// flag2 = 0x3F80(1.0) if DST != 0 else 0
// Do a bitwise And (flag1 & flag2) to get > condition.
// flag2 >= 0 AND flag1 != 0 => DST is Greater than zero
// Result will be either 0x0000(0.0) or 0x3F80(1.0)
result = reinterpret<vFloat>(reinterpret<vUInt>(flag1) & reinterpret<vUInt>(flag2));
}
} else {
result = flag1;
}

dst_reg[0] = result;

dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
93 changes: 93 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_dropout.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel_defs.h"
#include "ckernel.h"
#include "noc_nonblocking_api.h"

#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _calculate_dropout_(const int iterations, uint prob, uint scale)
{
// SFPU microcode

FWLOG1("calculate_dropout() -- prob:%x", prob);
FWLOG1("calculate_dropout() -- scale:%x", scale);

vUInt rand = l_reg[LRegs::LReg3];

#pragma GCC unroll 0
for (int d = 0; d < iterations; d++) {
////////////////////////
// Scale samples
///////////////////////
dst_reg[0] = dst_reg[0] * s2vFloat16b(scale);

////////////////////////
// Drop samples
///////////////////////
v_if (rand < prob) {
dst_reg[0] = vConst0;
}
v_endif;

////////////////////////
// 16-bit PRNG update
///////////////////////
vUInt lfsr = vConstIntPrgm1;
vUInt tmp = lfsr & rand;
rand = rand >> 1;
v_if (tmp != 0) {
vUInt mask = vConstIntPrgm0;
rand ^= mask;
}
v_endif;

dst_reg++;
}

l_reg[LRegs::LReg3] = rand;
}

// Seeds the dropout pseudo-random state held in LReg3.
//
// p2 : caller-supplied 16-bit seed. It is mixed with this node's NOC
//      coordinates, and the result is then combined per lane with
//      (vConstTileId << 10).
inline void _init_dropout_seed_(uint16_t p2){
    FWLOG1("calculate_dropout() -- input seed:%x", p2);

    // Extract the two coordinate fields from the NOC node-id register.
    uint32_t node_id = NOC_CMD_BUF_READ_REG(0, 0, NOC_NODE_ID);
    uint16_t node_x = node_id & NOC_NODE_ID_MASK;
    uint16_t node_y = (node_id >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK;

    // Perturb the input seed with the coordinates.
    uint16_t node_seed = p2 ^ (node_x << node_y);

    FWLOG1("calculate_dropout() -- calculated seed:%x", node_seed);

    // The state variable is first loaded from LReg3 and then overwritten below.
    vInt state = l_reg[LRegs::LReg3];

    vInt tile_bits = vConstTileId << 10;
    vInt seed_lanes = node_seed;

    // ~(a & b) & (a | b) is a bitwise XOR of a and b.
    state = ~(tile_bits & seed_lanes) & (tile_bits | seed_lanes);

    l_reg[LRegs::LReg3] = state;
}

// Prepares dropout: programs the two programmable integer constants that
// _calculate_dropout_ reads for its PRNG update, then seeds the PRNG state.
//
// seed : forwarded to _init_dropout_seed_, whose parameter is uint16_t, so the
//        value is narrowed to 16 bits at the call.
inline void _init_dropout_(const uint seed)
{
// Value XORed into the PRNG state by _calculate_dropout_ after the right shift
// (presumably the feedback taps of the 16-bit LFSR — TODO confirm).
vConstIntPrgm0 = 0xb400;
vConstIntPrgm1 = 0x1; // binary 0b1 - used to extract LSB

_init_dropout_seed_(seed);
}

} // namespace sfpu
} // namespace ckernel
Loading
Loading