-
Notifications
You must be signed in to change notification settings - Fork 91
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add transpose WH sharded, generalize row major permute when N > 4, and do a minor refactor of ttnn::permute #15881
Changes from all commits
300fe91
b21dc6f
a833cb9
cabfcc9
7b7b12f
21d29ca
ce68210
7f40b41
d547a86
1cf08a9
bac01ce
12858fb
1baf18e
04c8e84
5e23f4b
8ac1b8d
d68e2ac
9f670c5
0401154
01fa552
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -137,4 +137,43 @@ template <uint32_t a, uint32_t b> | |
FORCE_INLINE constexpr uint32_t round_up() { | ||
return b * div_up<a, b>(); | ||
} | ||
|
||
// Swaps array[i] and array[j] in place in a fixed-size uint32_t array.
// N is deduced from the array reference passed in.
// No bounds checking is done on i or j; presumably callers guarantee i, j < N (confirm at call sites).
// A hand-written swap is used here; per the review thread inside this block, std support is not available in kernels.
template <size_t N> | ||
FORCE_INLINE void swap_elements(uint32_t (&array)[N], size_t i, size_t j) { | ||
// Perform the swap | ||
uint32_t temp = array[i]; | ||
array[i] = array[j]; | ||
array[j] = temp; | ||
Comment on lines
+145
to
+147
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is std::swap available in kernels? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't have std support in kernels unfortunately. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ya that would blow up the kernel program size a d not fit |
||
} | ||
|
||
// 2D transpose helper intended for debug use in reader/writer kernels.
// Copies an X-by-W block of elements from input_l1_addr to output_l1_addr, writing it as W-by-X.
//   input_l1_addr    - address of the input buffer (accessed through a tt_l1_ptr byte pointer)
//   output_l1_addr   - address of the output buffer (accessed through a tt_l1_ptr byte pointer)
//   X                - number of input rows (outer input dimension)
//   W                - number of elements per input row (inner input dimension)
//   element_size     - size of one element in bytes
//   input_page_size  - byte stride between consecutive input rows
//   output_page_size - byte stride between consecutive output rows
// No overlap or bounds checks are performed; presumably the two buffers are distinct (confirm at call sites).
FORCE_INLINE void transpose_2d( | ||
uint32_t input_l1_addr, | ||
uint32_t output_l1_addr, | ||
uint32_t X, | ||
uint32_t W, | ||
uint32_t element_size, | ||
uint32_t input_page_size, | ||
uint32_t output_page_size) { | ||
volatile tt_l1_ptr uint8_t* input_ptr = reinterpret_cast<volatile tt_l1_ptr uint8_t*>(input_l1_addr); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there anywhere I can read about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tt_metal/hw/inc/risc_attribs.h #define tt_l1_ptr attribute((rvtt_l1_ptr)) Seems to be a wrapper around some riscv hw ptr concept |
||
volatile tt_l1_ptr uint8_t* output_ptr = reinterpret_cast<volatile tt_l1_ptr uint8_t*>(output_l1_addr); | ||
// transpose from XW, where X is outer and W inner, to WX, where W is outer and X is inner | ||
// each element is element_size bytes | ||
// each row is W elements, and each row is separated by input_page_size bytes | ||
// each output row is X elements, and each row is separated by output_page_size bytes | ||
|
||
for (uint32_t x = 0; x < X; ++x) { | ||
for (uint32_t w = 0; w < W; ++w) { | ||
// Compute the input and output addresses | ||
// (byte offsets relative to input_ptr / output_ptr, not absolute addresses)
uint32_t input_addr = x * input_page_size + w * element_size; | ||
uint32_t output_addr = w * output_page_size + x * element_size; | ||
// Copy the element - do we have memcpy? use this for now | ||
// Byte-wise copy, so any element_size is handled without assuming a wider access width.
for (uint32_t i = 0; i < element_size; ++i) { | ||
output_ptr[output_addr + i] = input_ptr[input_addr + i]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
} // namespace tt::data_movement::common |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <cstdint> | ||
|
||
#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" | ||
#include "compute_kernel_api/transpose_wh.h" | ||
#include "compute_kernel_api/tilize.h" | ||
#include "compute_kernel_api/untilize.h" | ||
#include "compute_kernel_api/pack_untilize.h" | ||
|
||
// Compute kernel entry point.
// For each of num_blocks iterations it:
//   1. waits for x_block_size entries in cb_in and tilizes them into one entry of cb_tilize,
//   2. transposes that tile with transpose_wh_tile,
//   3. pack-untilizes the result into w_block_size entries of cb_out.
// Compile-time args: 0 = x_block_size, 1 = w_block_size.
// Runtime args:      0 = num_blocks.
// Circular buffers:  c_0 = input, c_1 = intermediate tilized tile, c_2 = output.
namespace NAMESPACE { | ||
void MAIN { | ||
constexpr uint32_t x_block_size = get_compile_time_arg_val(0); | ||
constexpr uint32_t w_block_size = get_compile_time_arg_val(1); | ||
|
||
uint32_t num_blocks = get_arg_val<uint32_t>(0); | ||
|
||
constexpr auto cb_in = tt::CBIndex::c_0; | ||
constexpr auto cb_tilize = tt::CBIndex::c_1; | ||
constexpr auto cb_out = tt::CBIndex::c_2; | ||
|
||
unary_op_init_common(cb_in, cb_out); | ||
|
||
for (uint32_t n = 0; n < num_blocks; n++) { | ||
// tilize input via unpack and then pack | ||
// The tilize init is re-issued every iteration and paired with tilize_uninit further down.
tilize_init_short(cb_in, 1); | ||
|
||
cb_wait_front(cb_in, x_block_size); | ||
cb_reserve_back(cb_tilize, 1); | ||
|
||
tilize_block(cb_in, 1, cb_tilize); // tilize and pack into cb_tilize | ||
|
||
// NOTE(review): an earlier symptom (garbage tile slice after tilize_block on the second iteration)
// was traced, per the PR discussion, to pack_untilize_uninit defaulting to cb_id = 16.
// This kernel passes cb_out explicitly to pack_untilize_uninit at the end of the loop body.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reading these comments - looks like we got many lines of workarounds. is it the case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's just the one global init workaround that we have atm, but I wrote a lot of comments documenting the issue so it's not lost to history There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And @rdjogoTT found the issue and all these comments are no longer needed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What was the issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pack_untilize_uninit is hard coded to a default cb_id = 16, which was cb_out0 before. Since the recommendation is now to use cb_ids in sequential order what worked with the old cb_id=16 does not work with the cb_id that I had. There's a clean up item that they're working on that removes the defaults since they're misleading. |
||
cb_push_back(cb_tilize, 1); | ||
cb_pop_front(cb_in, x_block_size); | ||
|
||
tilize_uninit(cb_in); | ||
|
||
// transpose input | ||
cb_wait_front(cb_tilize, 1); | ||
transpose_wh_init_short(cb_tilize); | ||
pack_untilize_dst_init_short<1>(cb_out); | ||
|
||
tile_regs_acquire(); | ||
transpose_wh_tile(cb_tilize, 0, 0); // transpose call | ||
tile_regs_commit(); | ||
|
||
// pack and untilize | ||
cb_reserve_back(cb_out, w_block_size); | ||
|
||
tile_regs_wait(); | ||
pack_untilize_dst<1>(cb_out); // pack call | ||
tile_regs_release(); | ||
|
||
cb_push_back(cb_out, w_block_size); | ||
|
||
cb_wait_front(cb_out, w_block_size); | ||
pack_untilize_uninit(cb_out); | ||
|
||
cb_pop_front(cb_tilize, 1); | ||
} | ||
} | ||
} // namespace NAMESPACE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just curious, why is
tt_input_tensor
not sharded, but the output is? Would there be any practical difference if the input were sharded and the output configuration were taken from the input?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the tt_input_tensor that goes into transpose is sharded. Setting the memory_config=sharded_mem_config shouldn't make a difference here since it doesn't match the output.