Skip to content

Commit

Permalink
#8837: Add 256B nt memcpy tests to test_pull_from_pcie
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-aho committed May 25, 2024
1 parent 5d683ba commit 4170c3d
Showing 1 changed file with 147 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,31 +43,72 @@ void *align(void *ptr, std::size_t max_alignment) {
return reinterpret_cast<void *>(aligned);
}

#define CACHE_LINE_SIZE 64

void nt_memcpy(uint8_t *__restrict dst, const uint8_t * __restrict src, size_t n)
{
size_t num_lines = n / CACHE_LINE_SIZE;
#define INNER_LOOP 8

template <bool stream_load, bool aligned_load>
void nt_memcpy_128b(uint8_t *__restrict dst, const uint8_t *__restrict src, size_t n) {
size_t num_lines = n / (INNER_LOOP * sizeof(__m128i));
constexpr size_t inner_blk_size = INNER_LOOP * sizeof(__m128i);
size_t i;
for (i = 0; i < num_lines; i++) {
size_t j;
for (j = 0; j < CACHE_LINE_SIZE / sizeof(__m128i); j++) {
__m128i blk = _mm_stream_load_si128((__m128i *)src);
for (j = 0; j < INNER_LOOP; j++) {
__m128i blk;
if constexpr (stream_load) {
blk = _mm_stream_load_si128((__m128i *)src);
} else {
if constexpr (aligned_load) {
blk = _mm_load_si128((__m128i *)src);
} else {
blk = _mm_loadu_si128((__m128i *)src);
}
}
/* non-temporal store */
_mm_stream_si128((__m128i *)dst, blk);

src += sizeof(__m128i);
dst += sizeof(__m128i);
}
n -= CACHE_LINE_SIZE;
n -= inner_blk_size;
}

if (num_lines > 0)
_mm_sfence();
}

template <bool stream_load, bool aligned_load>
void nt_memcpy_256b(uint8_t *__restrict dst, const uint8_t *__restrict src, size_t n) {
size_t num_lines = n / (INNER_LOOP * sizeof(__m256i));
constexpr size_t inner_blk_size = INNER_LOOP * sizeof(__m256i);
size_t i;
for (i = 0; i < num_lines; i++) {
size_t j;
for (j = 0; j < INNER_LOOP; j++) {
__m256i blk;
if constexpr (stream_load) {
static_assert(aligned_load);
blk = _mm256_stream_load_si256((__m256i *)src);
} else {
if constexpr (aligned_load) {
blk = _mm256_load_si256((__m256i *)src);
} else {
blk = _mm256_loadu_si256((__m256i *)src);
}
}
/* non-temporal store */
_mm256_stream_si256((__m256i *)dst, blk);

src += sizeof(__m256i);
dst += sizeof(__m256i);
}
n -= inner_blk_size;
}

int main(int argc, char** argv) {
if (num_lines > 0)
_mm_sfence();
}

int main(int argc, char **argv) {
bool pass = true;
std::vector<double> h2d_bandwidth;
uint32_t num_tests = 10;
Expand All @@ -77,23 +118,39 @@ int main(int argc, char** argv) {
bool simulate_write_ptr_update = false;
uint32_t write_ptr_readback_interval = 0;
uint32_t copy_mode = 0;
std::size_t addr_align = MEMCPY_ALIGNMENT;
constexpr uint32_t memcpy_alignment = sizeof(__m256i);
std::size_t addr_align = memcpy_alignment;

try {
// Input arguments parsing
std::vector<std::string> input_args(argv, argv + argc);

if (test_args::has_command_option(input_args, "-h") ||
test_args::has_command_option(input_args, "--help")) {
if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) {
log_info(LogTest, "Usage:");
log_info(LogTest, " --num-tests: number of iterations");
log_info(LogTest, " --total-transfer-size: total size to copy to hugepage in bytes (default {} B)", 512 * 1024 * 1024);
log_info(
LogTest,
" --total-transfer-size: total size to copy to hugepage in bytes (default {} B)",
512 * 1024 * 1024);
log_info(LogTest, " --transfer-size: size of one write to hugepage (default {} B)", 64 * 1024);
log_info(LogTest, " --enable-kernel-read: whether to run a kernel that reads from PCIe (default false)");
log_info(LogTest, " --simulate-wr-ptr-update: whether host writes to reg address at 32KB intervals (default false)");
log_info(LogTest, " --wr-ptr-rdbk-interval: after this many num writes to reg address, do readback (default 0 means no readbacks)");
log_info(LogTest, " --copy-mode: method used to write to pcie. 0: memcpy, 1: 4 byte writes, 2: nt_memcpy (uncached writes + 16B stores), 3: memcpy_to_device (uncached writes + unaligned 16B stores)");
log_info(LogTest, " --addr-align: Alignment of start of data. Must be a power of 2 (default {} B)", MEMCPY_ALIGNMENT);
log_info(
LogTest,
" --simulate-wr-ptr-update: whether host writes to reg address at 32KB intervals (default false)");
log_info(
LogTest,
" --wr-ptr-rdbk-interval: after this many num writes to reg address, do readback (default 0 means no "
"readbacks)");
log_info(
LogTest,
" --copy-mode: method used to write to pcie. 0: memcpy, 1: 4 byte writes, 2: nt_memcpy (16B streaming "
"loads + stores), 3: nt_memcpy (16B aligned loads + streaming stores), 4: nt_memcpy (16B unaligned "
"loads + streaming stores), 5: nt_memcpy (32B streaming loads + stores), 6: nt_memcpy (32B aligned "
"loads + streaming stores), 7: nt_memcpy (32B unaligned loads + streaming stores) 8: memcpy_to_device");
log_info(
LogTest,
" --addr-align: Alignment of start of data. Must be a power of 2 (default {} B)",
memcpy_alignment);
exit(0);
}

Expand All @@ -104,45 +161,64 @@ int main(int argc, char** argv) {
std::tie(total_transfer_size, input_args) = test_args::get_command_option_uint32_and_remaining_args(
input_args, "--total-transfer-size", 512 * 1024 * 1024);

std::tie(transfer_size, input_args) = test_args::get_command_option_uint32_and_remaining_args(
input_args, "--transfer-size", 64 * 1024);
std::tie(transfer_size, input_args) =
test_args::get_command_option_uint32_and_remaining_args(input_args, "--transfer-size", 64 * 1024);

std::tie(enable_kernel_read, input_args) =
test_args::has_command_option_and_remaining_args(input_args, "--enable-kernel-read");

std::tie(simulate_write_ptr_update, input_args) =
test_args::has_command_option_and_remaining_args(input_args, "--simulate-wr-ptr-update");

std::tie(write_ptr_readback_interval, input_args) = test_args::get_command_option_uint32_and_remaining_args(
input_args, "--wr-ptr-rdbk-interval", 0);
std::tie(write_ptr_readback_interval, input_args) =
test_args::get_command_option_uint32_and_remaining_args(input_args, "--wr-ptr-rdbk-interval", 0);

std::tie(copy_mode, input_args) = test_args::get_command_option_uint32_and_remaining_args(
input_args, "--copy-mode", 0);
std::tie(copy_mode, input_args) =
test_args::get_command_option_uint32_and_remaining_args(input_args, "--copy-mode", 0);

std::tie(addr_align, input_args) = test_args::get_command_option_uint32_and_remaining_args(
input_args, "--addr-align", MEMCPY_ALIGNMENT);
std::tie(addr_align, input_args) =
test_args::get_command_option_uint32_and_remaining_args(input_args, "--addr-align", memcpy_alignment);

test_args::validate_remaining_args(input_args);
} catch (const std::exception& e) {
} catch (const std::exception &e) {
log_error(tt::LogTest, "Command line arguments found exception", e.what());
}
TT_ASSERT((addr_align >= 4 && (addr_align & (addr_align - 1)) == 0), "Address alignment must be a power of 2 >= 4");
TT_ASSERT(copy_mode <= 3, "Invalid --copy-mode arg! Only four modes to copy data data from host into hugepages support! memcpy, 4 byte writes, nt_copy, and memcpy_to_device");
if (copy_mode == 2) {
TT_ASSERT(addr_align % 16 == 0, "Address alignment must be a multiple of 16 when using nt_memcpy");
TT_ASSERT(transfer_size % 64 == 0, "Each copy to hugepage must be mod64==0 when using nt_memcpy");
TT_ASSERT(
(addr_align >= 4 && (addr_align & (addr_align - 1)) == 0), "Address alignment must be a power of 2 >= 4");
TT_ASSERT(
copy_mode <= 8,
"Invalid --copy-mode arg! Only eight modes to copy data data from host into hugepages support!");
if (copy_mode >= 2 && copy_mode <= 7) {
if (copy_mode == 2 || copy_mode == 3) {
TT_ASSERT(
addr_align % sizeof(__m128) == 0,
"Address alignment must be a multiple of 16 when using nt_memcpy");
} else if (copy_mode == 5 || copy_mode == 6) {
TT_ASSERT(
addr_align % sizeof(__m256) == 0,
"Address alignment must be a multiple of 32 when using nt_memcpy");
}
if (copy_mode >= 2 && copy_mode <= 4) {
TT_ASSERT(
transfer_size % (INNER_LOOP * sizeof(__m128)) == 0,
"Each copy to hugepage must be mod32==0 when using nt_memcpy");
} else if (copy_mode >= 5 && copy_mode <= 7) {
TT_ASSERT(
transfer_size % (INNER_LOOP * sizeof(__m256)) == 0,
"Each copy to hugepage must be mod64==0 when using nt_memcpy");
}
}

// Device setup
int device_id = 0;
tt_metal::Device* device = tt_metal::CreateDevice(device_id);
tt_metal::Device *device = tt_metal::CreateDevice(device_id);
CoreCoord logical_core(0, 0);
CoreCoord physical_core = device->worker_core_from_logical_core(logical_core);

chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
TT_ASSERT(device_id == mmio_device_id, "This test can only be run on MMIO device!");
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
void* host_hugepage_start = (void*) tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel);
void *host_hugepage_start = (void *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel);
uint32_t hugepage_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t host_write_ptr = 0;

Expand All @@ -165,16 +241,15 @@ int main(int argc, char** argv) {
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_1,
.noc = tt_metal::NOC::NOC_0,
.compile_args = {host_write_ptr, hugepage_size, kernel_read_size}
});
.compile_args = {host_write_ptr, hugepage_size, kernel_read_size}});

// Add 2 * alignment so that we have enough space when aligning the ptr
// First add is for aligning to next aligned addr
// Second add is for making sure the specified alignment is the max alignment
std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
total_transfer_size + 2 * addr_align, 1000, std::chrono::system_clock::now().time_since_epoch().count());

uint32_t * start_ptr = (uint32_t *)align(src_vec.data(), addr_align);
uint32_t *start_ptr = (uint32_t *)align(src_vec.data(), addr_align);
std::vector<uint32_t> result_vec;

const std::string copy_mode_str = copy_mode == 0 ? "memcpy" : copy_mode == 1 ? "4 byte writes" : "nt_memcpy";
Expand All @@ -188,12 +263,17 @@ int main(int argc, char** argv) {
"simulate_write_ptr_update={} "
"write_ptr_readback_interval={} "
"copy_mode={} ",
total_transfer_size, transfer_size, enable_kernel_read, simulate_write_ptr_update, write_ptr_readback_interval, copy_mode_str);
total_transfer_size,
transfer_size,
enable_kernel_read,
simulate_write_ptr_update,
write_ptr_readback_interval,
copy_mode_str);

log_info(LogTest, "Num tests {}", num_tests);
for (uint32_t i = 0; i < num_tests; ++i) {
// Execute application
std::thread t1 ([&]() {
std::thread t1([&]() {
if (enable_kernel_read) {
tt::tt_metal::detail::LaunchProgram(device, program);
}
Expand All @@ -210,13 +290,12 @@ int main(int argc, char** argv) {
}
uint32_t write_size_bytes = std::min((uint32_t)space_available, transfer_size);
write_size_bytes = std::min(write_size_bytes, (total_transfer_size - data_written_bytes));
uint8_t* host_mem_ptr = (uint8_t *)host_hugepage_start + host_write_ptr;
uint8_t *host_mem_ptr = (uint8_t *)host_hugepage_start + host_write_ptr;
uint32_t src_data_offset = data_written_bytes / sizeof(uint32_t);

if (copy_mode == 0) {
memcpy(host_mem_ptr, start_ptr + src_data_offset, write_size_bytes);
} else if (copy_mode == 1) {

uint32_t *host_mem_ptr4B = (uint32_t *)host_mem_ptr;
uint32_t write_size_words = write_size_bytes / sizeof(uint32_t);

Expand All @@ -228,8 +307,29 @@ int main(int argc, char** argv) {

} else if (copy_mode == 2) {
TT_ASSERT(host_write_ptr % 16 == 0 and data_written_bytes % 16 == 0);
nt_memcpy(host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
nt_memcpy_128b<true, true>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 3) {
TT_ASSERT(host_write_ptr % 16 == 0 and data_written_bytes % 16 == 0);
nt_memcpy_128b<false, true>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 4) {
TT_ASSERT(host_write_ptr % 16 == 0);
nt_memcpy_128b<false, false>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 5) {
TT_ASSERT(host_write_ptr % 32 == 0 and data_written_bytes % 32 == 0);
nt_memcpy_256b<true, true>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 6) {
TT_ASSERT(host_write_ptr % 32 == 0 and data_written_bytes % 32 == 0);
nt_memcpy_256b<false, true>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 7) {
TT_ASSERT(host_write_ptr % 32 == 0);
nt_memcpy_256b<false, false>(
host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
} else if (copy_mode == 8) {
TT_ASSERT(host_write_ptr % 16 == 0);
memcpy_to_device<true>(host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
}
Expand All @@ -239,7 +339,8 @@ int main(int argc, char** argv) {
if (simulate_write_ptr_update) {
uint32_t num_write_ptr_updates = write_size_bytes / (32 * 1024);
for (int i = 0; i < num_write_ptr_updates; i++) {
tt::Cluster::instance().write_reg(&val_to_write, tt_cxy_pair(device->id(), physical_core), reg_addr);
tt::Cluster::instance().write_reg(
&val_to_write, tt_cxy_pair(device->id(), physical_core), reg_addr);
reg_addr += sizeof(uint32_t);
num_reg_writes = (reg_addr - dispatch_constants::PREFETCH_Q_BASE) / sizeof(uint32_t);
if (num_reg_writes == num_reg_entries) {
Expand All @@ -250,7 +351,8 @@ int main(int argc, char** argv) {

if (write_ptr_readback_interval > 0 and num_reg_writes == write_ptr_readback_interval) {
std::vector<std::uint32_t> read_hex_vec(1, 0);
tt::Cluster::instance().read_core(read_hex_vec.data(), sizeof(uint32_t), tt_cxy_pair(device->id(), physical_core), reg_addr);
tt::Cluster::instance().read_core(
read_hex_vec.data(), sizeof(uint32_t), tt_cxy_pair(device->id(), physical_core), reg_addr);
}

host_write_ptr += write_size_bytes;
Expand All @@ -264,15 +366,11 @@ int main(int argc, char** argv) {

auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
h2d_bandwidth.push_back((total_transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));
log_info(
LogTest,
"H2D BW: {:.3f}ms, {:.3f}GB/s",
elapsed_us / 1000.0,
h2d_bandwidth[i]);
log_info(LogTest, "H2D BW: {:.3f}ms, {:.3f}GB/s", elapsed_us / 1000.0, h2d_bandwidth[i]);
}

pass &= tt_metal::CloseDevice(device);
} catch (const std::exception& e) {
} catch (const std::exception &e) {
pass = false;
log_error(LogTest, "{}", e.what());
log_error(LogTest, "System error message: {}", std::strerror(errno));
Expand Down

0 comments on commit 4170c3d

Please sign in to comment.