From 4170c3dc55c6b182a787820a3472dd0472b05a05 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 24 May 2024 16:15:48 +0000 Subject: [PATCH] #8837: Add 256B nt memcpy tests to test_pull_from_pcie --- .../3_pcie_transfer/test_pull_from_pcie.cpp | 196 +++++++++++++----- 1 file changed, 147 insertions(+), 49 deletions(-) diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp index 4122ab9a1dc..8e6161eb0f2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp @@ -43,31 +43,72 @@ void *align(void *ptr, std::size_t max_alignment) { return reinterpret_cast(aligned); } -#define CACHE_LINE_SIZE 64 - -void nt_memcpy(uint8_t *__restrict dst, const uint8_t * __restrict src, size_t n) -{ - size_t num_lines = n / CACHE_LINE_SIZE; +#define INNER_LOOP 8 +template +void nt_memcpy_128b(uint8_t *__restrict dst, const uint8_t *__restrict src, size_t n) { + size_t num_lines = n / (INNER_LOOP * sizeof(__m128i)); + constexpr size_t inner_blk_size = INNER_LOOP * sizeof(__m128i); size_t i; for (i = 0; i < num_lines; i++) { size_t j; - for (j = 0; j < CACHE_LINE_SIZE / sizeof(__m128i); j++) { - __m128i blk = _mm_stream_load_si128((__m128i *)src); + for (j = 0; j < INNER_LOOP; j++) { + __m128i blk; + if constexpr (stream_load) { + blk = _mm_stream_load_si128((__m128i *)src); + } else { + if constexpr (aligned_load) { + blk = _mm_load_si128((__m128i *)src); + } else { + blk = _mm_loadu_si128((__m128i *)src); + } + } /* non-temporal store */ _mm_stream_si128((__m128i *)dst, blk); + src += sizeof(__m128i); dst += sizeof(__m128i); } - n -= CACHE_LINE_SIZE; + n -= inner_blk_size; } if (num_lines > 0) _mm_sfence(); } +template +void nt_memcpy_256b(uint8_t *__restrict dst, const uint8_t *__restrict src, size_t n) { + size_t num_lines = n / (INNER_LOOP * sizeof(__m256i)); + constexpr size_t inner_blk_size = INNER_LOOP * sizeof(__m256i); + size_t i; + for (i = 0; i < num_lines; i++) { + size_t j; + for (j = 0; j < INNER_LOOP; j++) { + __m256i blk; + if constexpr (stream_load) { + static_assert(aligned_load); + blk = _mm256_stream_load_si256((__m256i *)src); + } else { + if constexpr (aligned_load) { + blk = _mm256_load_si256((__m256i *)src); + } else { + blk = _mm256_loadu_si256((__m256i *)src); + } + } + /* non-temporal store */ + _mm256_stream_si256((__m256i *)dst, blk); + + src += sizeof(__m256i); + dst += sizeof(__m256i); + } + n -= inner_blk_size; + } -int main(int argc, char** argv) { + if (num_lines > 0) + _mm_sfence(); +} + +int main(int argc, char **argv) { bool pass = true; std::vector h2d_bandwidth; uint32_t num_tests = 10; @@ -77,23 +118,39 @@ int main(int argc, char** argv) { bool simulate_write_ptr_update = false; uint32_t write_ptr_readback_interval = 0; uint32_t copy_mode = 0; - std::size_t addr_align = MEMCPY_ALIGNMENT; + constexpr uint32_t memcpy_alignment = sizeof(__m256i); + std::size_t addr_align = memcpy_alignment; try { // Input arguments parsing std::vector input_args(argv, argv + argc); - if (test_args::has_command_option(input_args, "-h") || - test_args::has_command_option(input_args, "--help")) { + if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { log_info(LogTest, "Usage:"); log_info(LogTest, " --num-tests: number of iterations"); - log_info(LogTest, " --total-transfer-size: total size to copy to hugepage in bytes (default {} B)", 512 * 1024 * 1024); + log_info( + LogTest, + " --total-transfer-size: total size to copy to hugepage in bytes (default {} B)", + 512 * 1024 * 1024); log_info(LogTest, " --transfer-size: size of one write to hugepage (default {} B)", 64 * 1024); log_info(LogTest, " --enable-kernel-read: whether to run a kernel that reads from PCIe (default false)"); - log_info(LogTest, " --simulate-wr-ptr-update: whether host writes to reg address at 32KB intervals (default false)"); - log_info(LogTest, " --wr-ptr-rdbk-interval: after this many num writes to reg address, do readback (default 0 means no readbacks)"); - log_info(LogTest, " --copy-mode: method used to write to pcie. 0: memcpy, 1: 4 byte writes, 2: nt_memcpy (uncached writes + 16B stores), 3: memcpy_to_device (uncached writes + unaligned 16B stores)"); - log_info(LogTest, " --addr-align: Alignment of start of data. Must be a power of 2 (default {} B)", MEMCPY_ALIGNMENT); + log_info( + LogTest, + " --simulate-wr-ptr-update: whether host writes to reg address at 32KB intervals (default false)"); + log_info( + LogTest, + " --wr-ptr-rdbk-interval: after this many num writes to reg address, do readback (default 0 means no " + "readbacks)"); + log_info( + LogTest, + " --copy-mode: method used to write to pcie. 0: memcpy, 1: 4 byte writes, 2: nt_memcpy (16B streaming " + "loads + stores), 3: nt_memcpy (16B aligned loads + streaming stores), 4: nt_memcpy (16B unaligned " + "loads + streaming stores), 5: nt_memcpy (32B streaming loads + stores), 6: nt_memcpy (32B aligned " + "loads + streaming stores), 7: nt_memcpy (32B unaligned loads + streaming stores) 8: memcpy_to_device"); + log_info( + LogTest, + " --addr-align: Alignment of start of data. Must be a power of 2 (default {} B)", + memcpy_alignment); exit(0); } @@ -104,8 +161,8 @@ int main(int argc, char** argv) { std::tie(total_transfer_size, input_args) = test_args::get_command_option_uint32_and_remaining_args( input_args, "--total-transfer-size", 512 * 1024 * 1024); - std::tie(transfer_size, input_args) = test_args::get_command_option_uint32_and_remaining_args( - input_args, "--transfer-size", 64 * 1024); + std::tie(transfer_size, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--transfer-size", 64 * 1024); std::tie(enable_kernel_read, input_args) = test_args::has_command_option_and_remaining_args(input_args, "--enable-kernel-read"); @@ -113,36 +170,55 @@ int main(int argc, char** argv) { std::tie(simulate_write_ptr_update, input_args) = test_args::has_command_option_and_remaining_args(input_args, "--simulate-wr-ptr-update"); - std::tie(write_ptr_readback_interval, input_args) = test_args::get_command_option_uint32_and_remaining_args( - input_args, "--wr-ptr-rdbk-interval", 0); + std::tie(write_ptr_readback_interval, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--wr-ptr-rdbk-interval", 0); - std::tie(copy_mode, input_args) = test_args::get_command_option_uint32_and_remaining_args( - input_args, "--copy-mode", 0); + std::tie(copy_mode, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--copy-mode", 0); - std::tie(addr_align, input_args) = test_args::get_command_option_uint32_and_remaining_args( - input_args, "--addr-align", MEMCPY_ALIGNMENT); + std::tie(addr_align, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--addr-align", memcpy_alignment); test_args::validate_remaining_args(input_args); - } catch (const std::exception& e) { + } catch (const std::exception &e) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); } - TT_ASSERT((addr_align >= 4 && (addr_align & (addr_align - 1)) == 0), "Address alignment must be a power of 2 >= 4"); - TT_ASSERT(copy_mode <= 3, "Invalid --copy-mode arg! Only four modes to copy data data from host into hugepages support! memcpy, 4 byte writes, nt_copy, and memcpy_to_device"); - if (copy_mode == 2) { - TT_ASSERT(addr_align % 16 == 0, "Address alignment must be a multiple of 16 when using nt_memcpy"); - TT_ASSERT(transfer_size % 64 == 0, "Each copy to hugepage must be mod64==0 when using nt_memcpy"); + TT_ASSERT( + (addr_align >= 4 && (addr_align & (addr_align - 1)) == 0), "Address alignment must be a power of 2 >= 4"); + TT_ASSERT( + copy_mode <= 8, + "Invalid --copy-mode arg! Only eight modes to copy data data from host into hugepages support!"); + if (copy_mode >= 2 && copy_mode <= 7) { + if (copy_mode == 2 || copy_mode == 3) { + TT_ASSERT( + addr_align % sizeof(__m128) == 0, + "Address alignment must be a multiple of 16 when using nt_memcpy"); + } else if (copy_mode == 5 || copy_mode == 6) { + TT_ASSERT( + addr_align % sizeof(__m256) == 0, + "Address alignment must be a multiple of 32 when using nt_memcpy"); + } + if (copy_mode >= 2 && copy_mode <= 4) { + TT_ASSERT( + transfer_size % (INNER_LOOP * sizeof(__m128)) == 0, + "Each copy to hugepage must be mod32==0 when using nt_memcpy"); + } else if (copy_mode >= 5 && copy_mode <= 7) { + TT_ASSERT( + transfer_size % (INNER_LOOP * sizeof(__m256)) == 0, + "Each copy to hugepage must be mod64==0 when using nt_memcpy"); + } } // Device setup int device_id = 0; - tt_metal::Device* device = tt_metal::CreateDevice(device_id); + tt_metal::Device *device = tt_metal::CreateDevice(device_id); CoreCoord logical_core(0, 0); CoreCoord physical_core = device->worker_core_from_logical_core(logical_core); chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); TT_ASSERT(device_id == mmio_device_id, "This test can only be run on MMIO device!"); uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - void* host_hugepage_start = (void*) tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); + void *host_hugepage_start = (void *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); uint32_t hugepage_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel); uint32_t host_write_ptr = 0; @@ -165,8 +241,7 @@ int main(int argc, char** argv) { tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_0, - .compile_args = {host_write_ptr, hugepage_size, kernel_read_size} - }); + .compile_args = {host_write_ptr, hugepage_size, kernel_read_size}}); // Add 2 * alignment so that we have enough space when aligning the ptr // First add is for aligning to next aligned addr @@ -174,7 +249,7 @@ int main(int argc, char** argv) { std::vector src_vec = create_random_vector_of_bfloat16( total_transfer_size + 2 * addr_align, 1000, std::chrono::system_clock::now().time_since_epoch().count()); - uint32_t * start_ptr = (uint32_t *)align(src_vec.data(), addr_align); + uint32_t *start_ptr = (uint32_t *)align(src_vec.data(), addr_align); std::vector result_vec; const std::string copy_mode_str = copy_mode == 0 ? "memcpy" : copy_mode == 1 ? "4 byte writes" : "nt_memcpy"; @@ -188,12 +263,17 @@ int main(int argc, char** argv) { "simulate_write_ptr_update={} " "write_ptr_readback_interval={} " "copy_mode={} ", - total_transfer_size, transfer_size, enable_kernel_read, simulate_write_ptr_update, write_ptr_readback_interval, copy_mode_str); + total_transfer_size, + transfer_size, + enable_kernel_read, + simulate_write_ptr_update, + write_ptr_readback_interval, + copy_mode_str); log_info(LogTest, "Num tests {}", num_tests); for (uint32_t i = 0; i < num_tests; ++i) { // Execute application - std::thread t1 ([&]() { + std::thread t1([&]() { if (enable_kernel_read) { tt::tt_metal::detail::LaunchProgram(device, program); } @@ -210,13 +290,12 @@ int main(int argc, char** argv) { } uint32_t write_size_bytes = std::min((uint32_t)space_available, transfer_size); write_size_bytes = std::min(write_size_bytes, (total_transfer_size - data_written_bytes)); - uint8_t* host_mem_ptr = (uint8_t *)host_hugepage_start + host_write_ptr; + uint8_t *host_mem_ptr = (uint8_t *)host_hugepage_start + host_write_ptr; uint32_t src_data_offset = data_written_bytes / sizeof(uint32_t); if (copy_mode == 0) { memcpy(host_mem_ptr, start_ptr + src_data_offset, write_size_bytes); } else if (copy_mode == 1) { - uint32_t *host_mem_ptr4B = (uint32_t *)host_mem_ptr; uint32_t write_size_words = write_size_bytes / sizeof(uint32_t); @@ -228,8 +307,29 @@ int main(int argc, char** argv) { } else if (copy_mode == 2) { TT_ASSERT(host_write_ptr % 16 == 0 and data_written_bytes % 16 == 0); - nt_memcpy(host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + nt_memcpy_128b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); } else if (copy_mode == 3) { + TT_ASSERT(host_write_ptr % 16 == 0 and data_written_bytes % 16 == 0); + nt_memcpy_128b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + } else if (copy_mode == 4) { + TT_ASSERT(host_write_ptr % 16 == 0); + nt_memcpy_128b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + } else if (copy_mode == 5) { + TT_ASSERT(host_write_ptr % 32 == 0 and data_written_bytes % 32 == 0); + nt_memcpy_256b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + } else if (copy_mode == 6) { + TT_ASSERT(host_write_ptr % 32 == 0 and data_written_bytes % 32 == 0); + nt_memcpy_256b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + } else if (copy_mode == 7) { + TT_ASSERT(host_write_ptr % 32 == 0); + nt_memcpy_256b( + host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); + } else if (copy_mode == 8) { TT_ASSERT(host_write_ptr % 16 == 0); memcpy_to_device(host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes); } @@ -239,7 +339,8 @@ int main(int argc, char** argv) { if (simulate_write_ptr_update) { uint32_t num_write_ptr_updates = write_size_bytes / (32 * 1024); for (int i = 0; i < num_write_ptr_updates; i++) { - tt::Cluster::instance().write_reg(&val_to_write, tt_cxy_pair(device->id(), physical_core), reg_addr); + tt::Cluster::instance().write_reg( + &val_to_write, tt_cxy_pair(device->id(), physical_core), reg_addr); reg_addr += sizeof(uint32_t); num_reg_writes = (reg_addr - dispatch_constants::PREFETCH_Q_BASE) / sizeof(uint32_t); if (num_reg_writes == num_reg_entries) { @@ -250,7 +351,8 @@ int main(int argc, char** argv) { if (write_ptr_readback_interval > 0 and num_reg_writes == write_ptr_readback_interval) { std::vector read_hex_vec(1, 0); - tt::Cluster::instance().read_core(read_hex_vec.data(), sizeof(uint32_t), tt_cxy_pair(device->id(), physical_core), reg_addr); + tt::Cluster::instance().read_core( + read_hex_vec.data(), sizeof(uint32_t), tt_cxy_pair(device->id(), physical_core), reg_addr); } host_write_ptr += write_size_bytes; @@ -264,15 +366,11 @@ int main(int argc, char** argv) { auto elapsed_us = duration_cast(t_end - t_begin).count(); h2d_bandwidth.push_back((total_transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0)); - log_info( - LogTest, - "H2D BW: {:.3f}ms, {:.3f}GB/s", - elapsed_us / 1000.0, - h2d_bandwidth[i]); + log_info(LogTest, "H2D BW: {:.3f}ms, {:.3f}GB/s", elapsed_us / 1000.0, h2d_bandwidth[i]); } pass &= tt_metal::CloseDevice(device); - } catch (const std::exception& e) { + } catch (const std::exception &e) { pass = false; log_error(LogTest, "{}", e.what()); log_error(LogTest, "System error message: {}", std::strerror(errno));