diff --git a/transpose-cute/smem_helper.hpp b/transpose-cute/smem_helper.hpp
index ab93648..0859cd5 100644
--- a/transpose-cute/smem_helper.hpp
+++ b/transpose-cute/smem_helper.hpp
@@ -4,12 +4,12 @@
 
 namespace cfx {
 
-  using namespace cute;
+using namespace cute;
 
 // Helper functions for retrieving optimal swizzled layouts
 template <typename PrecType, int DIM> constexpr auto getSmemLayoutK() {
-    constexpr int headSizeBytes = sizeof(PrecType) * DIM;
+  constexpr int headSizeBytes = sizeof(PrecType) * DIM;
 
   if constexpr (headSizeBytes == 16) {
     return GMMA::Layout_K_INTER_Atom<PrecType>{};
   } else if constexpr (headSizeBytes == 32) {
@@ -24,7 +24,7 @@
 }
 
 template <typename PrecType, int DIM> constexpr auto getSmemLayoutMN() {
-    constexpr int headSizeBytes = sizeof(PrecType) * DIM;
+  constexpr int headSizeBytes = sizeof(PrecType) * DIM;
 
   if constexpr (headSizeBytes == 16) {
     return GMMA::Layout_MN_INTER_Atom<PrecType>{};
@@ -37,19 +37,18 @@ template <typename PrecType, int DIM> constexpr auto getSmemLayoutMN() {
   }
 }
 
-void set_smem_size(int smem_size, void const* kernel)
-{
-  // account for dynamic smem capacity if needed
-  if (smem_size >= (48 << 10)) {
-    cudaError_t result = cudaFuncSetAttribute(
-      kernel,
-      cudaFuncAttributeMaxDynamicSharedMemorySize,
-      smem_size);
-    if (cudaSuccess != result) {
-      result = cudaGetLastError(); // to clear the error bit
-      std::cout << " Shared Memory Allocation Failed " << std:: endl << " cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result) << std::endl;
-    }
+void set_smem_size(int smem_size, void const *kernel) {
+  // account for dynamic smem capacity if needed
+  if (smem_size >= (48 << 10)) {
+    cudaError_t result = cudaFuncSetAttribute(
+        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      std::cout << " Shared Memory Allocation Failed " << std::endl
+                << " cudaFuncSetAttribute() returned error: "
+                << cudaGetErrorString(result) << std::endl;
     }
+  }
 }
 
-}
\ No newline at end of file
+} // namespace cfx
\ No newline at end of file
diff --git a/transpose-cute/transpose_tmastore_vectorized.h b/transpose-cute/transpose_tmastore_vectorized.h
index 48d6fb7..2368b77 100644
--- a/transpose-cute/transpose_tmastore_vectorized.h
+++ b/transpose-cute/transpose_tmastore_vectorized.h
@@ -7,12 +7,12 @@
 #include <cassert>
 #include <cstdio>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 #include <chrono>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
 
 #include <cute/tensor.hpp>
 #include <iostream>
-#include <cstdlib>
 
 #include "cutlass/util/GPU_Clock.hpp"
 #include "cutlass/util/command_line.h"
@@ -26,11 +26,16 @@
 
 using namespace cute;
 
-template <typename TensorS, typename SmemLayout, typename TiledCopyS, typename TiledCopyD, typename GmemLayoutD, typename TileShapeD, typename ThreadLayoutM, typename SmemLayoutM>
+template <typename TensorS, typename SmemLayout, typename TiledCopyS,
+          typename TiledCopyD, typename GmemLayoutD, typename TileShapeD,
+          typename ThreadLayoutM, typename SmemLayoutM>
 __global__ static void __launch_bounds__(256)
-transposeKernelTMA(TensorS const S, SmemLayout const smemLayout, TiledCopyS const tiled_copy_S,
-    CUTE_GRID_CONSTANT TiledCopyD const tmaStoreD, GmemLayoutD const gmemLayoutD, TileShapeD const tileShapeD, ThreadLayoutM const tM, SmemLayoutM const smemLayoutM) {
+    transposeKernelTMA(TensorS const S, SmemLayout const smemLayout,
+                       TiledCopyS const tiled_copy_S,
+                       CUTE_GRID_CONSTANT TiledCopyD const tmaStoreD,
+                       GmemLayoutD const gmemLayoutD,
+                       TileShapeD const tileShapeD, ThreadLayoutM const tM,
+                       SmemLayoutM const smemLayoutM) {
 
   using namespace cute;
   using Element = typename TensorS::value_type;
 
@@ -41,32 +46,34 @@ transposeKernelTMA(TensorS const S, SmemLayout const smemLayout, TiledCopyS cons
   // Use Shared Storage structure to allocate aligned SMEM addresses.
   extern __shared__ char shared_memory[];
   using SharedStorage = SharedStorageTranspose<Element, SmemLayout>;
-  SharedStorage &shared_storage = *reinterpret_cast<SharedStorage *>(shared_memory);
-  Tensor sM = make_tensor(make_smem_ptr(shared_storage.smem.data()), smemLayoutM);
+  SharedStorage &shared_storage =
+      *reinterpret_cast<SharedStorage *>(shared_memory);
+  Tensor sM =
+      make_tensor(make_smem_ptr(shared_storage.smem.data()), smemLayoutM);
 
-  Tensor gS = S(make_coord(_, _), blockIdx.x, blockIdx.y);  // (bM, bN)
-  auto thr_copy_S = tiled_copy_S.get_thread_slice(threadIdx.x);
+  Tensor gS = S(make_coord(_, _), blockIdx.x, blockIdx.y); // (bM, bN)
+  auto thr_copy_S = tiled_copy_S.get_thread_slice(threadIdx.x);
 
-  Tensor tSgS = thr_copy_S.partition_S(gS);  // (CopyOp, CopyM, CopyN)
-  Tensor tSrS = make_fragment_like(tSgS);    // (CopyOp, CopyM, CopyN)
-  Tensor tMsM = local_partition(sM, tM, threadIdx.x);
+  Tensor tSgS = thr_copy_S.partition_S(gS); // (CopyOp, CopyM, CopyN)
+  Tensor tSrS = make_fragment_like(tSgS);   // (CopyOp, CopyM, CopyN)
+  Tensor tMsM = local_partition(sM, tM, threadIdx.x);
 
   // Copy from GMEM to RMEM to SMEM
   copy(tiled_copy_S, tSgS, tSrS);
   copy(tSrS, tMsM);
 
-  auto synchronize = [&]() { cutlass::arch::NamedBarrier::sync(size(ThreadLayoutM{}), 0); };
-cutlass::arch::fence_view_async_shared();
+  auto synchronize = [&]() {
+    cutlass::arch::NamedBarrier::sync(size(ThreadLayoutM{}), 0);
+  };
+  cutlass::arch::fence_view_async_shared();
   synchronize();
-  
+
   // Issue the TMA store.
-  Tensor mD = tmaStoreD.get_tma_tensor(shape(gmemLayoutD)); 
+  Tensor mD = tmaStoreD.get_tma_tensor(shape(gmemLayoutD));
 
   auto blkCoordD = make_coord(blockIdx.y, blockIdx.x);
   Tensor gD = local_tile(mD, tileShapeD, blkCoordD);
-  Tensor sD = make_tensor(make_smem_ptr(shared_storage.smem.data()), smemLayout); // (bN, bM)
+  Tensor sD = make_tensor(make_smem_ptr(shared_storage.smem.data()),
+                          smemLayout); // (bN, bM)
 
   auto cta_tmaD = tmaStoreD.get_slice(0);
 
@@ -82,134 +89,135 @@ cutlass::arch::fence_view_async_shared();
     copy(tmaStoreD, tDsD, tDgD);
   }
   // Wait for TMA store to complete.
-  tma_store_wait<0>(); 
-  
+  tma_store_wait<0>();
 }
 
 int transpose_host_kernel_tma(int M, int N) {
-    printf("Vectorized load into registers, write out via TMA Store\n");
-    printf("Profiler reports uncoalesced smem accesses\n");
+  printf("Vectorized load into registers, write out via TMA Store\n");
+  printf("Profiler reports uncoalesced smem accesses\n");
 
-    using Element = float;
-    using namespace cute;
+  using Element = float;
+  using namespace cute;
 
-    auto tensor_shape = make_shape(M, N);
-    auto tensor_shape_trans = make_shape(N, M);
+  auto tensor_shape = make_shape(M, N);
+  auto tensor_shape_trans = make_shape(N, M);
 
-    //Allocate and initialize
-    thrust::host_vector<Element> h_S(size(tensor_shape));       // (M, N)
-    thrust::host_vector<Element> h_D(size(tensor_shape_trans)); // (N, M)
+  // Allocate and initialize
+  thrust::host_vector<Element> h_S(size(tensor_shape));       // (M, N)
+  thrust::host_vector<Element> h_D(size(tensor_shape_trans)); // (N, M)
 
-    for (size_t i = 0; i < h_S.size(); ++i) {
-      h_S[i] = static_cast<Element>(i);
-      h_D[i] = Element{};
-    }
+  for (size_t i = 0; i < h_S.size(); ++i) {
+    h_S[i] = static_cast<Element>(i);
+    h_D[i] = Element{};
+  }
+
+  thrust::device_vector<Element> d_S = h_S;
+  thrust::device_vector<Element> d_D = h_D;
+
+  //
+  // Make tensors
+  //
+
+  // Could also have ColMajor.
+  auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
+  auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{});
+
+  Tensor tensor_S = make_tensor(
+      make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS);
+  Tensor tensor_D = make_tensor(
+      make_gmem_ptr(thrust::raw_pointer_cast(d_D.data())), gmemLayoutD);
 
-    thrust::device_vector<Element> d_S = h_S;
-    thrust::device_vector<Element> d_D = h_D;
-
-    //
-    // Make tensors
-    //
-
-    // Could also have ColMajor.
-    auto gmemLayoutS = make_layout(tensor_shape, GenRowMajor{});
-    auto gmemLayoutD = make_layout(tensor_shape_trans, GenRowMajor{});
-
-    Tensor tensor_S = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), gmemLayoutS);
-    Tensor tensor_D = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_D.data())), gmemLayoutD);
-
-    //
-    // Tile tensors
-    //
-
-    using bM = Int<32>;
-    using bN = Int<32>;
-
-    auto block_shape = make_shape(bM{}, bN{});       // (bM, bN)
-    auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
-
-    Tensor tiled_tensor_S = tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
-    Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m')
-
-    auto threadLayoutS = make_layout(make_shape(Int<32>{}, Int<8>{}), GenRowMajor{});
-    auto vecLayoutS = make_layout(make_shape(Int<1>{}, Int<4>{}));
-    using AccessTypeS = cutlass::AlignedArray<Element, 4>;
-    using AtomS = Copy_Atom<UniversalCopy<AccessTypeS>, Element>;
-    auto tiled_copy_S = make_tiled_copy(AtomS{}, threadLayoutS, vecLayoutS);
-
-    auto tileShapeD = block_shape_trans;
-    auto smemLayoutD =
+  //
+  // Tile tensors
+  //
+
+  using bM = Int<32>;
+  using bN = Int<32>;
+
+  auto block_shape = make_shape(bM{}, bN{});       // (bM, bN)
+  auto block_shape_trans = make_shape(bN{}, bM{}); // (bN, bM)
+
+  Tensor tiled_tensor_S =
+      tiled_divide(tensor_S, block_shape); // ((bM, bN), m', n')
+  Tensor tiled_tensor_D =
+      tiled_divide(tensor_D, block_shape_trans); // ((bN, bM), n', m')
+
+  auto threadLayoutS =
+      make_layout(make_shape(Int<32>{}, Int<8>{}), GenRowMajor{});
+  auto vecLayoutS = make_layout(make_shape(Int<1>{}, Int<4>{}));
+  using AccessTypeS = cutlass::AlignedArray<Element, 4>;
+  using AtomS = Copy_Atom<UniversalCopy<AccessTypeS>, Element>;
+  auto tiled_copy_S = make_tiled_copy(AtomS{}, threadLayoutS, vecLayoutS);
+
+  auto tileShapeD = block_shape_trans;
+  auto smemLayoutD =
       tile_to_shape(cfx::getSmemLayoutK<Element, 32>(),
-      make_shape(shape<0>(tileShapeD), shape<1>(tileShapeD)));
-    //TMA only supports certain swizzles
-    //https://github.com/NVIDIA/cutlass/blob/main/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
-    auto tmaD =
-      make_tma_copy(SM90_TMA_STORE{}, tensor_D, smemLayoutD, tileShapeD, Int<1>{});
-
-    auto tileShapeM = make_shape(Int<4>{}, Int<8>{}, Int<32>{});
-    auto smemLayoutM = composition(smemLayoutD, make_layout(tileShapeM));
-    auto threadLayoutM = make_layout(make_shape(Int<1>{},Int<8>{}, Int<32>{}), make_stride(Int<1>{}, Int<1>{}, Int<8>{}));
-
-    size_t smem_size = int(sizeof(SharedStorageTranspose<Element, decltype(smemLayoutD)>));
-
-    //
-    // Determine grid and block dimensions
-    //
-
-    dim3 gridDim(size<1>(tiled_tensor_S), size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
-    dim3 blockDim(size(threadLayoutS));
-
-    transposeKernelTMA<<<gridDim, blockDim, smem_size>>>(tiled_tensor_S, smemLayoutD, tiled_copy_S, tmaD,
-        gmemLayoutD, tileShapeD, threadLayoutM, smemLayoutM);
-
-    int iterations = 10;
-
-    for (int i = 0; i < iterations; i++) {
-      auto t1 = std::chrono::high_resolution_clock::now();
-      transposeKernelTMA<<<gridDim, blockDim, smem_size>>>(tiled_tensor_S, smemLayoutD, tiled_copy_S, tmaD,
-          gmemLayoutD, tileShapeD, threadLayoutM, smemLayoutM);
-      cudaError result =
-          cudaDeviceSynchronize();
-      auto t2 = std::chrono::high_resolution_clock::now();
-      if (result != cudaSuccess) {
-        std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result)
-                  << std::endl;
-        return -1;
-      }
-      std::chrono::duration<double, std::milli> tDiff = t2 - t1;
-      double time_ms = tDiff.count();
-      std::cout << "Trial " << i << " Completed in " << time_ms << "ms ("
-                << 2e-6 * M * N * sizeof(Element) / time_ms << " GB/s)"
-                << std::endl;
-    }
-
+                    make_shape(shape<0>(tileShapeD), shape<1>(tileShapeD)));
+  // TMA only supports certain swizzles
+  // https://github.com/NVIDIA/cutlass/blob/main/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
+  auto tmaD = make_tma_copy(SM90_TMA_STORE{}, tensor_D, smemLayoutD, tileShapeD,
+                            Int<1>{});
+
+  auto tileShapeM = make_shape(Int<4>{}, Int<8>{}, Int<32>{});
+  auto smemLayoutM = composition(smemLayoutD, make_layout(tileShapeM));
+  auto threadLayoutM = make_layout(make_shape(Int<1>{}, Int<8>{}, Int<32>{}),
+                                   make_stride(Int<1>{}, Int<1>{}, Int<8>{}));
+
+  size_t smem_size =
+      int(sizeof(SharedStorageTranspose<Element, decltype(smemLayoutD)>));
+
+  //
+  // Determine grid and block dimensions
+  //
+
+  dim3 gridDim(
+      size<1>(tiled_tensor_S),
+      size<2>(tiled_tensor_S)); // Grid shape corresponds to modes m' and n'
+  dim3 blockDim(size(threadLayoutS));
+
+  transposeKernelTMA<<<gridDim, blockDim, smem_size>>>(
+      tiled_tensor_S, smemLayoutD, tiled_copy_S, tmaD, gmemLayoutD, tileShapeD,
+      threadLayoutM, smemLayoutM);
+
+  int iterations = 10;
+
+  for (int i = 0; i < iterations; i++) {
+    auto t1 = std::chrono::high_resolution_clock::now();
+    transposeKernelTMA<<<gridDim, blockDim, smem_size>>>(
+        tiled_tensor_S, smemLayoutD, tiled_copy_S, tmaD, gmemLayoutD,
+        tileShapeD, threadLayoutM, smemLayoutM);
     cudaError result = cudaDeviceSynchronize();
+    auto t2 = std::chrono::high_resolution_clock::now();
     if (result != cudaSuccess) {
-      std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result) << std::endl;
-      return -1;
+      std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result)
+                << std::endl;
+      return -1;
     }
+    std::chrono::duration<double, std::milli> tDiff = t2 - t1;
+    double time_ms = tDiff.count();
+    std::cout << "Trial " << i << " Completed in " << time_ms << "ms ("
+              << 2e-6 * M * N * sizeof(Element) / time_ms << " GB/s)"
+              << std::endl;
+  }
 
-    //
-    // Verify
-    //
-
-    h_D = d_D;
-
-    int good = 0, bad = 0;
+  //
+  // Verify
+  //
 
-    auto transposeFunction = make_layout(tensor_shape, GenRowMajor{});
+  h_D = d_D;
 
-    for (size_t i = 0; i < h_D.size(); ++i) {
-      if (h_D[i] == h_S[transposeFunction(i)])
-        good++;
-      else
-        bad++;
-    }
+  int good = 0, bad = 0;
 
-    std::cout << "Success " << good << ", Fail " << bad << std::endl;
-
-    return 0;
-}
+  auto transposeFunction = make_layout(tensor_shape, GenRowMajor{});
 
+  for (size_t i = 0; i < h_D.size(); ++i) {
+    if (h_D[i] == h_S[transposeFunction(i)])
+      good++;
+    else
+      bad++;
+  }
 
+  std::cout << "Success " << good << ", Fail " << bad << std::endl;
 
+  return 0;
+}
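
A note on the swizzle helpers in smem_helper.hpp: getSmemLayoutK selects a GMMA swizzle atom from the byte width of the tile's contiguous mode. A minimal compile-time sketch of that selection follows; it assumes the reconstructed template signature template <typename PrecType, int DIM>, the usual INTER/SW32/SW64/SW128 branch ladder, and a CUTLASS 3.x include path. The file name and the expected atom type are assumptions, not part of the patch.

// check_swizzle.cu -- hypothetical standalone check, not part of the patch.
#include <type_traits>

#include <cute/atom/mma_traits_sm90_gmma.hpp>
#include <cute/tensor.hpp>

#include "smem_helper.hpp"

int main() {
  // float elements with a 32-wide contiguous mode give 4 B * 32 = 128 B rows,
  // so getSmemLayoutK should fall through to the 128-byte swizzle atom.
  auto atom = cfx::getSmemLayoutK<float, 32>();
  static_assert(std::is_same_v<decltype(atom),
                               cute::GMMA::Layout_K_SW128_Atom<float>>,
                "128-byte rows should select the SW128 atom");
  cute::print(atom); // prints the swizzle function and shape of the atom
  return 0;
}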
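On cfx::set_smem_size: CUDA caps dynamic shared memory per block at 48 KB unless cudaFuncAttributeMaxDynamicSharedMemorySize is raised for the specific kernel, which is exactly what the helper wraps. A minimal usage sketch, where the kernel, block size, and the 64 KB figure are illustrative assumptions rather than values from this patch:

// smem_optin.cu -- hypothetical example of opting a kernel into >48 KB smem.
#include <cuda_runtime.h>

#include "smem_helper.hpp"

__global__ void uses_big_smem() {
  extern __shared__ char smem[];
  smem[threadIdx.x] = char(threadIdx.x); // touch smem so it is not elided
}

int main() {
  int smem_size = 64 << 10; // 64 KB request, above the 48 KB default cap
  // Without this call, launches requesting this much dynamic smem typically
  // fail with an invalid-argument error.
  cfx::set_smem_size(smem_size, (void const *)uses_big_smem);
  uses_big_smem<<<1, 256, smem_size>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}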
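The verification loop relies on a CuTe idiom worth spelling out: calling a layout with a single integer splits that integer column-major across the layout's shape, so transposeFunction = make_layout((M, N), GenRowMajor{}) maps i to the coordinate (i % M, i / M) and then, through the row-major strides (N, 1), to offset (i % M) * N + i / M. Since flat index i of the row-major (N, M) output decodes as (n, m) = (i / M, i % M), that is exactly the source offset whose value must land at output offset i. A tiny host-only sketch with illustrative M = 2, N = 3:

// transpose_index_check.cpp -- hypothetical check of the verification idiom.
#include <cstdio>

#include <cute/tensor.hpp>

int main() {
  using namespace cute;
  int M = 2, N = 3;
  auto transposeFunction = make_layout(make_shape(M, N), GenRowMajor{});
  // Source (2x3, row-major): 0 1 2 / 3 4 5. Transpose (3x2, row-major):
  // 0 3 / 1 4 / 2 5. The expected pairing of flat indices is
  // 0->0, 1->3, 2->1, 3->4, 4->2, 5->5, matching h_D[i] == h_S[...] above.
  for (int i = 0; i < M * N; ++i)
    printf("h_D[%d] should equal h_S[%d]\n", i, int(transposeFunction(i)));
  return 0;
}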
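Finally, the 2e-6 factor in the timing loop is unit folding: the transpose reads M * N elements and writes M * N elements, so bytes moved = 2 * M * N * sizeof(Element), and GB/s = bytes / 1e9 / (time_ms / 1e3) = 2e-6 * M * N * sizeof(Element) / time_ms. A worked arithmetic check with assumed sizes (M = N = 4096 floats, 1 ms per trial; these numbers are illustrative only):

// bandwidth_check.cpp -- sanity check of the GB/s expression in the loop.
#include <cstdio>

int main() {
  double M = 4096, N = 4096, elem_bytes = 4; // assumed float tensor sizes
  double time_ms = 1.0;                      // assumed 1 ms per trial
  double gb_moved = 2.0 * M * N * elem_bytes / 1e9; // read S plus write D
  double gbps = 2e-6 * M * N * elem_bytes / time_ms;
  printf("%.4f GB moved, %.1f GB/s\n", gb_moved, gbps); // 0.1342 GB, 134.2 GB/s
  return 0;
}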