From 7e16367ce783e7f2e6cdf05d138836ac28c34788 Mon Sep 17 00:00:00 2001
From: Shehtab Zaman
Date: Thu, 16 Mar 2023 23:49:24 -0400
Subject: [PATCH] Added ETI

---
 .../distconv/distconv_layer_norm.hpp          |   2 +-
 .../distconv/distconv_layer_norm.cu           | 111 ++++++++++++++++--
 2 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/include/lbann/layers/regularizers/distconv/distconv_layer_norm.hpp b/include/lbann/layers/regularizers/distconv/distconv_layer_norm.hpp
index a24b4d56bff..e2f7fda48b3 100644
--- a/include/lbann/layers/regularizers/distconv/distconv_layer_norm.hpp
+++ b/include/lbann/layers/regularizers/distconv/distconv_layer_norm.hpp
@@ -31,7 +31,7 @@ namespace distconv {
 
 template <typename Backend, typename DataType>
-class LayerNorm
+class LayerNormalization
 {
   using LocaleMPI = tensor::LocaleMPI;
 
diff --git a/src/layers/regularizers/distconv/distconv_layer_norm.cu b/src/layers/regularizers/distconv/distconv_layer_norm.cu
index b460194153f..9e2cf0ee6cc 100644
--- a/src/layers/regularizers/distconv/distconv_layer_norm.cu
+++ b/src/layers/regularizers/distconv/distconv_layer_norm.cu
@@ -34,12 +34,70 @@ template <typename Backend, typename DataType>
 template <typename Allocator>
 void LayerNormalization<Backend, DataType>::calculate_forward_stats(
-  const DCTensor<Allocator>& input)
-{}
+  const DCTensor<Allocator>& input,
+  DCTensor<Allocator>& statistics)
+{
+  // BUGFIX: was `input_0.get_local_size()` -- no `input_0` is declared;
+  // the parameter is `input`.
+  if (input.get_local_size() == 0) {
+    util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
+    return; // no-op for empty local tensors
+  }
+
+  const auto& input_dims = input.get_local_shape();
+  const auto& statistics_dims = statistics.get_local_shape();
+
+  // BUGFIX: was `input_0_dims[3]` (undeclared); sample axis is index 3.
+  const auto local_num_samples = input_dims[3];
+  // NOTE(review): currently unused -- confirm it is needed before merging.
+  const auto global_num_samples = statistics_dims[3];
+
+  // Flatten all non-sample dimensions into one column per sample.
+  const auto local_sample_size = std::accumulate(input_dims.begin(),
+                                                 input_dims.end() - 1,
+                                                 1,
+                                                 std::multiplies<int>());
+
+  // Attach non-owning GPU matrix views over the tensor buffers.
+  using LocalMat = El::Matrix<DataType, El::Device::GPU>;
+  LocalMat local_input(local_sample_size,
+                       local_num_samples,
+                       input.get_buffer(),
+                       local_sample_size);
+
+  // BUGFIX: was `statistics.get_local_shape()` -- the attach constructor
+  // takes a data pointer, not a shape; mirror the `local_input` setup.
+  LocalMat local_statistics(2,
+                            local_num_samples,
+                            statistics.get_buffer(),
+                            2);
+
+  El::Zero(local_statistics);
+
+  // Row 0 accumulates sums (means), row 1 sums of squares (vars).
+  auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
+  auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);
+
+  {
+    using namespace hydrogen;
+    // Order kernel after any pending work on both matrices' streams.
+    auto multisync = El::MakeMultiSync(gpu::get_sync_info(local_statistics),
+                                       gpu::get_sync_info(local_input));
+    constexpr size_t block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_sample_size + block_size - 1) / block_size;
+    grid_dims.y = local_num_samples;
+    // NOTE(review): template args below were garbled in extraction and
+    // restored on a best guess -- confirm against the kernel declaration.
+    hydrogen::gpu::LaunchKernel(
+      ::lbann::layer_norm_fp_sums_kernel<block_size, DataType>,
+      grid_dims,
+      block_dims,
+      0,
+      multisync,
+      local_num_samples,
+      local_sample_size,
+      local_input.LockedBuffer(),
+      local_input.LDim(),
+      local_means.Buffer(),
+      local_means.LDim(),
+      local_vars.Buffer(),
+      local_vars.LDim());
+  }
+}
 
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization<Backend, DataType> ::apply_normalization(
+void LayerNormalization<Backend, DataType>::apply_normalization(
   const DCTensor<Allocator>& input,
   const DCTensor<Allocator>& statistics,
   DCTensor<Allocator>& output)
@@ -47,7 +105,7 @@ void LayerNormalization<Backend, DataType>::apply_normalization(
 {}
 
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization<Backend, DataType> ::calculate_backward_stats(
+void LayerNormalization<Backend, DataType>::calculate_backward_stats(
   const DCTensor<Allocator>& input,
   const DCTensor<Allocator>& output_grad,
   const DCTensor<Allocator>& statistics,
@@ -55,11 +113,46 @@ void LayerNormalization<Backend, DataType>::calculate_backward_stats(
   DCTensor<Allocator>& statistics_grad)
 {}
 
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization<Backend, DataType> ::apply_grad(const DCTensor<Allocator>& input,
-                                                        const DCTensor<Allocator>& output_grad,
-                                                        const DCTensor<Allocator>& statistics,
-                                                        const DCTensor<Allocator>& statistics_grad,
-                                                        DCTensor<Allocator>& input_grad)
+void LayerNormalization<Backend, DataType>::apply_grad(const DCTensor<Allocator>& input,
+                                                       const DCTensor<Allocator>& output_grad,
+                                                       const DCTensor<Allocator>& statistics,
+                                                       const DCTensor<Allocator>& statistics_grad,
+                                                       DCTensor<Allocator>& input_grad)
 {}
 
+// Explicit template instantiation (ETI) for every supported
+// (data type, backend) pair; T substitutes DataType above.
+#define ETI(T, Backend)                                                       \
+  template class LayerNormalization<Backend, T>;                              \
+  template void LayerNormalization<Backend, T>::calculate_forward_stats<     \
+    tensor::CUDAAllocator>(                                                   \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input,\
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&             \
+      statistics);                                                            \
+  template void                                                               \
+  LayerNormalization<Backend, T>::apply_normalization<                       \
+    tensor::CUDAAllocator>(                                                   \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input,\
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      statistics,                                                             \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& output);    \
+  template void LayerNormalization<Backend, T>::calculate_backward_stats<    \
+    tensor::CUDAAllocator>(                                                   \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input,\
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      output_grad,                                                            \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      statistics,                                                             \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&             \
+      statistics_grad);                                                       \
+  template void                                                               \
+  LayerNormalization<Backend, T>::apply_grad<tensor::CUDAAllocator>(         \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input,\
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      output_grad,                                                            \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      statistics,                                                             \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
+      statistics_grad,                                                        \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&             \
+      input_grad);
+
+ETI(float, BackendDNNLib)
+ETI(double, BackendDNNLib)
+// BUGFIX: was `#endef ETI` -- `#endef` is not a preprocessor directive.
+#undef ETI
 #endif // LBANN_HAS_DISTCONV
\ No newline at end of file