Skip to content

Commit

Permalink
[release/2.4] [ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead…
Browse files Browse the repository at this point in the history
… of 1.f/x (#1688)

Replace (more) exact calculation with hardware approximation.

Benefits:
Reduced code size.
Improved performance for certain scenarios.

Experiments show only a small reduction in precision.
Experiments show no significant performance regressions.
bfloat16- as well as float16-related calculations may benefit significantly
from this change.

vectorized_layer_norm_kernel:
Gains performance esp. for the following tensor shapes.
Lower values of dim1 do not change performance significantly.
For dim1 = 8k-65k, performance may improve considerably, but the gains
decline gradually with size.

```
dim0    dim1
----    ----
1024	8192
1024	16384
1024	32768
1024	65536
1024	131072
1024	262144
1024	524288
```

Co-authored-by: Hashem Hashemi <[email protected]>
  • Loading branch information
mhalk and amd-hhashemi authored Dec 9, 2024
1 parent 579c159 commit f0a620f
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 0 deletions.
8 changes: 8 additions & 0 deletions aten/src/ATen/native/cuda/layer_norm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,11 @@ WelfordDataLN cuWelfordOnlineSum(
{
U delta = val - curr_sum.mean;
U new_count = curr_sum.count + 1.f;
#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
#else
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
#endif
return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
}

Expand All @@ -140,7 +144,11 @@ WelfordDataLN cuWelfordCombine(
U count = dataA.count + dataB.count;
U mean, sigma2;
if (count > decltype(dataB.count){0}) {
#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
auto coef = __builtin_amdgcn_rcpf(count);
#else
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
#endif
auto nA = dataA.count * coef;
auto nB = dataB.count * coef;
mean = nA*dataA.mean + nB*dataB.mean;
Expand Down
16 changes: 16 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,22 @@ if(USE_ROCM)
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
endif(CMAKE_BUILD_TYPE MATCHES Debug)

# Get EnVar 'PYTORCH_LAYERNORM_FAST_RECIPROCAL' (or default to on).
if(DEFINED ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
else()
set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE ON)
endif()

set(PYTORCH_LAYERNORM_FAST_RECIPROCAL
${PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE}
CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE
)

if(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL)
endif()

# needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)

Expand Down
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@
# USE_ROCM_KERNEL_ASSERT=1
# Enable kernel assert in ROCm platform
#
# PYTORCH_LAYERNORM_FAST_RECIPROCAL
# If set, enables the use of builtin functions for fast reciprocals (1/x) w.r.t.
# layer normalization. Default: enabled.
#
# Environment variables we respect (these environment variables are
# conventional and are often understood/set by other software.)
#
Expand Down

0 comments on commit f0a620f

Please sign in to comment.