diff --git a/nemo/collections/nlp/modules/common/megatron/mup/convert.py b/nemo/collections/nlp/modules/common/megatron/mup/convert.py
index aa37f2e2f4ba..d0c7031633b2 100644
--- a/nemo/collections/nlp/modules/common/megatron/mup/convert.py
+++ b/nemo/collections/nlp/modules/common/megatron/mup/convert.py
@@ -137,7 +137,7 @@ def maybe_mup_init(module):
             attn_norm_head_divisors = collections.defaultdict(lambda: attn_norm_head_divisor)
         else:
             # Here we don't use a `defaultdict` so that we get errors for missing values.
-            attn_norm_head_divisors = base_head_widths
+            attn_norm_head_divisors = {name: math.sqrt(head_width) for (name, head_width) in base_head_widths.items()}
 
         for name, layer in self.named_modules():
             if (