diff --git a/nemo/collections/nlp/modules/common/megatron/mup/convert.py b/nemo/collections/nlp/modules/common/megatron/mup/convert.py index aa37f2e2f4ba..d0c7031633b2 100644 --- a/nemo/collections/nlp/modules/common/megatron/mup/convert.py +++ b/nemo/collections/nlp/modules/common/megatron/mup/convert.py @@ -137,7 +137,7 @@ def maybe_mup_init(module): attn_norm_head_divisors = collections.defaultdict(lambda: attn_norm_head_divisor) else: # Here we don't use a `defaultdict` so that we get errors for missing values. - attn_norm_head_divisors = base_head_widths + attn_norm_head_divisors = {name: math.sqrt(head_width) for (name, head_width) in base_head_widths.items()} for name, layer in self.named_modules(): if (