Param init registry (#1096)
dakinggg authored Apr 13, 2024
1 parent cb0de4f commit 676ad7f
Showing 7 changed files with 196 additions and 64 deletions.
22 changes: 22 additions & 0 deletions llmfoundry/layers_registry.py
@@ -73,8 +73,30 @@
     entry_points=True,
     description=_attention_implementations_description)

+_param_init_fns_description = (
+    'The param_init_fns registry is used to register functions that initialize parameters.'
+    +
+    'These will be called on a module to initialize its parameters. See param_init_fns.py for examples.'
+)
+param_init_fns = create_registry('llmfoundry',
+                                 'param_init_fns',
+                                 generic_type=Callable[..., None],
+                                 entry_points=True,
+                                 description=_param_init_fns_description)
+
+_module_init_fns_description = """The module_init_fns registry is used to register functions that initialize specific modules.
+These functions should return True if they initialize the module, and False otherwise. This allows them to be called without knowing their contents.
+They should take in the module, init_div_is_residual, and div_is_residual arguments."""
+module_init_fns = create_registry('llmfoundry',
+                                  'module_init_fns',
+                                  generic_type=Callable[..., bool],
+                                  entry_points=True,
+                                  description=_module_init_fns_description)
+
 __all__ = [
     'norms',
+    'param_init_fns',
+    'module_init_fns',
     'ffns',
     'ffns_with_norm',
     'ffns_with_megablocks',
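
For orientation, below is a minimal sketch of how a downstream project could hook into the two new registries. The function names and bodies are made up for illustration, and the `register(name, func=...)` call assumes the same catalogue-style API used by llmfoundry's other registries (e.g. `norms`); only the signature conventions come from the descriptions above: `param_init_fns` entries return `None`, while `module_init_fns` entries take the module, `init_div_is_residual`, and `div_is_residual`, and return `True` only when they handled the module.

```python
# Hypothetical example, not part of this commit: registering custom initializers.
import torch.nn as nn

from llmfoundry.layers_registry import module_init_fns, param_init_fns


def zero_bias_param_init_(module: nn.Module, **kwargs) -> None:
    """A param_init_fns entry: walks the module tree and initializes parameters in place."""
    for child in module.modules():
        if isinstance(child, nn.Linear) and child.bias is not None:
            nn.init.zeros_(child.bias)


def linear_module_init(
    module: nn.Module,
    init_div_is_residual,
    div_is_residual,
    **kwargs,
) -> bool:
    """A module_init_fns entry: returns True only if it initialized this module."""
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        return True
    return False


# Assumed registration API (mirrors other llmfoundry registries).
param_init_fns.register('zero_bias_param_init_', func=zero_bias_param_init_)
module_init_fns.register('linear_module_init', func=linear_module_init)
```

Because every `module_init_fns` entry reports whether it handled the module, a generic initializer can try each registered function in turn without hard-coding the set of supported layer types, which is what the registry description means by calling them "without knowing their contents".
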
7 changes: 3 additions & 4 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -43,7 +43,7 @@
 from transformers.models.llama.modeling_llama import \
     LlamaRotaryEmbedding as HFRotaryEmbedding

-from llmfoundry.layers_registry import norms
+from llmfoundry.layers_registry import norms, param_init_fns
 from llmfoundry.models.layers.attention import (attn_bias_shape,
                                                  build_attn_bias, gen_slopes)
 from llmfoundry.models.layers.blocks import MPTBlock
@@ -62,7 +62,6 @@
     init_empty_weights  # type: ignore (see note)
 from llmfoundry.models.utils.param_init_fns import (
     generic_param_init_fn_,  # type: ignore (see note)
-    MODEL_INIT_REGISTRY,
 )
 from llmfoundry.models.layers.ffn import resolve_ffn_act_fn  # type: ignore (see note)

@@ -678,7 +677,7 @@ def forward(
     # Param Initialization, needed for device='meta' fast initialization
     def param_init_fn(self, module: nn.Module) -> None:
         init_fn_name = self.config.init_config['name']
-        MODEL_INIT_REGISTRY[init_fn_name](
+        param_init_fns.get(init_fn_name)(
             module=module,
             n_layers=self.config.n_layers,
             d_model=self.config.d_model,
@@ -838,7 +837,7 @@ def forward(
     # Param Initialization, needed for device='meta' fast initialization
     def param_init_fn(self, module: nn.Module) -> None:
         init_fn_name = self.config.init_config['name']
-        MODEL_INIT_REGISTRY[init_fn_name](
+        param_init_fns.get(init_fn_name)(
             module=module,
             n_layers=self.config.n_layers,
             d_model=self.config.d_model,
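
The change in both `param_init_fn` methods is the same: the initializer is now resolved through the `param_init_fns` registry instead of indexing the removed `MODEL_INIT_REGISTRY` dict. Below is a rough, self-contained sketch of that lookup-and-call pattern; the config and the `toy_init_` entry are made up for illustration, and only `module`, `n_layers`, and `d_model` appear as keyword arguments in the hunks above.

```python
# Illustrative sketch of the registry-based dispatch; 'toy_init_' and the
# config dict are hypothetical, not something shipped by this commit.
import torch.nn as nn

from llmfoundry.layers_registry import param_init_fns


def toy_init_(module: nn.Module, n_layers: int, d_model: int, **kwargs) -> None:
    """Toy initializer: renormalize Linear weights based on n_layers and d_model."""
    for child in module.modules():
        if isinstance(child, nn.Linear):
            nn.init.normal_(child.weight, std=d_model**-0.5 / n_layers**0.5)


param_init_fns.register('toy_init_', func=toy_init_)

# What MPTModel.param_init_fn now does, roughly: read the name from
# init_config, resolve it through the registry, then call the result.
init_config = {'name': 'toy_init_'}
init_fn = param_init_fns.get(init_config['name'])
init_fn(module=nn.Linear(8, 8), n_layers=2, d_model=8)
```

Compared with the old dict lookup, going through `param_init_fns.get` lets new initialization schemes be plugged in from outside the package, including via entry points, since the registry is created with `entry_points=True`.
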
4 changes: 1 addition & 3 deletions llmfoundry/models/utils/__init__.py
@@ -6,14 +6,12 @@
                                                         init_on_device)
 from llmfoundry.models.utils.mpt_param_count import (mpt_get_active_params,
                                                       mpt_get_total_params)
-from llmfoundry.models.utils.param_init_fns import (MODEL_INIT_REGISTRY,
-                                                     generic_param_init_fn_)
+from llmfoundry.models.utils.param_init_fns import generic_param_init_fn_

 __all__ = [
     'init_empty_weights',
     'init_on_device',
     'generic_param_init_fn_',
-    'MODEL_INIT_REGISTRY',
     'config_moe_args',
     'mpt_get_active_params',
     'mpt_get_total_params',
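
With `MODEL_INIT_REGISTRY` gone from `llmfoundry.models.utils`, external code that indexed it directly needs a small migration. A hedged before/after sketch follows; it assumes the old registry keys, such as 'default_', were re-registered under the same names, which this page does not show.

```python
# Before this commit (import no longer exists):
# from llmfoundry.models.utils import MODEL_INIT_REGISTRY
# init_fn = MODEL_INIT_REGISTRY['default_']

# After this commit: look the function up through the registry instead.
from llmfoundry.layers_registry import param_init_fns

init_fn = param_init_fns.get('default_')  # 'default_' assumed to keep its old name
```
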
