From 62f79302bed7836c9429ba2e324a217659163b2f Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Sat, 30 Mar 2024 20:36:18 +0000 Subject: [PATCH] Add comments --- llmfoundry/callbacks/hf_checkpointer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 91e5ee6ff9..7104d4d0f2 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -78,11 +78,18 @@ def _register_model_with_run_id_multiprocess(mlflow_logger: MLFlowLogger, logging_level: int, model_uri: str, name: str, await_creation_for: int): + """Register a model via MLFlowLogger.register_model_with_run_id. + + Intended to be called from a spawned child process. + """ + # Set up logging for child process. This ensures that any logs from composer are surfaced. logging.basicConfig( format= f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) logging.getLogger('composer').setLevel(logging_level) + + # Register model. mlflow_logger.register_model_with_run_id( model_uri=model_uri, name=name, await_creation_for=await_creation_for) @@ -411,7 +418,7 @@ def _save_checkpoint(self, state: State, logger: Logger): os.path.join(local_save_path, license_filename), ) - # Register the model to mlflow in a child process. + # Spawn a new process to register the model. process = SpawnProcess( target=_register_model_with_run_id_multiprocess, kwargs={