diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 86fd7a9e7cd78..e91382bec6b10 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -219,6 +219,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue that could cause the `LightningOptimizer` wrapper returned by `LightningModule.optimizers()` have different internal state than the optimizer it wraps ([#18280](https://github.com/Lightning-AI/lightning/pull/18280))
 
+- Fixed a race condition when determining logger experiment version (TensorBoardLogger and CSVLogger) ([#18309](https://github.com/Lightning-AI/lightning/pull/18309))
 
 - Fixed model parameters getting shared between processes when running with `strategy="ddp_spawn"` and `accelerator="cpu"`; this has a necessary memory impact, as parameters are replicated for each process now ([#18238](https://github.com/Lightning-AI/lightning/pull/18238))
 
diff --git a/src/lightning/pytorch/trainer/call.py b/src/lightning/pytorch/trainer/call.py
index 2eab1bac09c0f..e42a6c3ba9d6f 100644
--- a/src/lightning/pytorch/trainer/call.py
+++ b/src/lightning/pytorch/trainer/call.py
@@ -81,13 +81,13 @@ def _call_setup_hook(trainer: "pl.Trainer") -> None:
         if isinstance(module, _DeviceDtypeModuleMixin):
             module._device = trainer.strategy.root_device
 
+    trainer.strategy.barrier("pre_setup")
+
     # Trigger lazy creation of experiment in loggers so loggers have their metadata available
     for logger in trainer.loggers:
         if hasattr(logger, "experiment"):
             _ = logger.experiment
 
-    trainer.strategy.barrier("pre_setup")
-
     if trainer.datamodule is not None:
         _call_lightning_datamodule_hook(trainer, "setup", stage=fn)
     _call_callback_hooks(trainer, "setup", stage=fn)
diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py
index 3f090b0264d02..a3dcafc21658c 100644
--- a/tests/tests_pytorch/loggers/test_all.py
+++ b/tests/tests_pytorch/loggers/test_all.py
@@ -382,3 +382,35 @@ def test_logger_default_name(tmpdir, monkeypatch):
         logger._mlflow_client.create_experiment.assert_called_with(name="lightning_logs", artifact_location=ANY)
         # on MLFLowLogger `name` refers to the experiment id
         # assert logger.experiment.get_experiment(logger.name).name == "lightning_logs"
+
+
+class CheckVersion(BoringModel):
+    def __init__(self, expected_version):
+        super().__init__()
+        self.expected_version = expected_version
+
+    def on_train_start(self):
+        # Every rank must agree on the version the logger determined
+        assert self.logger.version == self.expected_version, f"{self.logger.version}"
+
+
+@pytest.mark.parametrize("logger_class", [TensorBoardLogger, CSVLogger])
+def test_logger_same_version_across_ranks(logger_class, tmp_path):
+    trainer_kwargs = {
+        "logger": logger_class(tmp_path),
+        "default_root_dir": tmp_path,
+        "devices": 2,
+        "accelerator": "cpu",
+        "strategy": "ddp_spawn",
+        "max_steps": 1,
+    }
+    # The first run creates the version_0 folder
+    model = CheckVersion(expected_version=0)
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+
+    # A fresh logger is required here: the version is cached on the instance after first access
+    trainer_kwargs["logger"] = logger_class(tmp_path)
+    model = CheckVersion(expected_version=1)
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
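
For context on the race this patch closes: file-based loggers such as `TensorBoardLogger` and `CSVLogger` determine their experiment version lazily, by scanning the log root for existing `version_*` directories. The following is a minimal sketch of that scheme, not the actual Lightning source; `get_next_version` is a hypothetical stand-in for the loggers' private `_get_next_version` helper.

```python
import os


def get_next_version(root_dir: str) -> int:
    """Hypothetical sketch: pick the next version by scanning version_* dirs."""
    existing = []
    if os.path.isdir(root_dir):
        for name in os.listdir(root_dir):
            # Only directories named "version_<integer>" count as prior runs
            suffix = name.rsplit("_", 1)[-1]
            if name.startswith("version_") and suffix.isdigit():
                existing.append(int(suffix))
    return max(existing) + 1 if existing else 0
```

Because the version is inferred from whatever is on disk at scan time, a rank that lists the directory after another process has already materialized a new `version_<n>` folder (which creating the experiment does) can compute a different version, which is the window the reordered barrier in `_call_setup_hook` is meant to close.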