Log all gpu rank stdout/err to MosaicML platform #2839

Merged · 39 commits merged into dev from jane/log-all-gpu-ranks · Feb 5, 2024
Changes shown are from 14 of the 39 commits.

Commits (39)
7060757
logging all gpu ranks
jjanezhang Jan 11, 2024
b0b6fab
check if mosaic log dir is not none
jjanezhang Jan 11, 2024
a310bcb
refactor
jjanezhang Jan 11, 2024
d9d85f9
Added logging
jjanezhang Jan 11, 2024
1b37dc4
updated python file command
jjanezhang Jan 11, 2024
5212a81
redirecting to sep stderr and out files
jjanezhang Jan 24, 2024
1734a2f
added constant
jjanezhang Jan 24, 2024
4fa9f1a
Merge branch 'dev' of github.com:mosaicml/composer into jane/log-all-…
jjanezhang Jan 24, 2024
fd828cd
try logging to two files
jjanezhang Jan 24, 2024
1dc1602
teeing to two files
jjanezhang Jan 25, 2024
8222f59
chronological logs
jjanezhang Jan 26, 2024
782b3d8
Merge branch 'dev' of github.com:mosaicml/composer into jane/log-all-…
jjanezhang Jan 26, 2024
8271690
formatted merged in dev
jjanezhang Jan 26, 2024
6699d0c
reassign process std to log file
jjanezhang Jan 26, 2024
0dd64de
formatting all logs exceptions
jjanezhang Jan 26, 2024
30b79bb
adding error formatting
jjanezhang Jan 26, 2024
64a2747
removed process call
jjanezhang Jan 26, 2024
963b535
Merge branch 'dev' into jane/log-all-gpu-ranks
dakinggg Jan 31, 2024
a7a669f
Merge branch 'dev' into jane/log-all-gpu-ranks
dakinggg Feb 1, 2024
8ece4d8
default args std if not mosaic platform
jjanezhang Feb 2, 2024
8484d82
Merge branch 'jane/log-all-gpu-ranks' of github.com:mosaicml/composer…
jjanezhang Feb 2, 2024
f24394c
updated formatting to say STDOUT if not on mosaic
jjanezhang Feb 2, 2024
9bd1436
fixed typing
jjanezhang Feb 2, 2024
fced4bf
Merge branch 'dev' into jane/log-all-gpu-ranks
dakinggg Feb 2, 2024
fa4b8c5
added warning, renamed env var
jjanezhang Feb 2, 2024
7337fab
Merge branch 'jane/log-all-gpu-ranks' of github.com:mosaicml/composer…
jjanezhang Feb 2, 2024
2a47a10
updated warning
jjanezhang Feb 2, 2024
8648185
Merge branch 'dev' into jane/log-all-gpu-ranks
jjanezhang Feb 2, 2024
a255099
added instructions to override ignore, default log env var to empty str
jjanezhang Feb 2, 2024
c84790b
Merge branch 'jane/log-all-gpu-ranks' of github.com:mosaicml/composer…
jjanezhang Feb 2, 2024
398d6e0
set default for log dir
jjanezhang Feb 2, 2024
d44f9af
Merge branch 'dev' into jane/log-all-gpu-ranks
jjanezhang Feb 2, 2024
4dbb767
changed message
jjanezhang Feb 2, 2024
8ae4a16
Merge branch 'dev' of github.com:mosaicml/composer into jane/log-all-…
jjanezhang Feb 2, 2024
de48f75
Merge branch 'jane/log-all-gpu-ranks' of github.com:mosaicml/composer…
jjanezhang Feb 2, 2024
5fab513
set platform to false for provided stdout and err
jjanezhang Feb 2, 2024
47693c9
changed default for log dir to false
jjanezhang Feb 3, 2024
366a811
Update composer/cli/launcher.py
jjanezhang Feb 5, 2024
a6e420e
Merge branch 'dev' into jane/log-all-gpu-ranks
jjanezhang Feb 5, 2024
53 changes: 18 additions & 35 deletions composer/cli/launcher.py
@@ -21,6 +21,7 @@
import torch

import composer
from composer.loggers.mosaicml_logger import MOSAICML_ACCESS_TOKEN_ENV_VAR, MOSAICML_LOG_DIR, MOSAICML_PLATFORM_ENV_VAR
from composer.utils import get_free_tcp_port

CLEANUP_TIMEOUT = datetime.timedelta(seconds=30)
@@ -259,8 +260,7 @@ def _launch_processes(
module_mode: bool,
command_mode: bool,
training_script: str,
stdout_file_format: str,
stderr_file_format: str,
log_file_format: str,
training_script_args: List[Any],
processes: Dict[int, subprocess.Popen],
):
@@ -313,19 +313,17 @@ def _get_file(format: str):
local_world_size=nproc,
node_rank=node_rank,
)
return open(filename, 'x+')
return open(filename, 'a+')

stderr_file = _get_file(stderr_file_format)
stdout_file = _get_file(stdout_file_format)
log_file = _get_file(log_file_format)

process = subprocess.Popen(
cmd,
stdout=stdout_file,
stderr=stderr_file,
stdout=log_file,
stderr=subprocess.STDOUT,
text=True,
)
process.stderr = stderr_file
process.stdout = stdout_file
process.stdout = log_file
processes[global_rank] = process


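As context for this hunk: instead of opening separate stdout/stderr files per rank, the launcher now opens one log file per rank and folds stderr into stdout so the two streams stay in chronological order. Below is a minimal sketch of that pattern, assuming an illustrative filename and training command rather than the launcher's real values:

```python
import subprocess

# Illustrative per-rank path; the launcher builds this from a format string
# with rank/world-size placeholders.
log_file = open('rank0.txt', 'a+')

# stderr=subprocess.STDOUT interleaves stderr into the same stream as stdout,
# so a single file captures both in the order they were produced.
process = subprocess.Popen(
    ['python', 'train.py'],  # hypothetical training command
    stdout=log_file,
    stderr=subprocess.STDOUT,
    text=True,
)

# Popen leaves .stdout as None when a plain file object is passed, so the
# launcher reattaches the file handle to read the log back after exit.
process.stdout = log_file
```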
@@ -360,32 +358,16 @@ def _print_process_exit_status(global_rank: int, process: subprocess.Popen):
if process.stdout is None:
output = None
else:
print(f'stdout found for rank {global_rank}')
process.stdout.seek(0)
output = process.stdout.read()

if process.stderr is None:
stderr = None
else:
process.stderr.seek(0)
stderr = process.stderr.read()
exc = subprocess.CalledProcessError(
process.returncode,
cmd=process.args,
output=output,
stderr=stderr,
)
error_msg = [f'Global rank {global_rank} (PID {process.pid}) exited with code {process.returncode}']
if output is not None:
error_msg.extend([
f'----------Begin global rank {global_rank} STDOUT----------',
f'----------Begin global rank {global_rank} logs----------',
output,
f'----------End global rank {global_rank} STDOUT----------',
])
if stderr is not None:
error_msg.extend([
f'----------Begin global rank {global_rank} STDERR----------',
exc.stderr,
f'----------End global rank {global_rank} STDERR----------',
f'----------End global rank {global_rank} logs----------',
])
print('\n'.join(error_msg))

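Reassembled for readability, the new error-reporting path in this hunk reads the merged log back through the file handle stashed on the Popen object; there is no longer a separate stderr stream to collect. A rough sketch (the transient debug print present at this commit is omitted, and the function name is illustrative):

```python
import subprocess


def print_exit_status(global_rank: int, process: subprocess.Popen) -> None:
    """Sketch of reporting a rank's exit using the single merged log file."""
    output = None
    if process.stdout is not None:
        process.stdout.seek(0)  # rewind the 'a+' file to read from the beginning
        output = process.stdout.read()

    error_msg = [f'Global rank {global_rank} (PID {process.pid}) exited with code {process.returncode}']
    if output is not None:
        error_msg.extend([
            f'----------Begin global rank {global_rank} logs----------',
            output,
            f'----------End global rank {global_rank} logs----------',
        ])
    print('\n'.join(error_msg))
```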
@@ -466,12 +448,14 @@ def main():
log.setLevel(logging.INFO if args.verbose else logging.WARN)

processes = {}

log_tmpdir = tempfile.TemporaryDirectory()
if args.stdout is None:
args.stdout = f'{log_tmpdir.name}/rank{{rank}}.stdout.txt'
if args.stderr is None:
args.stderr = f'{log_tmpdir.name}/rank{{rank}}.stderr.txt'
log_file_format = f'{log_tmpdir.name}/rank{{rank}}.txt'

# If running on the Mosaic platform, also log all gpu ranks' stderr and stdout to Mosaic platform
if os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true' and os.environ.get(
MOSAICML_ACCESS_TOKEN_ENV_VAR) is not None and os.environ.get(MOSAICML_LOG_DIR) is not None:
log.info('Logging all gpu ranks to Mosaic Platform')
log_file_format = f'{os.environ.get(MOSAICML_LOG_DIR)}/gpu_{{rank}}.txt'

try:
_launch_processes(nproc=args.nproc,
@@ -482,8 +466,7 @@
master_port=args.master_port,
module_mode=args.module_mode,
command_mode=args.command_mode,
stdout_file_format=args.stdout,
stderr_file_format=args.stderr,
log_file_format=log_file_format,
training_script=args.training_script,
training_script_args=args.training_script_args,
processes=processes)
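Putting the main() changes together: the two per-rank stdout/stderr format strings collapse into a single log_file_format, which points at a temporary directory by default and is redirected to the platform's log directory only when all of the MosaicML environment variables are present. A condensed sketch of that selection logic, using the constants imported at the top of launcher.py (the surrounding scaffolding is illustrative):

```python
import os
import tempfile

from composer.loggers.mosaicml_logger import (
    MOSAICML_ACCESS_TOKEN_ENV_VAR,
    MOSAICML_LOG_DIR,
    MOSAICML_PLATFORM_ENV_VAR,
)

log_tmpdir = tempfile.TemporaryDirectory()
# Doubled braces keep a literal {rank} placeholder for the launcher to fill in.
log_file_format = f'{log_tmpdir.name}/rank{{rank}}.txt'

# Divert per-rank logs to the MosaicML platform only when the platform flag,
# an access token file, and a log directory are all configured.
if (os.environ.get(MOSAICML_PLATFORM_ENV_VAR, 'false').lower() == 'true'
        and os.environ.get(MOSAICML_ACCESS_TOKEN_ENV_VAR) is not None
        and os.environ.get(MOSAICML_LOG_DIR) is not None):
    log_file_format = f'{os.environ[MOSAICML_LOG_DIR]}/gpu_{{rank}}.txt'
```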
1 change: 1 addition & 0 deletions composer/loggers/mosaicml_logger.py
@@ -36,6 +36,7 @@
RUN_NAME_ENV_VAR = 'RUN_NAME'
MOSAICML_PLATFORM_ENV_VAR = 'MOSAICML_PLATFORM'
MOSAICML_ACCESS_TOKEN_ENV_VAR = 'MOSAICML_ACCESS_TOKEN_FILE'
MOSAICML_LOG_DIR = 'MOSAICML_LOG_DIR'


class MosaicMLLogger(LoggerDestination):