From 890d9ee033a5a5d10e378673f2784ccecd789098 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 19 Nov 2024 14:24:06 -0800 Subject: [PATCH] Bump ubuntu 22.04 + fix CI mlflow tests (#3716) Co-authored-by: v-chen_data --- docker/Dockerfile | 9 ++- docker/README.md | 18 +++--- docker/build_matrix.yaml | 58 +++++++++---------- docker/generate_build_matrix.py | 12 ++-- tests/fixtures/autouse_fixtures.py | 8 +++ .../object_store/test_mlflow_object_store.py | 5 ++ tests/utils/test_dist.py | 4 +- 7 files changed, 65 insertions(+), 49 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c3f4dee9073..62787b4fe62 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,8 +11,8 @@ ARG CUDA_VERSION=11.3.1 # Calculate the base image based on CUDA_VERSION -ARG BASE_IMAGE=${CUDA_VERSION:+"nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04"} -ARG BASE_IMAGE=${BASE_IMAGE:-"ubuntu:20.04"} +ARG BASE_IMAGE=${CUDA_VERSION:+"nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04"} +ARG BASE_IMAGE=${BASE_IMAGE:-"ubuntu:22.04"} # The Python version to install ARG PYTHON_VERSION=3.10 @@ -251,7 +251,7 @@ ARG MOFED_VERSION RUN if [ -n "$MOFED_VERSION" ] ; then \ wget -qO - http://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | sudo apt-key add - && \ - wget -P /etc/apt/sources.list.d/ http://linux.mellanox.com/public/repo/mlnx_ofed/$MOFED_VERSION/ubuntu20.04/mellanox_mlnx_ofed.list && \ + wget -P /etc/apt/sources.list.d/ http://linux.mellanox.com/public/repo/mlnx_ofed/$MOFED_VERSION/ubuntu22.04/mellanox_mlnx_ofed.list && \ apt-get update && \ apt-get install -y mlnx-ofed-dpdk-upstream-libs-user-only ; \ fi @@ -325,6 +325,9 @@ RUN pip install --no-cache-dir --upgrade \ urllib3${URLLIB3_VERSION} \ python-snappy +RUN apt-get remove -y python3-blinker +RUN pip install blinker + ################################################## # Override NVIDIA mistaken env var for 11.8 images ################################################## diff --git a/docker/README.md b/docker/README.md index d4af7cc4adc..41bd0e51b68 100644 --- a/docker/README.md +++ b/docker/README.md @@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`. | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags | |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Ubuntu 20.04 | Base | 2.5.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.5.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.5.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` | -| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` | -| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` | +| Ubuntu 22.04 | Base | 2.5.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04` | +| Ubuntu 22.04 | Base | 2.5.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws` | +| Ubuntu 22.04 | Base | 2.5.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04` | +| Ubuntu 22.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu22.04` | +| Ubuntu 22.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu22.04-aws` | +| Ubuntu 22.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu22.04` | +| Ubuntu 22.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu22.04` | +| Ubuntu 22.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu22.04-aws` | +| Ubuntu 22.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu22.04` | **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws` diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 9c3d9eed36c..b062299ecad 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -1,6 +1,6 @@ # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT! - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 CUDA_VERSION: 12.4.1 IMAGE_NAME: torch-2-5-1-cu124 MOFED_VERSION: latest-23.10 @@ -10,14 +10,14 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.5.1 TAGS: - - mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.5.1_cu124-python3.11-ubuntu22.04 - mosaicml/pytorch:latest - ghcr.io/databricks-mosaic/pytorch:latest TARGET: pytorch_stage TORCHVISION_VERSION: 0.20.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws - BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 CUDA_VERSION: 12.4.1 IMAGE_NAME: torch-2-5-1-cu124-aws MOFED_VERSION: '' @@ -27,14 +27,14 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.5.1 TAGS: - - mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.5.1_cu124-python3.11-ubuntu22.04-aws - mosaicml/pytorch:latest-aws - ghcr.io/databricks-mosaic/pytorch:latest-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.20.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 + BASE_IMAGE: ubuntu:22.04 CUDA_VERSION: '' IMAGE_NAME: torch-2-5-1-cpu MOFED_VERSION: '' @@ -44,14 +44,14 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.5.1 TAGS: - - mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.5.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.5.1_cpu-python3.11-ubuntu22.04 - mosaicml/pytorch:latest_cpu - ghcr.io/databricks-mosaic/pytorch:latest_cpu TARGET: pytorch_stage TORCHVISION_VERSION: 0.20.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 CUDA_VERSION: 12.4.1 IMAGE_NAME: torch-2-4-1-cu124 MOFED_VERSION: latest-23.10 @@ -61,12 +61,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu22.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws - BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 CUDA_VERSION: 12.4.1 IMAGE_NAME: torch-2-4-1-cu124-aws MOFED_VERSION: '' @@ -76,12 +76,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu22.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu22.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 + BASE_IMAGE: ubuntu:22.04 CUDA_VERSION: '' IMAGE_NAME: torch-2-4-1-cpu MOFED_VERSION: '' @@ -91,12 +91,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.4.1 TAGS: - - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu22.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.19.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-3-1-cu121 MOFED_VERSION: latest-23.10 @@ -119,12 +119,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu22.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: v1.11.0-aws - BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 CUDA_VERSION: 12.1.1 IMAGE_NAME: torch-2-3-1-cu121-aws MOFED_VERSION: '' @@ -147,12 +147,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu22.04-aws + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu22.04-aws TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 + BASE_IMAGE: ubuntu:22.04 CUDA_VERSION: '' IMAGE_NAME: torch-2-3-1-cpu MOFED_VERSION: '' @@ -162,12 +162,12 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.3.1 TAGS: - - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 - - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu22.04 + - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu22.04 TARGET: pytorch_stage TORCHVISION_VERSION: 0.18.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 + BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.27.0 CUDA_VERSION: 12.4.1 IMAGE_NAME: composer-0-27-0 @@ -185,7 +185,7 @@ TARGET: composer_stage TORCHVISION_VERSION: 0.20.1 - AWS_OFI_NCCL_VERSION: '' - BASE_IMAGE: ubuntu:20.04 + BASE_IMAGE: ubuntu:22.04 COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.27.0 CUDA_VERSION: '' IMAGE_NAME: composer-0-27-0-cpu diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index 2d6c713a4af..1fb476b3514 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -35,10 +35,10 @@ def _get_torchvision_version(pytorch_version: str): def _get_base_image(cuda_version: str): if not cuda_version: - return 'ubuntu:20.04' + return 'ubuntu:22.04' if cuda_version == '12.4.1': - return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04' - return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04' + return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04' + return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu22.04' def _get_cuda_version(pytorch_version: str, use_cuda: bool): @@ -112,8 +112,8 @@ def _get_pytorch_tags(python_version: str, pytorch_version: str, cuda_version: s tags = [] cuda_version_tag = _get_cuda_version_tag(cuda_version) tags += [ - f'{base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04', - f'{ghcr_base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu20.04', + f'{base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu22.04', + f'{ghcr_base_image_name}:{pytorch_version}_{cuda_version_tag}-python{python_version}-ubuntu22.04', ] if python_version == PRODUCTION_PYTHON_VERSION and pytorch_version == PRODUCTION_PYTORCH_VERSION: @@ -294,7 +294,7 @@ def _main(): interconnect = 'EFA' cuda_version = f"{entry['CUDA_VERSION']} ({interconnect})" if entry['CUDA_VERSION'] else 'cpu' table.append([ - 'Ubuntu 20.04', # Linux distro + 'Ubuntu 22.04', # Linux distro 'Base', # Flavor entry['PYTORCH_VERSION'], # Pytorch version cuda_version, # Cuda version diff --git a/tests/fixtures/autouse_fixtures.py b/tests/fixtures/autouse_fixtures.py index c881157353f..b42a53d8dda 100644 --- a/tests/fixtures/autouse_fixtures.py +++ b/tests/fixtures/autouse_fixtures.py @@ -148,3 +148,11 @@ def remove_run_name_env_var(): os.environ['COMPOSER_RUN_NAME'] = composer_run_name if run_name is not None: os.environ['RUN_NAME'] = run_name + + +@pytest.fixture(autouse=True) +def setup_mlflow_tracking(monkeypatch, tmp_path): + mlflow = pytest.importorskip('mlflow') + tracking_uri = str(tmp_path / 'mlruns') + monkeypatch.setenv(mlflow.environment_variables.MLFLOW_TRACKING_URI.name, tracking_uri) + os.makedirs(tracking_uri, exist_ok=True) diff --git a/tests/utils/object_store/test_mlflow_object_store.py b/tests/utils/object_store/test_mlflow_object_store.py index fd802048bc3..20280cdcdef 100644 --- a/tests/utils/object_store/test_mlflow_object_store.py +++ b/tests/utils/object_store/test_mlflow_object_store.py @@ -19,6 +19,11 @@ DEFAULT_PATH = TEST_PATH_FORMAT.format(experiment_id=EXPERIMENT_ID, run_id=RUN_ID) +@pytest.fixture(autouse=True) +def setup_mlflow_tracking(monkeypatch): + monkeypatch.setenv('MLFLOW_TRACKING_URI', 'databricks') + + def test_parse_dbfs_path(): full_artifact_path = DEFAULT_PATH + ARTIFACT_PATH assert MLFlowObjectStore.parse_dbfs_path(full_artifact_path) == (EXPERIMENT_ID, RUN_ID, ARTIFACT_PATH) diff --git a/tests/utils/test_dist.py b/tests/utils/test_dist.py index 608e56e5d27..0d6115dc4d0 100644 --- a/tests/utils/test_dist.py +++ b/tests/utils/test_dist.py @@ -63,7 +63,7 @@ def test_busy_wait_for_local_rank_zero(tmp_path): dist.barrier() start_time = time.time() - assert os.listdir(gathered_tmp_path) == [] + assert os.listdir(gathered_tmp_path) == ['mlruns'] with dist.busy_wait_for_local_rank_zero(gathered_tmp_path): if dist.get_local_rank() == 0: time.sleep(0.5) @@ -71,6 +71,6 @@ def test_busy_wait_for_local_rank_zero(tmp_path): end_time = time.time() total_time = end_time - start_time gathered_times = dist.all_gather_object(total_time) - assert os.listdir(gathered_tmp_path) == [] + assert os.listdir(gathered_tmp_path) == ['mlruns'] assert len(gathered_times) == 2 assert abs(gathered_times[0] - gathered_times[1]) < 0.1