diff --git a/sky/clouds/service_catalog/images/README.md b/sky/clouds/service_catalog/images/README.md
new file mode 100644
index 00000000000..31ce7c6d9ce
--- /dev/null
+++ b/sky/clouds/service_catalog/images/README.md
@@ -0,0 +1,72 @@
+# SkyPilot OS Image Generation Guide
+
+## Prerequisites
+You only need to do this once.
+1. Install [Packer](https://developer.hashicorp.com/packer/tutorials/aws-get-started/get-started-install-cli)
+2. Download the plugins used by Packer
+```bash
+packer init plugins.pkr.hcl
+```
+3. Set up cloud credentials (e.g., via `aws configure` and `gcloud auth login`)
+
+## Generate Images
+```bash
+export CLOUD=gcp # Update this
+export TYPE=gpu # Update this
+export IMAGE=skypilot-${CLOUD}-${TYPE}-ubuntu
+packer build ${IMAGE}.pkr.hcl
+```
+The image ID is printed once the build completes.
+
+For reference, approximate `packer build` times:
+
+| Cloud | Type | Approx. Time |
+|-------|------|------------------------|
+| AWS | GPU | 15 min |
+| AWS | CPU | 10 min |
+| GCP | GPU | 16 min |
+| GCP | CPU | 5 min |
+
+### GCP
+```bash
+export IMAGE_NAME=skypilot-gcp-cpu-ubuntu-20241011003407 # Update this
+
+# Make the image public
+export IMAGE_ID=projects/sky-dev-465/global/images/${IMAGE_NAME}
+gcloud compute images add-iam-policy-binding ${IMAGE_NAME} --member='allAuthenticatedUsers' --role='roles/compute.imageUser'
+```
+
+### AWS
+1. Generate images for all regions
+```bash
+export IMAGE_ID=ami-0b31b24524afa8e47 # Update this
+
+python aws_utils/image_gen.py --image-id ${IMAGE_ID} --processor ${TYPE}
+```
+2. Add fallback images if any region failed \
+Look for "NEED_FALLBACK" in the output `images.csv` and edit those rows. (You can use public [ubuntu images](https://cloud-images.ubuntu.com/locator/ec2/) as fallbacks.)
+
+## Test Images
+1. Minimal GPU test: `sky launch --image-id ${IMAGE_ID} --gpus=L4:1 --cloud ${CLOUD}`, then run `nvidia-smi` in the launched instance.
+2. Update the image ID in `sky/clouds/gcp.py` and run the tests:
+```bash
+pytest tests/test_smoke.py::test_minimal --gcp
+pytest tests/test_smoke.py::test_huggingface --gcp
+pytest tests/test_smoke.py::test_job_queue_with_docker --gcp
+pytest tests/test_smoke.py::test_cancel_gcp
+```
+
+## Ship Images & Cleanup
+Submit a PR to update the [`SkyPilot Catalog`](https://github.com/skypilot-org/skypilot-catalog/tree/master/catalogs), then clean up the old images to avoid extra image storage fees.
+
+### GCP
+1. Example PR: [#86](https://github.com/skypilot-org/skypilot-catalog/pull/86)
+2. Go to the console and delete the old images.
+
+### AWS
+1. Copy the old custom image rows from the Catalog's existing `images.csv` to a local `images.csv` in this folder.
+2. Update the Catalog with the new images. Example PR: [#89](https://github.com/skypilot-org/skypilot-catalog/pull/89)
+3. Delete the AMIs across regions by running
+```bash
+python aws_utils/image_delete.py --tag ${TAG}
+```
diff --git a/sky/clouds/service_catalog/images/aws_utils/image_delete.py b/sky/clouds/service_catalog/images/aws_utils/image_delete.py
new file mode 100644
index 00000000000..52cbb5b2382
--- /dev/null
+++ b/sky/clouds/service_catalog/images/aws_utils/image_delete.py
@@ -0,0 +1,63 @@
+"""Delete all images with a given tag, and their associated snapshots, listed in images.csv.
+
+Example Usage: put images.csv in the same folder as this script and run
+    python image_delete.py --tag skypilot:custom-gpu-ubuntu-2204
+"""
+
+import argparse
+import csv
+import json
+import subprocess
+
+parser = argparse.ArgumentParser(
+    description='Delete AWS images and their snapshots across regions.')
+parser.add_argument('--tag',
+                    required=True,
+                    help='Tag of the images to delete; see tags in images.csv')
+args = parser.parse_args()
+
+
+def get_snapshots(image_id, region):
+    cmd = f'aws ec2 describe-images --image-ids {image_id} --region {region} --query "Images[*].BlockDeviceMappings[*].Ebs.SnapshotId" --output json'
+    result = subprocess.run(cmd,
+                            shell=True,
+                            check=True,
+                            capture_output=True,
+                            text=True)
+    snapshots = json.loads(result.stdout)
+    return [
+        snapshot for sublist in snapshots for snapshot in sublist if snapshot
+    ]
+
+
+def delete_image_and_snapshots(image_id, region):
+    # Must fetch the snapshot IDs before deregistering the image; they are
+    # no longer queryable through the image afterwards.
+    snapshots = get_snapshots(image_id, region)
+
+    # Deregister the image
+    cmd = f'aws ec2 deregister-image --image-id {image_id} --region {region}'
+    subprocess.run(cmd, shell=True, check=True)
+    print(f'Deregistered image {image_id} in region {region}')
+
+    # Delete the image's snapshots
+    for snapshot in snapshots:
+        cmd = f'aws ec2 delete-snapshot --snapshot-id {snapshot} --region {region}'
+        subprocess.run(cmd, shell=True, check=True)
+        print(f'Deleted snapshot {snapshot} in region {region}')
+
+
+def main():
+    with open('images.csv', 'r') as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            if row['Tag'] == args.tag:
+                try:
+                    delete_image_and_snapshots(row['ImageId'], row['Region'])
+                except subprocess.CalledProcessError as e:
+                    print(
+                        f'Failed to delete image {row["ImageId"]} or its snapshots in region {row["Region"]}: {e}'
+                    )
+
+
+if __name__ == '__main__':
+    main()
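Since deregistering AMIs is irreversible, it can help to preview the rows that match a tag before running the script — a minimal sketch, assuming `images.csv` is in the current directory:

```bash
# Show the images.csv rows (tag, region, image ID, ...) the script would delete.
grep 'skypilot:custom-gpu-ubuntu-2204' images.csv
```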
diff --git a/sky/clouds/service_catalog/images/aws_utils/image_gen.py b/sky/clouds/service_catalog/images/aws_utils/image_gen.py
new file mode 100644
index 00000000000..cb39355ad2c
--- /dev/null
+++ b/sky/clouds/service_catalog/images/aws_utils/image_gen.py
@@ -0,0 +1,151 @@
+"""Copy a SkyPilot AMI to multiple regions, make the copies public, and generate images.csv.
+
+Example Usage:
+    python image_gen.py --image-id ami-00000 --processor gpu
+"""
+
+import argparse
+import concurrent.futures
+import csv
+import json
+import os
+import subprocess
+import time
+
+parser = argparse.ArgumentParser(
+    description='Generate AWS images across regions.')
+parser.add_argument('--image-id',
+                    required=True,
+                    help='The source AMI ID to copy from')
+parser.add_argument('--processor', required=True, help='e.g. gpu, cpu, etc.')
+parser.add_argument('--region',
+                    default='us-east-1',
+                    help='Region of the source AMI')
+parser.add_argument('--base-image-id',
+                    default='ami-005fc0f236362e99f',
+                    help='The base AMI of the source AMI.')
+parser.add_argument('--os-type', default='ubuntu', help='The OS type')
+parser.add_argument('--os-version', default='22.04', help='The OS version')
+parser.add_argument('--output-csv',
+                    default='images.csv',
+                    help='The output CSV file name')
+args = parser.parse_args()
+
+# 25 regions in total, including the commented-out source region.
+ALL_REGIONS = [
+    # 'us-east-1',  # Source AMI is already in this region
+    'us-east-2',
+    'us-west-1',
+    'us-west-2',
+    'ca-central-1',
+    'eu-central-1',  # needed for smoke tests
+    'eu-central-2',
+    'eu-west-1',
+    'eu-west-2',
+    'eu-south-1',
+    'eu-south-2',
+    'eu-west-3',
+    'eu-north-1',
+    'me-south-1',
+    'me-central-1',
+    'af-south-1',
+    'ap-east-1',
+    'ap-south-1',
+    'ap-south-2',
+    'ap-northeast-3',
+    'ap-northeast-2',
+    'ap-southeast-1',
+    'ap-southeast-2',
+    'ap-southeast-3',
+    'ap-northeast-1',
+]
+
+
+def make_image_public(image_id, region):
+    unblock_command = f'aws ec2 disable-image-block-public-access --region {region}'
+    subprocess.run(unblock_command, shell=True, check=True)
+    # The escaped quotes below expand to the JSON {"Add": [{"Group":"all"}]},
+    # which grants launch permission to all AWS accounts.
+    public_command = (
+        f'aws ec2 modify-image-attribute --image-id {image_id} '
+        f'--launch-permission "{{\\\"Add\\\": [{{\\\"Group\\\":\\\"all\\\"}}]}}" --region {region}'
+    )
+    subprocess.run(public_command, shell=True, check=True)
+    print(f'Made {image_id} public')
+
+
+def copy_image_and_make_public(target_region):
+    # Copy the AMI to the target region
+    copy_command = (
+        f'aws ec2 copy-image --source-region {args.region} '
+        f'--source-image-id {args.image_id} --region {target_region} '
+        f"--name 'skypilot-aws-{args.processor}-{args.os_type}-{time.time()}' --output json"
+    )
+    print(copy_command)
+    result = subprocess.run(copy_command,
+                            shell=True,
+                            check=True,
+                            capture_output=True,
+                            text=True)
+    print(result.stdout)
+    new_image_id = json.loads(result.stdout)['ImageId']
+    print(f'Copied image to {target_region} with new image ID: {new_image_id}')
+
+    # Wait for the image to be available
+    print(f'Waiting for {new_image_id} to be available...')
+    wait_command = f'aws ec2 wait image-available --image-ids {new_image_id} --region {target_region}'
+    subprocess.run(wait_command, shell=True, check=True)
+
+    make_image_public(new_image_id, target_region)
+
+    return new_image_id
+
+
+def write_image_to_csv(image_id, region):
+    with open(args.output_csv, 'a', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        row = [
+            f'skypilot:custom-{args.processor}-{args.os_type}', region,
+            args.os_type, args.os_version, image_id,
+            time.strftime('%Y%m%d'), args.base_image_id
+        ]
+        writer.writerow(row)
+        print(f'Wrote to CSV: {row}')
+
+
+def main():
+    make_image_public(args.image_id, args.region)
+    if not os.path.exists(args.output_csv):
+        with open(args.output_csv, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow([
+                'Tag', 'Region', 'OS', 'OSVersion', 'ImageId', 'CreationDate',
+                'BaseImageId'
+            ])  # Header
+        print(f'No existing {args.output_csv}, so created it.')
+
+    # Process the other regions
+    image_cache = [(args.image_id, args.region)]
+
+    def process_region(copy_to_region):
+        print(f'Start copying image to {copy_to_region}...')
+        try:
+            new_image_id = copy_image_and_make_public(copy_to_region)
+        except Exception as e:
+            print(f'Error copying image to {copy_to_region}: {str(e)}')
+            new_image_id = 'NEED_FALLBACK'
+        image_cache.append((new_image_id, copy_to_region))
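+
+    # CPython's GIL makes list.append effectively atomic, so the worker
+    # threads below can append to image_cache without an explicit lock.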
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        executor.map(process_region, ALL_REGIONS)
+
+    # Sort the images by their region and write to CSV
+    sorted_image_cache = sorted(image_cache, key=lambda x: x[1])
+    for new_image_id, copy_to_region in sorted_image_cache:
+        write_image_to_csv(new_image_id, copy_to_region)
+
+    print('All done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sky/clouds/service_catalog/images/plugins.pkr.hcl b/sky/clouds/service_catalog/images/plugins.pkr.hcl
new file mode 100644
index 00000000000..e007c1723bf
--- /dev/null
+++ b/sky/clouds/service_catalog/images/plugins.pkr.hcl
@@ -0,0 +1,17 @@
+packer {
+  required_plugins {
+    amazon = {
+      version = ">= 1.2.8"
+      source  = "github.com/hashicorp/amazon"
+    }
+    googlecompute = {
+      version = ">= 1.1.1"
+      source  = "github.com/hashicorp/googlecompute"
+    }
+  }
+}
diff --git a/sky/clouds/service_catalog/images/provisioners/cloud.sh b/sky/clouds/service_catalog/images/provisioners/cloud.sh
new file mode 100644
index 00000000000..b326c9fde51
--- /dev/null
+++ b/sky/clouds/service_catalog/images/provisioners/cloud.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+PYTHON_EXEC=$HOME/skypilot-runtime/bin/python
+
+# TODO: keep this dependency installation aligned with utils/controller_utils.py and setup.py
+install_azure() {
+    echo "Install cloud dependencies on controller: Azure"
+    $PYTHON_EXEC -m pip install "azure-cli>=2.31.0" azure-core "azure-identity>=1.13.0" azure-mgmt-network
+    $PYTHON_EXEC -m pip install azure-storage-blob msgraph-sdk
+}
+
+install_gcp() {
+    echo "Install cloud dependencies on controller: GCP"
+    $PYTHON_EXEC -m pip install "google-api-python-client>=2.69.0"
+    $PYTHON_EXEC -m pip install google-cloud-storage
+    if ! gcloud --help > /dev/null 2>&1; then
+        pushd /tmp &>/dev/null
+        mkdir -p ~/.sky/logs
+        wget --quiet https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-424.0.0-linux-x86_64.tar.gz > ~/.sky/logs/gcloud_installation.log
+        tar xzf google-cloud-sdk-424.0.0-linux-x86_64.tar.gz >> ~/.sky/logs/gcloud_installation.log
+        rm -rf ~/google-cloud-sdk >> ~/.sky/logs/gcloud_installation.log
+        mv google-cloud-sdk ~/
+        ~/google-cloud-sdk/install.sh -q >> ~/.sky/logs/gcloud_installation.log 2>&1
+        echo "source ~/google-cloud-sdk/path.bash.inc > /dev/null 2>&1" >> ~/.bashrc
+        source ~/google-cloud-sdk/path.bash.inc >> ~/.sky/logs/gcloud_installation.log 2>&1
+        popd &>/dev/null
+    fi
+}
+
+install_aws() {
+    echo "Install cloud dependencies on controller: AWS"
+    # Quote the version specifiers so '>' is not treated as a shell redirection.
+    $PYTHON_EXEC -m pip install "botocore>=1.29.10" "boto3>=1.26.1"
+    $PYTHON_EXEC -m pip install "urllib3<2" "awscli>=1.27.10" "colorama<0.4.5"
+}
+
+if [ "$CLOUD" = "azure" ]; then
+    install_azure
+elif [ "$CLOUD" = "gcp" ]; then
+    install_gcp
+elif [ "$CLOUD" = "aws" ]; then
+    install_aws
+else
+    echo "Error: Unknown cloud $CLOUD, so not installing any cloud dependencies."
+    exit 1
+fi
+
+if [ $? -eq 0 ]; then
+    echo "Successfully installed cloud dependencies on controller: $CLOUD"
+else
+    echo "Error: Failed to install cloud dependencies on controller: $CLOUD"
+fi
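Packer passes `CLOUD` to this script through `environment_vars` (see the templates below); for a quick local check, running it by hand should be equivalent — a sketch, assuming the `~/skypilot-runtime` venv created by `skypilot.sh` already exists:

```bash
CLOUD=aws bash provisioners/cloud.sh
```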
diff --git a/sky/clouds/service_catalog/images/provisioners/cuda.sh b/sky/clouds/service_catalog/images/provisioners/cuda.sh
new file mode 100644
index 00000000000..1b2b4ec977e
--- /dev/null
+++ b/sky/clouds/service_catalog/images/provisioners/cuda.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# This script installs a CUDA driver and toolkit combination that is compatible with all GPU types.
+# For the driver, choose the latest version that works for ALL GPU types:
+# GCP: https://cloud.google.com/compute/docs/gpus/install-drivers-gpu#minimum-driver
+# AWS: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/install-nvidia-driver.html
+export DEBIAN_FRONTEND=noninteractive
+
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+
+# Make sure the CUDA toolkit and driver versions are compatible: https://docs.nvidia.com/deploy/cuda-compatibility/index.html
+# Current state: driver version 535.183.06 (natively CUDA 12.2) with CUDA toolkit 12.4.
+sudo apt-get install -y cuda-drivers-535
+sudo apt-get install -y cuda-toolkit-12-4
+
+# Install cuDNN
+# https://docs.nvidia.com/deeplearning/cudnn/latest/installation/linux.html#installing-on-linux
+sudo apt-get install -y libcudnn8
+sudo apt-get install -y libcudnn8-dev
+
+# Cleanup
+rm cuda-keyring_1.1-1_all.deb
diff --git a/sky/clouds/service_catalog/images/provisioners/docker.sh b/sky/clouds/service_catalog/images/provisioners/docker.sh
new file mode 100644
index 00000000000..da2366408ab
--- /dev/null
+++ b/sky/clouds/service_catalog/images/provisioners/docker.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Add Docker's official GPG key:
+sudo apt-get update
+sudo apt-get install -y ca-certificates curl
+sudo install -m 0755 -d /etc/apt/keyrings
+sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+sudo chmod a+r /etc/apt/keyrings/docker.asc
+
+# Add the repository to Apt sources:
+echo \
+  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+sudo apt-get update
+
+# Install Docker
+sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+# Add the user to the docker group so that Docker commands can run without sudo
+sudo usermod -aG docker "$USER"
+newgrp docker
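A quick, optional way to verify the Docker installation before the NVIDIA runtime is configured (pulls the tiny `hello-world` test image; not part of the build scripts):

```bash
sudo docker run --rm hello-world
```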
diff --git a/sky/clouds/service_catalog/images/provisioners/nvidia-container-toolkit.sh b/sky/clouds/service_catalog/images/provisioners/nvidia-container-toolkit.sh
new file mode 100644
index 00000000000..b6b3625176b
--- /dev/null
+++ b/sky/clouds/service_catalog/images/provisioners/nvidia-container-toolkit.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg &&
+    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
+    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+
+# If /etc/docker/daemon.json exists but is empty, `nvidia-ctk runtime configure
+# --runtime=docker` fails, so remove the file in that case.
+if [ -f /etc/docker/daemon.json ] && [ ! -s /etc/docker/daemon.json ]; then
+    sudo rm /etc/docker/daemon.json
+fi
+
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+# Validate
+if sudo docker info -f "{{.Runtimes}}" | grep "nvidia-container-runtime"; then
+    echo "Successfully installed NVIDIA container runtime"
+else
+    echo "Failed to install NVIDIA container runtime"
+    exit 1
+fi
diff --git a/sky/clouds/service_catalog/images/provisioners/skypilot.sh b/sky/clouds/service_catalog/images/provisioners/skypilot.sh
new file mode 100644
index 00000000000..ff2aa06b2b6
--- /dev/null
+++ b/sky/clouds/service_catalog/images/provisioners/skypilot.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Stop and disable unattended-upgrades
+sudo systemctl stop unattended-upgrades || true
+sudo systemctl disable unattended-upgrades || true
+sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true
+
+# Configure dpkg
+sudo dpkg --configure --force-overwrite -a
+
+# Apt-get installs
+sudo apt-get install jq -y
+
+# Create necessary directories
+mkdir -p ~/sky_workdir
+mkdir -p ~/.sky/
+mkdir -p ~/.sky/sky_app
+mkdir -p ~/.ssh
+touch ~/.ssh/config
+
+# Install Miniconda
+curl -o Miniconda3-Linux-x86_64.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh
+bash Miniconda3-Linux-x86_64.sh -b
+eval "$(~/miniconda3/bin/conda shell.bash hook)"
+rm Miniconda3-Linux-x86_64.sh
+conda init
+conda config --set auto_activate_base true
+conda activate base
+
+# Conda, Python
+echo "Creating conda env with Python 3.10"
+conda create -y -n skypilot-runtime python=3.10
+conda activate skypilot-runtime
+export PIP_DISABLE_PIP_VERSION_CHECK=1
+echo "PATH=$PATH"
+python3 -m venv ~/skypilot-runtime
+PYTHON_EXEC=$HOME/skypilot-runtime/bin/python
+
+# Pip installs
+$PYTHON_EXEC -m pip install "setuptools<70"
+$PYTHON_EXEC -m pip install "grpcio!=1.48.0,<=1.51.3,>=1.42.0"
+$PYTHON_EXEC -m pip install "skypilot-nightly"
+
+# Install ray
+RAY_ADDRESS=127.0.0.1:6380
+$PYTHON_EXEC -m pip install --exists-action w -U "ray[default]==2.9.3"
+export PATH=$PATH:$HOME/.local/bin
+source ~/skypilot-runtime/bin/activate
+which ray > ~/.sky/ray_path || exit 1
+$PYTHON_EXEC -m pip list | grep "ray " | grep 2.9.3 > /dev/null 2>&1 && {
+    $PYTHON_EXEC -c "from sky.skylet.ray_patches import patch; patch()" || exit 1
+}
+
+# System configurations
+sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'
+sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'
+sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity
+sudo systemctl daemon-reload
+
+# Stop and disable Jupyter service
+sudo systemctl stop jupyter > /dev/null 2>&1 || true
+sudo systemctl disable jupyter > /dev/null 2>&1 || true
+
+# Configure fuse
+[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'
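+
+# Sanity check (in a fresh login shell): `ulimit -n` should now report 1048576.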
+
+# Cleanup
+# Uninstall SkyPilot from the OS image: when a user runs `sky launch`, whatever
+# SkyPilot version the user has on their local machine is installed instead.
+$PYTHON_EXEC -m pip uninstall "skypilot-nightly" -y
diff --git a/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl
new file mode 100644
index 00000000000..c21fbf51b20
--- /dev/null
+++ b/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl
@@ -0,0 +1,47 @@
+variable "region" {
+  type    = string
+  default = "us-east-1"
+}
+
+locals {
+  timestamp = regex_replace(timestamp(), "[- TZ:]", "")
+}
+
+source "amazon-ebs" "cpu-ubuntu" {
+  ami_name      = "skypilot-aws-cpu-ubuntu-${local.timestamp}"
+  instance_type = "t2.micro"
+  region        = var.region
+  ssh_username  = "ubuntu"
+  source_ami_filter {
+    filters = {
+      name                = "ubuntu/images/*ubuntu-jammy-22.04-amd64-server-*"
+      root-device-type    = "ebs"
+      virtualization-type = "hvm"
+    }
+    most_recent = true
+    owners      = ["099720109477"]
+  }
+  launch_block_device_mappings {
+    device_name           = "/dev/sda1"
+    volume_size           = 8
+    volume_type           = "gp2"
+    delete_on_termination = true
+  }
+}
+
+build {
+  name    = "aws-cpu-ubuntu-build"
+  sources = ["source.amazon-ebs.cpu-ubuntu"]
+  provisioner "shell" {
+    script = "./provisioners/docker.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/skypilot.sh"
+  }
+  provisioner "shell" {
+    environment_vars = [
+      "CLOUD=aws",
+    ]
+    script = "./provisioners/cloud.sh"
+  }
+}
diff --git a/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl
new file mode 100644
index 00000000000..c4a8efac4dc
--- /dev/null
+++ b/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl
@@ -0,0 +1,55 @@
+variable "region" {
+  type    = string
+  default = "us-east-1"
+}
+
+locals {
+  timestamp = regex_replace(timestamp(), "[- TZ:]", "")
+}
+
+source "amazon-ebs" "gpu-ubuntu" {
+  ami_name      = "skypilot-aws-gpu-ubuntu-${local.timestamp}"
+  instance_type = "g6.xlarge"
+  region        = var.region
+  ssh_username  = "ubuntu"
+  source_ami_filter {
+    filters = {
+      name                = "ubuntu/images/*ubuntu-jammy-22.04-amd64-server-*"
+      root-device-type    = "ebs"
+      virtualization-type = "hvm"
+    }
+    most_recent = true
+    owners      = ["099720109477"]
+  }
+  launch_block_device_mappings {
+    device_name           = "/dev/sda1"
+    volume_size           = 30
+    volume_type           = "gp2"
+    delete_on_termination = true
+  }
+}
+
+build {
+  name = "aws-gpu-ubuntu-build"
+  sources = [
+    "source.amazon-ebs.gpu-ubuntu"
+  ]
+  provisioner "shell" {
+    script = "./provisioners/docker.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/cuda.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/nvidia-container-toolkit.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/skypilot.sh"
+  }
+  provisioner "shell" {
+    environment_vars = [
+      "CLOUD=aws",
+    ]
+    script = "./provisioners/cloud.sh"
+  }
+}
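The `region` variable in these AWS templates can be overridden at build time without editing the file (standard Packer variable override; assumes the chosen region has t2/g6 capacity):

```bash
packer build -var 'region=us-west-2' skypilot-aws-gpu-ubuntu.pkr.hcl
```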
+  tags                = ["packer"]
+  disk_size           = 10
+  machine_type        = "e2-medium"
+  ssh_username        = "gcpuser"
+}
+
+build {
+  name    = "gcp-cpu-ubuntu-build"
+  sources = ["source.googlecompute.cpu-ubuntu"]
+  provisioner "shell" {
+    script = "./provisioners/docker.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/skypilot.sh"
+  }
+  provisioner "shell" {
+    environment_vars = [
+      "CLOUD=gcp",
+    ]
+    script = "./provisioners/cloud.sh"
+  }
+}
diff --git a/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl
new file mode 100644
index 00000000000..f46d414493b
--- /dev/null
+++ b/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl
@@ -0,0 +1,46 @@
+variable "zone" {
+  type    = string
+  default = "us-west1-a"
+}
+
+locals {
+  timestamp = regex_replace(timestamp(), "[- TZ:]", "")
+}
+
+source "googlecompute" "gpu-ubuntu" {
+  image_name          = "skypilot-gcp-gpu-ubuntu-${local.timestamp}"
+  project_id          = "sky-dev-465"
+  source_image_family = "ubuntu-2204-lts"
+  zone                = var.zone
+  image_description   = "SkyPilot custom image for launching GCP GPU instances."
+  tags                = ["packer", "gpu", "ubuntu"]
+  disk_size           = 50
+  machine_type        = "g2-standard-4"
+  accelerator_type    = "projects/sky-dev-465/zones/${var.zone}/acceleratorTypes/nvidia-l4"
+  accelerator_count   = 1
+  on_host_maintenance = "TERMINATE"
+  ssh_username        = "gcpuser"
+}
+
+build {
+  name    = "gcp-gpu-ubuntu-build"
+  sources = ["source.googlecompute.gpu-ubuntu"]
+  provisioner "shell" {
+    script = "./provisioners/docker.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/cuda.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/nvidia-container-toolkit.sh"
+  }
+  provisioner "shell" {
+    script = "./provisioners/skypilot.sh"
+  }
+  provisioner "shell" {
+    environment_vars = [
+      "CLOUD=gcp",
+    ]
+    script = "./provisioners/cloud.sh"
+  }
+}
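As with the AWS templates, the GCP GPU template's `zone` variable can be overridden at build time (standard Packer behavior; assumes L4 GPUs are available in the chosen zone):

```bash
packer build -var 'zone=us-central1-a' skypilot-gcp-gpu-ubuntu.pkr.hcl
```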
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 22084e9c368..ed86f93ca27 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -383,7 +383,7 @@ def test_aws_region():
             f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-east-2\'',
             f'sky logs {name} 2 --status',  # Ensure the job succeeded.
             # A user program should not access SkyPilot runtime env python by default.
-            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'',
+            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'',
             f'sky logs {name} 3 --status',  # Ensure the job succeeded.
         ],
         f'sky down -y {name}',
@@ -406,7 +406,7 @@ def test_gcp_region_and_service_account():
             f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .region | grep us-central1\'',
             f'sky logs {name} 3 --status',  # Ensure the job succeeded.
             # A user program should not access SkyPilot runtime env python by default.
-            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'',
+            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'',
             f'sky logs {name} 4 --status',  # Ensure the job succeeded.
         ],
         f'sky down -y {name}',
@@ -446,7 +446,7 @@ def test_azure_region():
             f'sky exec {name} \'echo $SKYPILOT_CLUSTER_INFO | jq .zone | grep null\'',
             f'sky logs {name} 3 --status',  # Ensure the job succeeded.
             # A user program should not access SkyPilot runtime env python by default.
-            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} || exit 1\'',
+            f'sky exec {name} \'which python | grep {constants.SKY_REMOTE_PYTHON_ENV_NAME} && exit 1 || true\'',
             f'sky logs {name} 4 --status',  # Ensure the job succeeded.
         ],
         f'sky down -y {name}',
@@ -864,14 +864,14 @@ def test_custom_default_conda_env(generic_cloud: str):
             f'sky launch -c {name} -y --cloud {generic_cloud} tests/test_yamls/test_custom_default_conda_env.yaml',
             f'sky status -r {name} | grep "UP"',
             f'sky logs {name} 1 --status',
-            f'sky logs {name} 1 --no-follow | grep -P "myenv\\s+\\*"',
+            f'sky logs {name} 1 --no-follow | grep -E "myenv\\s+\\*"',
             f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml',
             f'sky logs {name} 2 --status',
             f'sky autostop -y -i 0 {name}',
             'sleep 60',
             f'sky status -r {name} | grep "STOPPED"',
             f'sky start -y {name}',
-            f'sky logs {name} 2 --no-follow | grep -P "myenv\\s+\\*"',
+            f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"',
             f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml',
             f'sky logs {name} 3 --status',
         ], f'sky down -y {name}')
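After updating the image IDs, the modified checks can be exercised individually — for example (assuming the same pytest flag convention shown in the README above):

```bash
pytest tests/test_smoke.py::test_gcp_region_and_service_account --gcp
```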