Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Docs] Deployment on existing infra #3926

Merged
merged 24 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/reservations/existing-machines.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Deploy SkyPilot on existing machines
====================================

This guide will help you deploy SkyPilot on your existing machines - whether they are on-premisesc or reserved instances on a cloud provider.
This guide will help you deploy SkyPilot on your existing machines - whether they are on-premises or reserved instances on a cloud provider.
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved

**Given a list of IP addresses and SSH credentials,**
SkyPilot will install necessary dependencies on the remote machines and configure itself to run jobs and services on the cluster.
Expand Down Expand Up @@ -75,7 +75,7 @@ Deploying SkyPilot
IP_FILE=ips.txt
SSH_USER=username
SSH_KEY=path/to/ssh/key
sky local up --ip $IP_FILE --ssh-user SSH_USER --ssh-key-path $SSH_KEY
sky local up --ips $IP_FILE --ssh-user $SSH_USER --ssh-key-path $SSH_KEY

SkyPilot will deploy a Kubernetes cluster on the remote machines, set up GPU support, configure Kubernetes credentials on your local machine, and set up SkyPilot to operate with the new cluster.

Expand Down
9 changes: 5 additions & 4 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5072,7 +5072,7 @@ def local():
pass


def deploy_local_cluster(gpus: bool):
def _deploy_local_cluster(gpus: bool):
cluster_created = False

# Check if GPUs are available on the host
Expand Down Expand Up @@ -5198,7 +5198,8 @@ def deploy_local_cluster(gpus: bool):
f'{gpu_hint}')


def deploy_remote_cluster(ip_file, ssh_user, ssh_key_path, cleanup):
def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
cleanup: bool):
success = False
path_to_package = os.path.dirname(os.path.dirname(__file__))
up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
Expand Down Expand Up @@ -5309,10 +5310,10 @@ def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
# Convert ips and ssh_key_path to absolute paths
ips = os.path.abspath(ips)
ssh_key_path = os.path.abspath(ssh_key_path)
deploy_remote_cluster(ips, ssh_user, ssh_key_path, cleanup)
_deploy_remote_cluster(ips, ssh_user, ssh_key_path, cleanup)
else:
# Run local deployment (kind) if no remote args are specified
deploy_local_cluster(gpus)
_deploy_local_cluster(gpus)


@local.command('down', cls=_DocumentedCodeCommand)
Expand Down
2 changes: 1 addition & 1 deletion sky/utils/kubernetes/deploy_remote_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -240,4 +240,4 @@ echo "You can now interact with your Kubernetes cluster through SkyPilot: "
echo " • List available GPUs: sky show-gpus --cloud kubernetes"
echo " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
echo " • Connect to pod with SSH: ssh devbox"
echo " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
echo " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
35 changes: 22 additions & 13 deletions sky/utils/log_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Logging utils."""
import enum
from typing import List, Optional
from types import TracebackType
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
from typing import List, Optional, Type

import colorama
import pendulum
Expand All @@ -15,13 +16,15 @@
class LineProcessor(object):
"""A processor for log lines."""

def __enter__(self):
def __enter__(self) -> None:
pass

def process_line(self, log_line):
def process_line(self, log_line: str) -> None:
pass

def __exit__(self, except_type, except_value, traceback):
def __exit__(self, except_type: Optional[Type[BaseException]],
except_value: Optional[BaseException],
traceback: Optional[TracebackType]) -> None:
del except_type, except_value, traceback # unused
pass

Expand All @@ -34,12 +37,12 @@ class ProvisionStatus(enum.Enum):
RUNTIME_SETUP = 1
PULLING_DOCKER_IMAGES = 2

def __enter__(self):
def __enter__(self) -> None:
self.state = self.ProvisionStatus.LAUNCH
self.status_display = rich_utils.safe_status('[bold cyan]Launching')
self.status_display.start()

def process_line(self, log_line):
def process_line(self, log_line: str) -> None:
if ('Success.' in log_line and
self.state == self.ProvisionStatus.LAUNCH):
logger.info(f'{colorama.Fore.GREEN}Head node is up.'
Expand All @@ -60,21 +63,23 @@ def process_line(self, log_line):
'[bold cyan]Launching - Preparing SkyPilot runtime')
self.state = self.ProvisionStatus.RUNTIME_SETUP

def __exit__(self, except_type, except_value, traceback):
def __exit__(self, except_type: Optional[Type[BaseException]],
except_value: Optional[BaseException],
traceback: Optional[TracebackType]) -> None:
del except_type, except_value, traceback # unused
self.status_display.stop()


class SkyLocalUpLineProcessor(LineProcessor):
"""A processor for `sky local up` log lines."""

def __enter__(self):
def __enter__(self) -> None:
status = rich_utils.safe_status('[bold cyan]Creating local cluster - '
'initializing Kubernetes')
self.status_display = status
self.status_display.start()

def process_line(self, log_line):
def process_line(self, log_line: str) -> None:
if 'Kind cluster created.' in log_line:
logger.info(f'{colorama.Fore.GREEN}Kubernetes is running.'
f'{colorama.Style.RESET_ALL}')
Expand Down Expand Up @@ -124,20 +129,22 @@ def process_line(self, log_line):
f'{colorama.Fore.GREEN}Nginx Ingress Controller installed.'
f'{colorama.Style.RESET_ALL}')

def __exit__(self, except_type, except_value, traceback):
def __exit__(self, except_type: Optional[Type[BaseException]],
except_value: Optional[BaseException],
traceback: Optional[TracebackType]) -> None:
del except_type, except_value, traceback # unused
self.status_display.stop()


class SkyRemoteUpLineProcessor(LineProcessor):
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
"""A processor for deploy_remote_cluster.sh log lines."""

def __enter__(self):
def __enter__(self) -> None:
status = rich_utils.safe_status('[bold cyan]Creating remote cluster')
self.status_display = status
self.status_display.start()

def process_line(self, log_line):
def process_line(self, log_line: str) -> None:
# Pre-flight checks
if 'SSH connection successful' in log_line:
logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
Expand Down Expand Up @@ -193,7 +200,9 @@ def process_line(self, log_line):
logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
f'{colorama.Style.RESET_ALL}')

def __exit__(self, except_type, except_value, traceback):
def __exit__(self, except_type: Optional[Type[BaseException]],
except_value: Optional[BaseException],
traceback: Optional[TracebackType]) -> None:
del except_type, except_value, traceback # unused
self.status_display.stop()

Expand Down
Loading