From 9b84b530e471b701e44a71b4bc4b4daf51493721 Mon Sep 17 00:00:00 2001 From: wizenheimer Date: Sun, 8 Sep 2024 01:28:47 +0530 Subject: [PATCH 1/5] fix: resolve param credential typo --- sky/provision/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/provision/__init__.py b/sky/provision/__init__.py index 0fe4ab614ce..41d985ade41 100644 --- a/sky/provision/__init__.py +++ b/sky/provision/__init__.py @@ -186,12 +186,12 @@ def get_cluster_info( def get_command_runners( provider_name: str, cluster_info: common.ClusterInfo, - **crednetials: Dict[str, Any], + **credentials: Dict[str, Any], ) -> List[command_runner.CommandRunner]: """Get a command runner for the given cluster.""" ip_list = cluster_info.get_feasible_ips() port_list = cluster_info.get_ssh_ports() return command_runner.SSHCommandRunner.make_runner_list( node_list=zip(ip_list, port_list), - **crednetials, + **credentials, ) From 826835154ca3f060446c48d4e1ef2a285e63b344 Mon Sep 17 00:00:00 2001 From: wizenheimer Date: Sun, 8 Sep 2024 01:51:52 +0530 Subject: [PATCH 2/5] fix: show non-common gpus --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index eb0267f7ced..bb053fbdd5e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3181,7 +3181,7 @@ def _output(): yield from tpu_table.get_string() # Other GPUs - if show_all: + if show_all or cloud is not None: yield '\n\n' for gpu, qty in sorted(result.items()): other_table.add_row([gpu, _list_to_str(qty)]) From eba6150c7a21d1bc883f2553eaf91370777bda5d Mon Sep 17 00:00:00 2001 From: wizenheimer Date: Tue, 10 Sep 2024 09:19:59 +0530 Subject: [PATCH 3/5] fix: remove inadvertent price table when showing non common gpu --- sky/cli.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index bb053fbdd5e..defe59a31e1 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3180,19 +3180,26 @@ def _output(): yield '\n\n' yield from tpu_table.get_string() - # Other GPUs + # Handle Other GPUs if show_all or cloud is not None: yield '\n\n' for gpu, qty in sorted(result.items()): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - else: + + # Handle hints and messages + if not show_all and cloud is None: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if k8s_messages: - yield '\n' - yield k8s_messages + + # Handle k8 messages if present + if k8s_messages: + yield '\n' + yield k8s_messages + + # Return if we're not showing all or if a specific cloud was queried + if not show_all or cloud is not None: return else: # Parse accelerator string From 14257c64e634fcf9869072793836882bb3cd1566 Mon Sep 17 00:00:00 2001 From: wizenheimer Date: Mon, 16 Sep 2024 23:43:28 +0530 Subject: [PATCH 4/5] feat: update footnotes for cloud specific release --- sky/cli.py | 4223 +++++++++++++++++++++++++++++----------------------- 1 file changed, 2360 insertions(+), 1863 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index defe59a31e1..69064475ea7 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -23,6 +23,7 @@ listed in "sky --help". Take care to put logically connected commands close to each other. 
""" + import copy import datetime import functools @@ -81,10 +82,10 @@ if typing.TYPE_CHECKING: from sky.backends import backend as backend_lib -pd = adaptors_common.LazyImport('pandas') +pd = adaptors_common.LazyImport("pandas") logger = sky_logging.init_logger(__name__) -_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +_CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) _CLUSTER_FLAG_HELP = """\ A cluster name. If provided, either reuse an existing cluster with that name or @@ -96,15 +97,19 @@ _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = ( - '{cluster_num} cluster{plural} {verb}. Please specify {cause} ' - 'cluster to show its {property}.\nUsage: `sky status --{flag} `') + "{cluster_num} cluster{plural} {verb}. Please specify {cause} " + "cluster to show its {property}.\nUsage: `sky status --{flag} `" +) -_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' - 'please retry after a while.') +_ENDPOINTS_RETRY_MESSAGE = ( + "If the cluster was recently started, " "please retry after a while." +) -_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' - '`sky jobs launch`. `{command}` supports a ' - 'single task only.') +_DAG_NOT_SUPPORTED_MESSAGE = ( + "YAML specifies a DAG which is only supported by " + "`sky jobs launch`. `{command}` supports a " + "single task only." +) def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: @@ -113,7 +118,7 @@ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: for cluster in clusters: glob_cluster = global_user_state.get_glob_cluster_names(cluster) if len(glob_cluster) == 0 and not silent: - click.echo(f'Cluster {cluster} not found.') + click.echo(f"Cluster {cluster} not found.") glob_clusters.extend(glob_cluster) return list(set(glob_clusters)) @@ -124,121 +129,153 @@ def _get_glob_storages(storages: List[str]) -> List[str]: for storage_object in storages: glob_storage = global_user_state.get_glob_storage_name(storage_object) if len(glob_storage) == 0: - click.echo(f'Storage {storage_object} not found.') + click.echo(f"Storage {storage_object} not found.") glob_storages.extend(glob_storage) return list(set(glob_storages)) def _parse_env_var(env_var: str) -> Tuple[str, str]: """Parse env vars into a (KEY, VAL) pair.""" - if '=' not in env_var: + if "=" not in env_var: value = os.environ.get(env_var) if value is None: - raise click.UsageError( - f'{env_var} is not set in local environment.') + raise click.UsageError(f"{env_var} is not set in local environment.") return (env_var, value) - ret = tuple(env_var.split('=', 1)) + ret = tuple(env_var.split("=", 1)) if len(ret) != 2: raise click.UsageError( - f'Invalid env var: {env_var}. Must be in the form of KEY=VAL ' - 'or KEY.') + f"Invalid env var: {env_var}. Must be in the form of KEY=VAL " "or KEY." 
+ ) return ret[0], ret[1] -def _merge_env_vars(env_dict: Optional[Dict[str, str]], - env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]: +def _merge_env_vars( + env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]] +) -> List[Tuple[str, str]]: """Merges all values from env_list into env_dict.""" if not env_dict: return env_list - for (key, value) in env_list: + for key, value in env_list: env_dict[key] = value return list(env_dict.items()) _TASK_OPTIONS = [ click.option( - '--workdir', + "--workdir", required=False, type=click.Path(exists=True, file_okay=False), - help=('If specified, sync this dir to the remote working directory, ' - 'where the task will be invoked. ' - 'Overrides the "workdir" config in the YAML if both are supplied.' - )), + help=( + "If specified, sync this dir to the remote working directory, " + "where the task will be invoked. " + 'Overrides the "workdir" config in the YAML if both are supplied.' + ), + ), click.option( - '--cloud', + "--cloud", required=False, type=str, - help=('The cloud to use. If specified, overrides the "resources.cloud" ' - 'config. Passing "none" resets the config.')), + help=( + 'The cloud to use. If specified, overrides the "resources.cloud" ' + 'config. Passing "none" resets the config.' + ), + ), click.option( - '--region', + "--region", required=False, type=str, - help=('The region to use. If specified, overrides the ' - '"resources.region" config. Passing "none" resets the config.')), + help=( + "The region to use. If specified, overrides the " + '"resources.region" config. Passing "none" resets the config.' + ), + ), click.option( - '--zone', + "--zone", required=False, type=str, - help=('The zone to use. If specified, overrides the ' - '"resources.zone" config. Passing "none" resets the config.')), + help=( + "The zone to use. If specified, overrides the " + '"resources.zone" config. Passing "none" resets the config.' + ), + ), click.option( - '--num-nodes', + "--num-nodes", required=False, type=int, - help=('Number of nodes to execute the task on. ' - 'Overrides the "num_nodes" config in the YAML if both are ' - 'supplied.')), + help=( + "Number of nodes to execute the task on. " + 'Overrides the "num_nodes" config in the YAML if both are ' + "supplied." + ), + ), click.option( - '--cpus', + "--cpus", default=None, type=str, required=False, - help=('Number of vCPUs each instance must have (e.g., ' - '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). ' - 'This is used to automatically select the instance type.')), + help=( + "Number of vCPUs each instance must have (e.g., " + "``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). " + "This is used to automatically select the instance type." 
+ ), + ), click.option( - '--memory', + "--memory", default=None, type=str, required=False, help=( - 'Amount of memory each instance must have in GB (e.g., ' - '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))' - )), - click.option('--disk-size', - default=None, - type=int, - required=False, - help=('OS disk size in GBs.')), - click.option('--disk-tier', - default=None, - type=click.Choice(resources_utils.DiskTier.supported_tiers(), - case_sensitive=False), - required=False, - help=resources_utils.DiskTier.cli_help_message()), + "Amount of memory each instance must have in GB (e.g., " + "``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))" + ), + ), + click.option( + "--disk-size", + default=None, + type=int, + required=False, + help=("OS disk size in GBs."), + ), + click.option( + "--disk-tier", + default=None, + type=click.Choice( + resources_utils.DiskTier.supported_tiers(), case_sensitive=False + ), + required=False, + help=resources_utils.DiskTier.cli_help_message(), + ), + click.option( + "--use-spot/--no-use-spot", + required=False, + default=None, + help=( + "Whether to request spot instances. If specified, overrides the " + '"resources.use_spot" config.' + ), + ), click.option( - '--use-spot/--no-use-spot', + "--image-id", required=False, default=None, - help=('Whether to request spot instances. If specified, overrides the ' - '"resources.use_spot" config.')), - click.option('--image-id', - required=False, - default=None, - help=('Custom image id for launching the instances. ' - 'Passing "none" resets the config.')), - click.option('--env-file', - required=False, - type=dotenv.dotenv_values, - help="""\ + help=( + "Custom image id for launching the instances. " + 'Passing "none" resets the config.' + ), + ), + click.option( + "--env-file", + required=False, + type=dotenv.dotenv_values, + help="""\ Path to a dotenv file with environment variables to set on the remote node. If any values from ``--env-file`` conflict with values set by - ``--env``, the ``--env`` value will be preferred."""), + ``--env``, the ``--env`` value will be preferred.""", + ), click.option( - '--env', + "--env", required=False, type=_parse_env_var, multiple=True, @@ -256,79 +293,92 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]], 3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the same value of ``$MY_ENV3`` in the local environment.""", - ) + ), ] _TASK_OPTIONS_WITH_NAME = [ - click.option('--name', - '-n', - required=False, - type=str, - help=('Task name. Overrides the "name" ' - 'config in the YAML if both are supplied.')), + click.option( + "--name", + "-n", + required=False, + type=str, + help=( + 'Task name. Overrides the "name" ' + "config in the YAML if both are supplied." + ), + ), ] + _TASK_OPTIONS _EXTRA_RESOURCES_OPTIONS = [ click.option( - '--gpus', + "--gpus", required=False, type=str, - help= - ('Type and number of GPUs to use. Example values: ' - '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' - '(fractional counts are supported by the scheduling framework). ' - 'If a new cluster is being launched by this command, this is the ' - 'resources to provision. If an existing cluster is being reused, this' - ' is seen as the task demand, which must fit the cluster\'s total ' - 'resources and is used for scheduling the task. ' - 'Overrides the "accelerators" ' - 'config in the YAML if both are supplied. ' - 'Passing "none" resets the config.')), + help=( + "Type and number of GPUs to use. 
Example values: " + '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' + "(fractional counts are supported by the scheduling framework). " + "If a new cluster is being launched by this command, this is the " + "resources to provision. If an existing cluster is being reused, this" + " is seen as the task demand, which must fit the cluster's total " + "resources and is used for scheduling the task. " + 'Overrides the "accelerators" ' + "config in the YAML if both are supplied. " + 'Passing "none" resets the config.' + ), + ), click.option( - '--instance-type', - '-t', + "--instance-type", + "-t", required=False, type=str, - help=('The instance type to use. If specified, overrides the ' - '"resources.instance_type" config. Passing "none" resets the ' - 'config.'), + help=( + "The instance type to use. If specified, overrides the " + '"resources.instance_type" config. Passing "none" resets the ' + "config." + ), ), click.option( - '--ports', + "--ports", required=False, type=str, multiple=True, - help=('Ports to open on the cluster. ' - 'If specified, overrides the "ports" config in the YAML. '), + help=( + "Ports to open on the cluster. " + 'If specified, overrides the "ports" config in the YAML. ' + ), ), ] -def _complete_cluster_name(ctx: click.Context, param: click.Parameter, - incomplete: str) -> List[str]: +def _complete_cluster_name( + ctx: click.Context, param: click.Parameter, incomplete: str +) -> List[str]: """Handle shell completion for cluster names.""" del ctx, param # Unused. return global_user_state.get_cluster_names_start_with(incomplete) -def _complete_storage_name(ctx: click.Context, param: click.Parameter, - incomplete: str) -> List[str]: +def _complete_storage_name( + ctx: click.Context, param: click.Parameter, incomplete: str +) -> List[str]: """Handle shell completion for storage names.""" del ctx, param # Unused. return global_user_state.get_storage_names_start_with(incomplete) -def _complete_file_name(ctx: click.Context, param: click.Parameter, - incomplete: str) -> List[str]: +def _complete_file_name( + ctx: click.Context, param: click.Parameter, incomplete: str +) -> List[str]: """Handle shell completion for file names. Returns a special completion marker that tells click to use the shell's default file completion. """ del ctx, param # Unused. - return [click.shell_completion.CompletionItem(incomplete, type='file')] + return [click.shell_completion.CompletionItem(incomplete, type="file")] def _get_click_major_version(): - return int(click.__version__.split('.', maxsplit=1)[0]) + return int(click.__version__.split(".", maxsplit=1)[0]) def _get_shell_complete_args(complete_fn): @@ -338,49 +388,49 @@ def _get_shell_complete_args(complete_fn): return {} -_RELOAD_ZSH_CMD = 'source ~/.zshrc' -_RELOAD_FISH_CMD = 'source ~/.config/fish/config.fish' -_RELOAD_BASH_CMD = 'source ~/.bashrc' +_RELOAD_ZSH_CMD = "source ~/.zshrc" +_RELOAD_FISH_CMD = "source ~/.config/fish/config.fish" +_RELOAD_BASH_CMD = "source ~/.bashrc" -def _install_shell_completion(ctx: click.Context, param: click.Parameter, - value: str): +def _install_shell_completion(ctx: click.Context, param: click.Parameter, value: str): """A callback for installing shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == 'auto': - if 'SHELL' not in os.environ: + if value == "auto": + if "SHELL" not in os.environ: click.secho( - 'Cannot auto-detect shell. Please specify shell explicitly.', - fg='red') + "Cannot auto-detect shell. 
Please specify shell explicitly.", fg="red" + ) ctx.exit() else: - value = os.path.basename(os.environ['SHELL']) + value = os.path.basename(os.environ["SHELL"]) - zshrc_diff = '\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh' - bashrc_diff = ('\n# For SkyPilot shell completion' - '\n. ~/.sky/.sky-complete.bash') + zshrc_diff = "\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh" + bashrc_diff = "\n# For SkyPilot shell completion" "\n. ~/.sky/.sky-complete.bash" - if value == 'bash': + if value == "bash": install_cmd = f'_SKY_COMPLETE=bash_source sky > \ ~/.sky/.sky-complete.bash && \ echo "{bashrc_diff}" >> ~/.bashrc' - cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || ' - f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || ' - f'(echo "Bash must be version 4 or above." && exit 1))') + cmd = ( + f'(grep -q "SkyPilot" ~/.bashrc) || ' + f"([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || " + f'(echo "Bash must be version 4 or above." && exit 1))' + ) reload_cmd = _RELOAD_BASH_CMD - elif value == 'fish': - cmd = '_SKY_COMPLETE=fish_source sky > \ - ~/.config/fish/completions/sky.fish' + elif value == "fish": + cmd = "_SKY_COMPLETE=fish_source sky > \ + ~/.config/fish/completions/sky.fish" reload_cmd = _RELOAD_FISH_CMD - elif value == 'zsh': + elif value == "zsh": install_cmd = f'_SKY_COMPLETE=zsh_source sky > \ ~/.sky/.sky-complete.zsh && \ echo "{zshrc_diff}" >> ~/.zshrc' @@ -389,51 +439,48 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f'Unsupported shell: {value}', fg='red') + click.secho(f"Unsupported shell: {value}", fg="red") ctx.exit() try: - subprocess.run(cmd, - shell=True, - check=True, - executable=shutil.which('bash')) - click.secho(f'Shell completion installed for {value}', fg='green') + subprocess.run(cmd, shell=True, check=True, executable=shutil.which("bash")) + click.secho(f"Shell completion installed for {value}", fg="green") click.echo( - 'Completion will take effect once you restart the terminal: ' + - click.style(f'{reload_cmd}', bold=True)) + "Completion will take effect once you restart the terminal: " + + click.style(f"{reload_cmd}", bold=True) + ) except subprocess.CalledProcessError as e: - click.secho(f'> Installation failed with code {e.returncode}', fg='red') + click.secho(f"> Installation failed with code {e.returncode}", fg="red") ctx.exit() -def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, - value: str): +def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, value: str): """A callback for uninstalling shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == 'auto': - if 'SHELL' not in os.environ: + if value == "auto": + if "SHELL" not in os.environ: click.secho( - 'Cannot auto-detect shell. Please specify shell explicitly.', - fg='red') + "Cannot auto-detect shell. 
Please specify shell explicitly.", fg="red" + ) ctx.exit() else: - value = os.path.basename(os.environ['SHELL']) + value = os.path.basename(os.environ["SHELL"]) - if value == 'bash': + if value == "bash": cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.bashrc && \ sed -i"" -e "/sky-complete.bash/d" ~/.bashrc && \ rm -f ~/.sky/.sky-complete.bash' reload_cmd = _RELOAD_BASH_CMD - elif value == 'fish': - cmd = 'rm -f ~/.config/fish/completions/sky.fish' + elif value == "fish": + cmd = "rm -f ~/.config/fish/completions/sky.fish" reload_cmd = _RELOAD_FISH_CMD - elif value == 'zsh': + elif value == "zsh": cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.zshrc && \ sed -i"" -e "/sky-complete.zsh/d" ~/.zshrc && \ rm -f ~/.sky/.sky-complete.zsh' @@ -441,17 +488,18 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f'Unsupported shell: {value}', fg='red') + click.secho(f"Unsupported shell: {value}", fg="red") ctx.exit() try: subprocess.run(cmd, shell=True, check=True) - click.secho(f'Shell completion uninstalled for {value}', fg='green') - click.echo('Changes will take effect once you restart the terminal: ' + - click.style(f'{reload_cmd}', bold=True)) + click.secho(f"Shell completion uninstalled for {value}", fg="green") + click.echo( + "Changes will take effect once you restart the terminal: " + + click.style(f"{reload_cmd}", bold=True) + ) except subprocess.CalledProcessError as e: - click.secho(f'> Uninstallation failed with code {e.returncode}', - fg='red') + click.secho(f"> Uninstallation failed with code {e.returncode}", fg="red") ctx.exit() @@ -467,71 +515,72 @@ def _add_options(func): def _parse_override_params( - cloud: Optional[str] = None, - region: Optional[str] = None, - zone: Optional[str] = None, - gpus: Optional[str] = None, - cpus: Optional[str] = None, - memory: Optional[str] = None, - instance_type: Optional[str] = None, - use_spot: Optional[bool] = None, - image_id: Optional[str] = None, - disk_size: Optional[int] = None, - disk_tier: Optional[str] = None, - ports: Optional[Tuple[str]] = None) -> Dict[str, Any]: + cloud: Optional[str] = None, + region: Optional[str] = None, + zone: Optional[str] = None, + gpus: Optional[str] = None, + cpus: Optional[str] = None, + memory: Optional[str] = None, + instance_type: Optional[str] = None, + use_spot: Optional[bool] = None, + image_id: Optional[str] = None, + disk_size: Optional[int] = None, + disk_tier: Optional[str] = None, + ports: Optional[Tuple[str]] = None, +) -> Dict[str, Any]: """Parses the override parameters into a dictionary.""" override_params: Dict[str, Any] = {} if cloud is not None: - if cloud.lower() == 'none': - override_params['cloud'] = None + if cloud.lower() == "none": + override_params["cloud"] = None else: - override_params['cloud'] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) + override_params["cloud"] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) if region is not None: - if region.lower() == 'none': - override_params['region'] = None + if region.lower() == "none": + override_params["region"] = None else: - override_params['region'] = region + override_params["region"] = region if zone is not None: - if zone.lower() == 'none': - override_params['zone'] = None + if zone.lower() == "none": + override_params["zone"] = None else: - override_params['zone'] = zone + override_params["zone"] = zone if gpus is not None: - if gpus.lower() == 'none': - override_params['accelerators'] = None + if gpus.lower() == "none": + 
override_params["accelerators"] = None else: - override_params['accelerators'] = gpus + override_params["accelerators"] = gpus if cpus is not None: - if cpus.lower() == 'none': - override_params['cpus'] = None + if cpus.lower() == "none": + override_params["cpus"] = None else: - override_params['cpus'] = cpus + override_params["cpus"] = cpus if memory is not None: - if memory.lower() == 'none': - override_params['memory'] = None + if memory.lower() == "none": + override_params["memory"] = None else: - override_params['memory'] = memory + override_params["memory"] = memory if instance_type is not None: - if instance_type.lower() == 'none': - override_params['instance_type'] = None + if instance_type.lower() == "none": + override_params["instance_type"] = None else: - override_params['instance_type'] = instance_type + override_params["instance_type"] = instance_type if use_spot is not None: - override_params['use_spot'] = use_spot + override_params["use_spot"] = use_spot if image_id is not None: - if image_id.lower() == 'none': - override_params['image_id'] = None + if image_id.lower() == "none": + override_params["image_id"] = None else: - override_params['image_id'] = image_id + override_params["image_id"] = image_id if disk_size is not None: - override_params['disk_size'] = disk_size + override_params["disk_size"] = disk_size if disk_tier is not None: - if disk_tier.lower() == 'none': - override_params['disk_tier'] = None + if disk_tier.lower() == "none": + override_params["disk_tier"] = None else: - override_params['disk_tier'] = disk_tier + override_params["disk_tier"] = disk_tier if ports: - override_params['ports'] = ports + override_params["ports"] = ports return override_params @@ -554,11 +603,12 @@ def _launch_with_confirm( if cluster is None: cluster = backend_utils.generate_cluster_name() - clone_source_str = '' + clone_source_str = "" if clone_disk_from is not None: - clone_source_str = f' from the disk of {clone_disk_from!r}' + clone_source_str = f" from the disk of {clone_disk_from!r}" task, _ = backend_utils.check_can_clone_disk_and_override_task( - clone_disk_from, cluster, task) + clone_disk_from, cluster, task + ) with sky.Dag() as dag: dag.add(task) @@ -568,13 +618,15 @@ def _launch_with_confirm( # Show the optimize log before the prompt if the cluster does not exist. try: sky_check.get_cached_enabled_clouds_or_refresh( - raise_if_no_cloud_access=True) + raise_if_no_cloud_access=True + ) except exceptions.NoCloudAccessError as e: # Catch the exception where the public cloud is not enabled, and # make it yellow for better visibility. with ux_utils.print_exception_no_traceback(): - raise RuntimeError(f'{colorama.Fore.YELLOW}{e}' - f'{colorama.Style.RESET_ALL}') from e + raise RuntimeError( + f"{colorama.Fore.YELLOW}{e}" f"{colorama.Style.RESET_ALL}" + ) from e dag = sky.optimize(dag) task = dag.tasks[0] @@ -587,18 +639,18 @@ def _launch_with_confirm( # it exists but is STOPPED. prompt = None if maybe_status is None: - cluster_str = '' if cluster is None else f' {cluster!r}' + cluster_str = "" if cluster is None else f" {cluster!r}" prompt = ( - f'Launching a new cluster{cluster_str}{clone_source_str}. ' - 'Proceed?') + f"Launching a new cluster{cluster_str}{clone_source_str}. " "Proceed?" + ) elif maybe_status == status_lib.ClusterStatus.STOPPED: - prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?' + prompt = f"Restarting the stopped cluster {cluster!r}. Proceed?" 
if prompt is not None: confirm_shown = True click.confirm(prompt, default=True, abort=True, show_default=True) if not confirm_shown: - click.secho(f'Running task on cluster {cluster}...', fg='yellow') + click.secho(f"Running task on cluster {cluster}...", fg="yellow") sky.launch( dag, @@ -626,12 +678,12 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: config: Optional[List[Dict[str, Any]]] = None result = None shell_splits = shlex.split(entrypoint) - yaml_file_provided = (len(shell_splits) == 1 and - (shell_splits[0].endswith('yaml') or - shell_splits[0].endswith('.yml'))) - invalid_reason = '' + yaml_file_provided = len(shell_splits) == 1 and ( + shell_splits[0].endswith("yaml") or shell_splits[0].endswith(".yml") + ) + invalid_reason = "" try: - with open(entrypoint, 'r', encoding='utf-8') as f: + with open(entrypoint, "r", encoding="utf-8") as f: try: config = list(yaml.safe_load_all(f)) if config: @@ -646,36 +698,43 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: except yaml.YAMLError as e: if yaml_file_provided: logger.debug(e) - detailed_error = f'\nYAML Error: {e}\n' - invalid_reason = ('contains an invalid configuration. ' - 'Please check syntax.\n' - f'{detailed_error}') + detailed_error = f"\nYAML Error: {e}\n" + invalid_reason = ( + "contains an invalid configuration. " + "Please check syntax.\n" + f"{detailed_error}" + ) is_yaml = False except OSError: if yaml_file_provided: entry_point_path = os.path.expanduser(entrypoint) if not os.path.exists(entry_point_path): - invalid_reason = ('does not exist. Please check if the path' - ' is correct.') + invalid_reason = ( + "does not exist. Please check if the path" " is correct." + ) elif not os.path.isfile(entry_point_path): - invalid_reason = ('is not a file. Please check if the path' - ' is correct.') + invalid_reason = ( + "is not a file. Please check if the path" " is correct." + ) else: - invalid_reason = ('yaml.safe_load() failed. Please check if the' - ' path is correct.') + invalid_reason = ( + "yaml.safe_load() failed. Please check if the" " path is correct." + ) is_yaml = False if not is_yaml: if yaml_file_provided: click.confirm( - f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n' - 'It will be treated as a command to be run remotely. Continue?', - abort=True) + f"{entrypoint!r} looks like a yaml path but {invalid_reason}\n" + "It will be treated as a command to be run remotely. Continue?", + abort=True, + ) return is_yaml, result def _pop_and_ignore_fields_in_override_params( - params: Dict[str, Any], field_to_ignore: List[str]) -> None: + params: Dict[str, Any], field_to_ignore: List[str] +) -> None: """Pops and ignores fields in override params. Args: @@ -689,14 +748,15 @@ def _pop_and_ignore_fields_in_override_params( for field in field_to_ignore: field_value = params.pop(field, None) if field_value is not None: - click.secho(f'Override param {field}={field_value} is ignored.', - fg='yellow') + click.secho( + f"Override param {field}={field_value} is ignored.", fg="yellow" + ) def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Tuple[str, ...], *, - entrypoint_name: str = 'Task', + entrypoint_name: str = "Task", name: Optional[str] = None, workdir: Optional[str] = None, cloud: Optional[str] = None, @@ -723,40 +783,37 @@ def _make_task_or_dag_from_entrypoint_with_overrides( A dag iff the entrypoint is YAML and contains more than 1 task. Otherwise, a task. 
""" - entrypoint = ' '.join(entrypoint) + entrypoint = " ".join(entrypoint) is_yaml, _ = _check_yaml(entrypoint) entrypoint: Optional[str] if is_yaml: # Treat entrypoint as a yaml. - click.secho(f'{entrypoint_name} from YAML spec: ', - fg='yellow', - nl=False) + click.secho(f"{entrypoint_name} from YAML spec: ", fg="yellow", nl=False) click.secho(entrypoint, bold=True) else: if not entrypoint: entrypoint = None else: # Treat entrypoint as a bash command. - click.secho(f'{entrypoint_name} from command: ', - fg='yellow', - nl=False) + click.secho(f"{entrypoint_name} from command: ", fg="yellow", nl=False) click.secho(entrypoint, bold=True) - override_params = _parse_override_params(cloud=cloud, - region=region, - zone=zone, - gpus=gpus, - cpus=cpus, - memory=memory, - instance_type=instance_type, - use_spot=use_spot, - image_id=image_id, - disk_size=disk_size, - disk_tier=disk_tier, - ports=ports) + override_params = _parse_override_params( + cloud=cloud, + region=region, + zone=zone, + gpus=gpus, + cpus=cpus, + memory=memory, + instance_type=instance_type, + use_spot=use_spot, + image_id=image_id, + disk_size=disk_size, + disk_tier=disk_tier, + ports=ports, + ) if field_to_ignore is not None: - _pop_and_ignore_fields_in_override_params(override_params, - field_to_ignore) + _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore) if is_yaml: assert entrypoint is not None @@ -768,15 +825,17 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # override params. if override_params: click.secho( - f'WARNING: override params {override_params} are ignored, ' - 'since the yaml file contains multiple tasks.', - fg='yellow') + f"WARNING: override params {override_params} are ignored, " + "since the yaml file contains multiple tasks.", + fg="yellow", + ) return dag - assert len(dag.tasks) == 1, ( - f'If you see this, please file an issue; tasks: {dag.tasks}') + assert ( + len(dag.tasks) == 1 + ), f"If you see this, please file an issue; tasks: {dag.tasks}" task = dag.tasks[0] else: - task = sky.Task(name='sky-cmd', run=entrypoint) + task = sky.Task(name="sky-cmd", run=entrypoint) task.set_resources({sky.Resources()}) # env update has been done for DAG in load_chain_dag_from_yaml for YAML. task.update_envs(env) @@ -787,7 +846,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # job launch specific. if job_recovery is not None: - override_params['job_recovery'] = job_recovery + override_params["job_recovery"] = job_recovery task.set_resources_override(override_params) @@ -807,7 +866,7 @@ class _NaturalOrderGroup(click.Group): def list_commands(self, ctx): return self.commands.keys() - @usage_lib.entrypoint('sky.cli', fallback=True) + @usage_lib.entrypoint("sky.cli", fallback=True) def invoke(self, ctx): return super().invoke(ctx) @@ -819,36 +878,38 @@ class _DocumentedCodeCommand(click.Command): def get_help(self, ctx): help_str = ctx.command.help - ctx.command.help = help_str.replace('.. code-block:: bash\n', '\b') + ctx.command.help = help_str.replace(".. 
code-block:: bash\n", "\b") return super().get_help(ctx) def _with_deprecation_warning( - f, - original_name: str, - alias_name: str, - override_command_argument: Optional[Dict[str, Any]] = None): + f, + original_name: str, + alias_name: str, + override_command_argument: Optional[Dict[str, Any]] = None, +): @functools.wraps(f) def wrapper(self, *args, **kwargs): - override_str = '' + override_str = "" if override_command_argument is not None: overrides = [] for k, v in override_command_argument.items(): if isinstance(v, bool): if v: - overrides.append(f'--{k}') + overrides.append(f"--{k}") else: - overrides.append(f'--no-{k}') + overrides.append(f"--no-{k}") else: overrides.append(f'--{k.replace("_", "-")}={v}') - override_str = ' with additional arguments ' + ' '.join(overrides) + override_str = " with additional arguments " + " ".join(overrides) click.secho( - f'WARNING: `{alias_name}` has been renamed to `{original_name}` ' - f'and will be removed in a future release. Please use the ' - f'latter{override_str} instead.\n', + f"WARNING: `{alias_name}` has been renamed to `{original_name}` " + f"and will be removed in a future release. Please use the " + f"latter{override_str} instead.\n", err=True, - fg='yellow') + fg="yellow", + ) return f(self, *args, **kwargs) return wrapper @@ -857,7 +918,7 @@ def wrapper(self, *args, **kwargs): def _override_arguments(callback, override_command_argument: Dict[str, Any]): def wrapper(*args, **kwargs): - logger.info(f'Overriding arguments: {override_command_argument}') + logger.info(f"Overriding arguments: {override_command_argument}") kwargs.update(override_command_argument) return callback(*args, **kwargs) @@ -879,161 +940,194 @@ def _add_command_alias( if new_command_name is None: new_command_name = command.name if new_group == group and new_command_name == command.name: - raise ValueError('Cannot add an alias to the same command.') + raise ValueError("Cannot add an alias to the same command.") new_command = copy.deepcopy(command) new_command.hidden = hidden new_command.name = new_command_name if override_command_argument: - new_command.callback = _override_arguments(new_command.callback, - override_command_argument) + new_command.callback = _override_arguments( + new_command.callback, override_command_argument + ) - orig = f'sky {group.name} {command.name}' - alias = f'sky {new_group.name} {new_command_name}' + orig = f"sky {group.name} {command.name}" + alias = f"sky {new_group.name} {new_command_name}" if with_warning: new_command.invoke = _with_deprecation_warning( new_command.invoke, orig, alias, - override_command_argument=override_command_argument) + override_command_argument=override_command_argument, + ) new_group.add_command(new_command, name=new_command_name) -def _deprecate_and_hide_command(group, command_to_deprecate, - alternative_command): +def _deprecate_and_hide_command(group, command_to_deprecate, alternative_command): """Hide a command and show a deprecation note, hinting the alternative.""" command_to_deprecate.hidden = True if group is not None: - orig = f'sky {group.name} {command_to_deprecate.name}' + orig = f"sky {group.name} {command_to_deprecate.name}" else: - orig = f'sky {command_to_deprecate.name}' + orig = f"sky {command_to_deprecate.name}" command_to_deprecate.invoke = _with_deprecation_warning( - command_to_deprecate.invoke, alternative_command, orig) + command_to_deprecate.invoke, alternative_command, orig + ) @click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS) 
-@click.option('--install-shell-completion', - type=click.Choice(['bash', 'zsh', 'fish', 'auto']), - callback=_install_shell_completion, - expose_value=False, - is_eager=True, - help='Install shell completion for the specified shell.') -@click.option('--uninstall-shell-completion', - type=click.Choice(['bash', 'zsh', 'fish', 'auto']), - callback=_uninstall_shell_completion, - expose_value=False, - is_eager=True, - help='Uninstall shell completion for the specified shell.') -@click.version_option(sky.__version__, '--version', '-v', prog_name='skypilot') -@click.version_option(sky.__commit__, - '--commit', - '-c', - prog_name='skypilot', - message='%(prog)s, commit %(version)s', - help='Show the commit hash and exit') +@click.option( + "--install-shell-completion", + type=click.Choice(["bash", "zsh", "fish", "auto"]), + callback=_install_shell_completion, + expose_value=False, + is_eager=True, + help="Install shell completion for the specified shell.", +) +@click.option( + "--uninstall-shell-completion", + type=click.Choice(["bash", "zsh", "fish", "auto"]), + callback=_uninstall_shell_completion, + expose_value=False, + is_eager=True, + help="Uninstall shell completion for the specified shell.", +) +@click.version_option(sky.__version__, "--version", "-v", prog_name="skypilot") +@click.version_option( + sky.__commit__, + "--commit", + "-c", + prog_name="skypilot", + message="%(prog)s, commit %(version)s", + help="Show the commit hash and exit", +) def cli(): pass @cli.command(cls=_DocumentedCodeCommand) -@click.argument('entrypoint', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) -@click.option('--cluster', - '-c', - default=None, - type=str, - **_get_shell_complete_args(_complete_cluster_name), - help=_CLUSTER_FLAG_HELP) -@click.option('--dryrun', - default=False, - is_flag=True, - help='If True, do not actually run the job.') +@click.argument( + "entrypoint", + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) @click.option( - '--detach-setup', - '-s', + "--cluster", + "-c", + default=None, + type=str, + **_get_shell_complete_args(_complete_cluster_name), + help=_CLUSTER_FLAG_HELP, +) +@click.option( + "--dryrun", + default=False, + is_flag=True, + help="If True, do not actually run the job.", +) +@click.option( + "--detach-setup", + "-s", default=False, is_flag=True, - help= - ('If True, run setup in non-interactive mode as part of the job itself. ' - 'You can safely ctrl-c to detach from logging, and it will not interrupt ' - 'the setup process. To see the logs again after detaching, use `sky logs`.' - ' To cancel setup, cancel the job via `sky cancel`. Useful for long-' - 'running setup commands.')) + help=( + "If True, run setup in non-interactive mode as part of the job itself. " + "You can safely ctrl-c to detach from logging, and it will not interrupt " + "the setup process. To see the logs again after detaching, use `sky logs`." + " To cancel setup, cancel the job via `sky cancel`. Useful for long-" + "running setup commands." 
+ ), +) @click.option( - '--detach-run', - '-d', + "--detach-run", + "-d", default=False, is_flag=True, - help=('If True, as soon as a job is submitted, return from this call ' - 'and do not stream execution logs.')) -@click.option('--docker', - 'backend_name', - flag_value=backends.LocalDockerBackend.NAME, - default=False, - help='If used, runs locally inside a docker container.') + help=( + "If True, as soon as a job is submitted, return from this call " + "and do not stream execution logs." + ), +) +@click.option( + "--docker", + "backend_name", + flag_value=backends.LocalDockerBackend.NAME, + default=False, + help="If used, runs locally inside a docker container.", +) @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @click.option( - '--idle-minutes-to-autostop', - '-i', + "--idle-minutes-to-autostop", + "-i", default=None, type=int, required=False, - help=('Automatically stop the cluster after this many minutes ' - 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' - 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' - 'are found in the job queue. ' - 'Setting this flag is equivalent to ' - 'running ``sky launch -d ...`` and then ``sky autostop -i ``' - '. If not set, the cluster will not be autostopped.')) + help=( + "Automatically stop the cluster after this many minutes " + "of idleness, i.e., no running or pending jobs in the cluster's job " + "queue. Idleness gets reset whenever setting-up/running/pending jobs " + "are found in the job queue. " + "Setting this flag is equivalent to " + "running ``sky launch -d ...`` and then ``sky autostop -i ``" + ". If not set, the cluster will not be autostopped." + ), +) @click.option( - '--down', + "--down", default=False, is_flag=True, required=False, - help= - ('Autodown the cluster: tear down the cluster after all jobs finish ' - '(successfully or abnormally). If --idle-minutes-to-autostop is also set, ' - 'the cluster will be torn down after the specified idle time. ' - 'Note that if errors occur during provisioning/data syncing/setting up, ' - 'the cluster will not be torn down for debugging purposes.'), + help=( + "Autodown the cluster: tear down the cluster after all jobs finish " + "(successfully or abnormally). If --idle-minutes-to-autostop is also set, " + "the cluster will be torn down after the specified idle time. " + "Note that if errors occur during provisioning/data syncing/setting up, " + "the cluster will not be torn down for debugging purposes." + ), ) @click.option( - '--retry-until-up', - '-r', + "--retry-until-up", + "-r", default=False, is_flag=True, required=False, - help=('Whether to retry provisioning infinitely until the cluster is up, ' - 'if we fail to launch the cluster on any possible region/cloud due ' - 'to unavailability errors.'), + help=( + "Whether to retry provisioning infinitely until the cluster is up, " + "if we fail to launch the cluster on any possible region/cloud due " + "to unavailability errors." + ), ) @click.option( - '--yes', - '-y', + "--yes", + "-y", is_flag=True, default=False, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. 
# pylint: disable=bad-docstring-quotes - help='Skip confirmation prompt.') -@click.option('--no-setup', - is_flag=True, - default=False, - required=False, - help='Skip setup phase when (re-)launching cluster.') + help="Skip confirmation prompt.", +) +@click.option( + "--no-setup", + is_flag=True, + default=False, + required=False, + help="Skip setup phase when (re-)launching cluster.", +) @click.option( - '--clone-disk-from', - '--clone', + "--clone-disk-from", + "--clone", default=None, type=str, **_get_shell_complete_args(_complete_cluster_name), - help=('[Experimental] Clone disk from an existing cluster to launch ' - 'a new one. This is useful when the new cluster needs to have ' - 'the same data on the boot disk as an existing cluster.')) + help=( + "[Experimental] Clone disk from an existing cluster to launch " + "a new one. This is useful when the new cluster needs to have " + "the same data on the boot disk as an existing cluster." + ), +) @usage_lib.entrypoint def launch( entrypoint: Tuple[str, ...], @@ -1077,7 +1171,8 @@ def launch( # NOTE(dev): Keep the docstring consistent between the Python API and CLI. env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str='Launching tasks on it') + cluster, operation_str="Launching tasks on it" + ) if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1101,8 +1196,7 @@ def launch( ports=ports, ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError( - _DAG_NOT_SUPPORTED_MESSAGE.format(command='sky launch')) + raise click.UsageError(_DAG_NOT_SUPPORTED_MESSAGE.format(command="sky launch")) task = task_or_dag backend: backends.Backend @@ -1112,55 +1206,66 @@ def launch( backend = backends.CloudVmRayBackend() else: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'{backend_name} backend is not supported.') + raise ValueError(f"{backend_name} backend is not supported.") if task.service is not None: logger.info( - f'{colorama.Fore.YELLOW}Service section will be ignored when using ' - f'`sky launch`. {colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}' - 'To spin up a service, use SkyServe CLI: ' - f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up' - f'{colorama.Style.RESET_ALL}') - - _launch_with_confirm(task, - backend, - cluster, - dryrun=dryrun, - detach_setup=detach_setup, - detach_run=detach_run, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes_to_autostop, - down=down, - retry_until_up=retry_until_up, - no_setup=no_setup, - clone_disk_from=clone_disk_from) + f"{colorama.Fore.YELLOW}Service section will be ignored when using " + f"`sky launch`. 
{colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}" + "To spin up a service, use SkyServe CLI: " + f"{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up" + f"{colorama.Style.RESET_ALL}" + ) + + _launch_with_confirm( + task, + backend, + cluster, + dryrun=dryrun, + detach_setup=detach_setup, + detach_run=detach_run, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + down=down, + retry_until_up=retry_until_up, + no_setup=no_setup, + clone_disk_from=clone_disk_from, + ) @cli.command(cls=_DocumentedCodeCommand) -@click.argument('cluster', - required=False, - type=str, - **_get_shell_complete_args(_complete_cluster_name)) +@click.argument( + "cluster", + required=False, + type=str, + **_get_shell_complete_args(_complete_cluster_name), +) @click.option( - '--cluster', - '-c', - 'cluster_option', + "--cluster", + "-c", + "cluster_option", hidden=True, type=str, - help='This is the same as the positional argument, just for consistency.', - **_get_shell_complete_args(_complete_cluster_name)) -@click.argument('entrypoint', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) + help="This is the same as the positional argument, just for consistency.", + **_get_shell_complete_args(_complete_cluster_name), +) +@click.argument( + "entrypoint", + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) @click.option( - '--detach-run', - '-d', + "--detach-run", + "-d", default=False, is_flag=True, - help=('If True, as soon as a job is submitted, return from this call ' - 'and do not stream execution logs.')) + help=( + "If True, as soon as a job is submitted, return from this call " + "and do not stream execution logs." + ), +) @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @usage_lib.entrypoint # pylint: disable=redefined-builtin @@ -1246,23 +1351,24 @@ def exec( """ if cluster_option is None and cluster is None: - raise click.UsageError('Missing argument \'[CLUSTER]\' and ' - '\'[ENTRYPOINT]...\'') + raise click.UsageError("Missing argument '[CLUSTER]' and " "'[ENTRYPOINT]...'") if cluster_option is not None: if cluster is not None: entrypoint = (cluster,) + entrypoint cluster = cluster_option if not entrypoint: - raise click.UsageError('Missing argument \'[ENTRYPOINT]...\'') + raise click.UsageError("Missing argument '[ENTRYPOINT]...'") assert cluster is not None, (cluster, cluster_option, entrypoint) env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str='Executing task on it') + cluster, operation_str="Executing task on it" + ) handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: - raise click.BadParameter(f'Cluster {cluster!r} not found. ' - 'Use `sky launch` to provision first.') + raise click.BadParameter( + f"Cluster {cluster!r} not found. " "Use `sky launch` to provision first." + ) backend = backend_utils.get_backend_from_handle(handle) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -1283,24 +1389,26 @@ def exec( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'], + field_to_ignore=["cpus", "memory", "disk_size", "disk_tier", "ports"], ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError('YAML specifies a DAG, while `sky exec` ' - 'supports a single task only.') + raise click.UsageError( + "YAML specifies a DAG, while `sky exec` " "supports a single task only." 
+ ) task = task_or_dag - click.secho(f'Executing task on cluster {cluster}...', fg='yellow') + click.secho(f"Executing task on cluster {cluster}...", fg="yellow") sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run) def _get_managed_jobs( - refresh: bool, - skip_finished: bool, - show_all: bool, - limit_num_jobs_to_show: bool = False, - is_called_by_user: bool = False) -> Tuple[Optional[int], str]: + refresh: bool, + skip_finished: bool, + show_all: bool, + limit_num_jobs_to_show: bool = False, + is_called_by_user: bool = False, +) -> Tuple[Optional[int], str]: """Get the in-progress managed jobs. Args: @@ -1326,30 +1434,35 @@ def _get_managed_jobs( usage_lib.messages.usage.set_internal() with sky_logging.silent(): # Make the call silent - managed_jobs_ = managed_jobs.queue(refresh=refresh, - skip_finished=skip_finished) - num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_)) + managed_jobs_ = managed_jobs.queue( + refresh=refresh, skip_finished=skip_finished + ) + num_in_progress_jobs = len(set(job["job_id"] for job in managed_jobs_)) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status msg = str(e) if controller_status is None: - msg += (f' (See: {colorama.Style.BRIGHT}sky jobs -h' - f'{colorama.Style.RESET_ALL})') - elif (controller_status == status_lib.ClusterStatus.STOPPED and - is_called_by_user): - msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}' - f'sky jobs queue --refresh{colorama.Style.RESET_ALL})') + msg += ( + f" (See: {colorama.Style.BRIGHT}sky jobs -h" + f"{colorama.Style.RESET_ALL})" + ) + elif ( + controller_status == status_lib.ClusterStatus.STOPPED and is_called_by_user + ): + msg += ( + f" (See finished managed jobs: {colorama.Style.BRIGHT}" + f"sky jobs queue --refresh{colorama.Style.RESET_ALL})" + ) except RuntimeError as e: - msg = '' + msg = "" try: # Check the controller status again, as the RuntimeError is likely # due to the controller being autostopped when querying the jobs. controller_type = controller_utils.Controllers.JOBS_CONTROLLER record = backend_utils.refresh_cluster_record( - controller_type.value.cluster_name, - cluster_status_lock_timeout=0) - if (record is None or - record['status'] == status_lib.ClusterStatus.STOPPED): + controller_type.value.cluster_name, cluster_status_lock_timeout=0 + ) + if record is None or record["status"] == status_lib.ClusterStatus.STOPPED: msg = controller_type.value.default_hint_if_non_existent except Exception: # pylint: disable=broad-except # This is to an best effort to find the latest controller status to @@ -1358,26 +1471,31 @@ def _get_managed_jobs( pass if not msg: msg = ( - 'Failed to query managed jobs due to connection ' - 'issues. Try again later. ' - f'Details: {common_utils.format_exception(e, use_bracket=True)}' + "Failed to query managed jobs due to connection " + "issues. Try again later. 
" + f"Details: {common_utils.format_exception(e, use_bracket=True)}" ) except Exception as e: # pylint: disable=broad-except - msg = ('Failed to query managed jobs: ' - f'{common_utils.format_exception(e, use_bracket=True)}') + msg = ( + "Failed to query managed jobs: " + f"{common_utils.format_exception(e, use_bracket=True)}" + ) else: - max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS - if limit_num_jobs_to_show else None) - msg = managed_jobs.format_job_table(managed_jobs_, - show_all=show_all, - max_jobs=max_jobs_to_show) + max_jobs_to_show = ( + _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS if limit_num_jobs_to_show else None + ) + msg = managed_jobs.format_job_table( + managed_jobs_, show_all=show_all, max_jobs=max_jobs_to_show + ) return num_in_progress_jobs, msg -def _get_services(service_names: Optional[List[str]], - show_all: bool, - show_endpoint: bool, - is_called_by_user: bool = False) -> Tuple[Optional[int], str]: +def _get_services( + service_names: Optional[List[str]], + show_all: bool, + show_endpoint: bool, + is_called_by_user: bool = False, +) -> Tuple[Optional[int], str]: """Get service statuses. Args: @@ -1406,20 +1524,21 @@ def _get_services(service_names: Optional[List[str]], controller_status = e.cluster_status msg = str(e) if controller_status is None: - msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h' - f'{colorama.Style.RESET_ALL})') + msg += ( + f" (See: {colorama.Style.BRIGHT}sky serve -h" + f"{colorama.Style.RESET_ALL})" + ) except RuntimeError as e: - msg = '' + msg = "" try: # Check the controller status again, as the RuntimeError is likely # due to the controller being autostopped when querying the # services. controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER record = backend_utils.refresh_cluster_record( - controller_type.value.cluster_name, - cluster_status_lock_timeout=0) - if (record is None or - record['status'] == status_lib.ClusterStatus.STOPPED): + controller_type.value.cluster_name, cluster_status_lock_timeout=0 + ) + if record is None or record["status"] == status_lib.ClusterStatus.STOPPED: msg = controller_type.value.default_hint_if_non_existent except Exception: # pylint: disable=broad-except # This is to an best effort to find the latest controller status to @@ -1427,92 +1546,128 @@ def _get_services(service_names: Optional[List[str]], # print the original error. pass if not msg: - msg = ('Failed to fetch service statuses due to connection issues. ' - 'Please try again later. Details: ' - f'{common_utils.format_exception(e, use_bracket=True)}') + msg = ( + "Failed to fetch service statuses due to connection issues. " + "Please try again later. Details: " + f"{common_utils.format_exception(e, use_bracket=True)}" + ) except Exception as e: # pylint: disable=broad-except - msg = ('Failed to fetch service statuses: ' - f'{common_utils.format_exception(e, use_bracket=True)}') + msg = ( + "Failed to fetch service statuses: " + f"{common_utils.format_exception(e, use_bracket=True)}" + ) else: if show_endpoint: if len(service_records) != 1: - plural = 's' if len(service_records) > 1 else '' - service_num = (str(len(service_records)) - if len(service_records) > 0 else 'No') + plural = "s" if len(service_records) > 1 else "" + service_num = ( + str(len(service_records)) if len(service_records) > 0 else "No" + ) raise click.UsageError( - f'{service_num} service{plural} found. Please specify ' - 'an existing service to show its endpoint. Usage: ' - 'sky serve status --endpoint ') + f"{service_num} service{plural} found. 
Please specify " + "an existing service to show its endpoint. Usage: " + "sky serve status --endpoint " + ) msg = serve_lib.get_endpoint(service_records[0]) else: msg = serve_lib.format_service_table(service_records, show_all) - service_not_found_msg = '' + service_not_found_msg = "" if service_names is not None: for service_name in service_names: - if not any(service_name == record['name'] - for record in service_records): + if not any( + service_name == record["name"] for record in service_records + ): service_not_found_msg += ( - f'\nService {service_name!r} not found.') + f"\nService {service_name!r} not found." + ) if service_not_found_msg: - msg += f'\n{service_not_found_msg}' + msg += f"\n{service_not_found_msg}" return num_services, msg @cli.command() -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all information in full.') @click.option( - '--refresh', - '-r', + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all information in full.", +) +@click.option( + "--refresh", + "-r", + default=False, + is_flag=True, + required=False, + help="Query the latest cluster statuses from the cloud provider(s).", +) +@click.option( + "--ip", + default=False, + is_flag=True, + required=False, + help=( + "Get the IP address of the head node of a cluster. This " + "option will override all other options. For Kubernetes " + "clusters, the returned IP address is the internal IP " + "of the head pod, and may not be accessible from outside " + "the cluster." + ), +) +@click.option( + "--endpoints", default=False, is_flag=True, required=False, - help='Query the latest cluster statuses from the cloud provider(s).') -@click.option('--ip', - default=False, - is_flag=True, - required=False, - help=('Get the IP address of the head node of a cluster. This ' - 'option will override all other options. For Kubernetes ' - 'clusters, the returned IP address is the internal IP ' - 'of the head pod, and may not be accessible from outside ' - 'the cluster.')) -@click.option('--endpoints', - default=False, - is_flag=True, - required=False, - help=('Get all exposed endpoints and corresponding URLs for a' - 'cluster. This option will override all other options.')) -@click.option('--endpoint', - required=False, - default=None, - type=int, - help=('Get the endpoint URL for the specified port number on the ' - 'cluster. This option will override all other options.')) -@click.option('--show-managed-jobs/--no-show-managed-jobs', - default=True, - is_flag=True, - required=False, - help='Also show recent in-progress managed jobs, if any.') -@click.option('--show-services/--no-show-services', - default=True, - is_flag=True, - required=False, - help='Also show sky serve services, if any.') -@click.argument('clusters', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_cluster_name)) + help=( + "Get all exposed endpoints and corresponding URLs for a" + "cluster. This option will override all other options." + ), +) +@click.option( + "--endpoint", + required=False, + default=None, + type=int, + help=( + "Get the endpoint URL for the specified port number on the " + "cluster. This option will override all other options." 
+ ), +) +@click.option( + "--show-managed-jobs/--no-show-managed-jobs", + default=True, + is_flag=True, + required=False, + help="Also show recent in-progress managed jobs, if any.", +) +@click.option( + "--show-services/--no-show-services", + default=True, + is_flag=True, + required=False, + help="Also show sky serve services, if any.", +) +@click.argument( + "clusters", + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_cluster_name), +) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def status(all: bool, refresh: bool, ip: bool, endpoints: bool, - endpoint: Optional[int], show_managed_jobs: bool, - show_services: bool, clusters: List[str]): +def status( + all: bool, + refresh: bool, + ip: bool, + endpoints: bool, + endpoint: Optional[int], + show_managed_jobs: bool, + show_services: bool, + clusters: List[str], +): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Show clusters. @@ -1577,127 +1732,159 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, with multiprocessing.Pool(2) as pool: # Do not show job queue if user specifies clusters, and if user # specifies --ip or --endpoint(s). - show_managed_jobs = show_managed_jobs and not any( - [clusters, ip, endpoints]) + show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints]) show_endpoints = endpoints or endpoint is not None show_single_endpoint = endpoint is not None if show_managed_jobs: # Run managed job query in parallel to speed up the status query. managed_jobs_future = pool.apply_async( _get_managed_jobs, - kwds=dict(refresh=False, - skip_finished=True, - show_all=False, - limit_num_jobs_to_show=not all, - is_called_by_user=False)) + kwds=dict( + refresh=False, + skip_finished=True, + show_all=False, + limit_num_jobs_to_show=not all, + is_called_by_user=False, + ), + ) show_services = show_services and not clusters and not ip if show_services: # Run the sky serve service query in parallel to speed up the # status query. - services_future = pool.apply_async(_get_services, - kwds=dict( - service_names=None, - show_all=False, - show_endpoint=False, - is_called_by_user=False)) + services_future = pool.apply_async( + _get_services, + kwds=dict( + service_names=None, + show_all=False, + show_endpoint=False, + is_called_by_user=False, + ), + ) if ip or show_endpoints: if refresh: raise click.UsageError( - 'Using --ip or --endpoint(s) with --refresh is not' - 'supported for now. To fix, refresh first, ' - 'then query the IP or endpoint.') + "Using --ip or --endpoint(s) with --refresh is not" + "supported for now. To fix, refresh first, " + "then query the IP or endpoint." + ) if ip and show_endpoints: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Cannot specify both --ip and --endpoint(s) ' - 'at the same time.') + "Cannot specify both --ip and --endpoint(s) " + "at the same time." + ) if endpoint is not None and endpoints: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Cannot specify both --endpoint and --endpoints ' - 'at the same time.') + "Cannot specify both --endpoint and --endpoints " + "at the same time." 
+ ) if len(clusters) != 1: with ux_utils.print_exception_no_traceback(): - plural = 's' if len(clusters) > 1 else '' - cluster_num = (str(len(clusters)) - if len(clusters) > 0 else 'No') - cause = 'a single' if len(clusters) > 1 else 'an existing' + plural = "s" if len(clusters) > 1 else "" + cluster_num = str(len(clusters)) if len(clusters) > 0 else "No" + cause = "a single" if len(clusters) > 1 else "an existing" raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, - verb='specified', + verb="specified", cause=cause, - property='IP address' if ip else 'endpoint(s)', - flag='ip' if ip else - ('endpoint port' - if show_single_endpoint else 'endpoints'))) + property="IP address" if ip else "endpoint(s)", + flag=( + "ip" + if ip + else ( + "endpoint port" + if show_single_endpoint + else "endpoints" + ) + ), + ) + ) else: - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters' - f'{colorama.Style.RESET_ALL}') + click.echo( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters" + f"{colorama.Style.RESET_ALL}" + ) query_clusters: Optional[List[str]] = None if clusters: query_clusters = _get_glob_clusters(clusters, silent=ip) - cluster_records = core.status(cluster_names=query_clusters, - refresh=refresh) + cluster_records = core.status(cluster_names=query_clusters, refresh=refresh) if ip or show_endpoints: if len(cluster_records) != 1: with ux_utils.print_exception_no_traceback(): - plural = 's' if len(cluster_records) > 1 else '' - cluster_num = (str(len(cluster_records)) - if len(cluster_records) > 0 else - f'{clusters[0]!r}') - verb = 'found' if len(cluster_records) > 0 else 'not found' - cause = 'a single' if len(clusters) > 1 else 'an existing' + plural = "s" if len(cluster_records) > 1 else "" + cluster_num = ( + str(len(cluster_records)) + if len(cluster_records) > 0 + else f"{clusters[0]!r}" + ) + verb = "found" if len(cluster_records) > 0 else "not found" + cause = "a single" if len(clusters) > 1 else "an existing" raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, verb=verb, cause=cause, - property='IP address' if ip else 'endpoint(s)', - flag='ip' if ip else - ('endpoint port' - if show_single_endpoint else 'endpoints'))) + property="IP address" if ip else "endpoint(s)", + flag=( + "ip" + if ip + else ( + "endpoint port" + if show_single_endpoint + else "endpoints" + ) + ), + ) + ) cluster_record = cluster_records[0] - if cluster_record['status'] != status_lib.ClusterStatus.UP: + if cluster_record["status"] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): - raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' - 'is not in UP status.') - handle = cluster_record['handle'] + raise RuntimeError( + f'Cluster {cluster_record["name"]!r} ' "is not in UP status." + ) + handle = cluster_record["handle"] if not isinstance(handle, backends.CloudVmRayResourceHandle): with ux_utils.print_exception_no_traceback(): - raise ValueError('Querying IP address is not supported ' - 'for local clusters.') + raise ValueError( + "Querying IP address is not supported " "for local clusters." 
+ ) head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = core.endpoints(cluster_record['name'], - endpoint).get( - endpoint, None) + cluster_endpoint = core.endpoints( + cluster_record["name"], endpoint + ).get(endpoint, None) if not cluster_endpoint: raise click.Abort( - f'Endpoint {endpoint} not found for cluster ' - f'{cluster_record["name"]!r}.') + f"Endpoint {endpoint} not found for cluster " + f'{cluster_record["name"]!r}.' + ) click.echo(cluster_endpoint) else: - cluster_endpoints = core.endpoints(cluster_record['name']) + cluster_endpoints = core.endpoints(cluster_record["name"]) assert isinstance(cluster_endpoints, dict) if not cluster_endpoints: - raise click.Abort(f'No endpoint found for cluster ' - f'{cluster_record["name"]!r}.') + raise click.Abort( + f"No endpoint found for cluster " + f'{cluster_record["name"]!r}.' + ) for port, port_endpoint in cluster_endpoints.items(): click.echo( - f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' - f'{colorama.Style.RESET_ALL}: ' - f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'{port_endpoint}{colorama.Style.RESET_ALL}') + f"{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}" + f"{colorama.Style.RESET_ALL}: " + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"{port_endpoint}{colorama.Style.RESET_ALL}" + ) return click.echo(head_ip) return @@ -1705,7 +1892,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, normal_clusters = [] controllers = [] for cluster_record in cluster_records: - cluster_name = cluster_record['name'] + cluster_name = cluster_record["name"] controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controllers.append(cluster_record) @@ -1714,7 +1901,8 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool, num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - normal_clusters + controllers, all) + normal_clusters + controllers, all + ) def _try_get_future_result(future) -> Tuple[bool, Any]: result = None @@ -1728,61 +1916,69 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: managed_jobs_query_interrupted = False if show_managed_jobs: - click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Managed jobs{colorama.Style.RESET_ALL}') - with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): + click.echo( + f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Managed jobs{colorama.Style.RESET_ALL}" + ) + with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): managed_jobs_query_interrupted, result = _try_get_future_result( - managed_jobs_future) + managed_jobs_future + ) if managed_jobs_query_interrupted: # Set to -1, so that the controller is not considered # down, and the hint for showing sky jobs queue # will still be shown. num_in_progress_jobs = -1 - msg = 'KeyboardInterrupt' + msg = "KeyboardInterrupt" else: num_in_progress_jobs, msg = result click.echo(msg) if num_in_progress_jobs is not None: # jobs controller is UP. 
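+            # For example (values illustrative), the job_info string built
+            # below reads "1 managed job is in progress. " for a single job,
+            # and "7 managed jobs are in progress (5 latest ones shown). "
+            # once the count exceeds _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS (5).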
- job_info = '' + job_info = "" if num_in_progress_jobs > 0: - plural_and_verb = ' is' + plural_and_verb = " is" if num_in_progress_jobs > 1: - plural_and_verb = 's are' + plural_and_verb = "s are" job_info = ( - f'{num_in_progress_jobs} managed job{plural_and_verb} ' - 'in progress') - if (num_in_progress_jobs > - _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS): + f"{num_in_progress_jobs} managed job{plural_and_verb} " + "in progress" + ) + if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS: job_info += ( - f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest ' - 'ones shown)') - job_info += '. ' + f" ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest " + "ones shown)" + ) + job_info += ". " hints.append( - controller_utils.Controllers.JOBS_CONTROLLER.value. - in_progress_hint.format(job_info=job_info)) + controller_utils.Controllers.JOBS_CONTROLLER.value.in_progress_hint.format( + job_info=job_info + ) + ) if show_services: - click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Services{colorama.Style.RESET_ALL}') + click.echo( + f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Services{colorama.Style.RESET_ALL}" + ) num_services = None if managed_jobs_query_interrupted: # The pool is terminated, so we cannot run the service query. - msg = 'KeyboardInterrupt' + msg = "KeyboardInterrupt" else: - with rich_utils.safe_status('[cyan]Checking services[/]'): - interrupted, result = _try_get_future_result( - services_future) + with rich_utils.safe_status("[cyan]Checking services[/]"): + interrupted, result = _try_get_future_result(services_future) if interrupted: num_services = -1 - msg = 'KeyboardInterrupt' + msg = "KeyboardInterrupt" else: num_services, msg = result click.echo(msg) if num_services is not None: - hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. - value.in_progress_hint) + hints.append( + controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.in_progress_hint + ) if show_managed_jobs or show_services: try: @@ -1799,24 +1995,28 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: if num_pending_autostop > 0 and not refresh: # Don't print this hint if there's no pending autostop or user has # already passed --refresh. - plural_and_verb = ' has' + plural_and_verb = " has" if num_pending_autostop > 1: - plural_and_verb = 's have' - hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} ' - 'auto{stop,down} scheduled. Refresh statuses with: ' - f'{colorama.Style.BRIGHT}sky status --refresh' - f'{colorama.Style.RESET_ALL}') + plural_and_verb = "s have" + hints.append( + f"* {num_pending_autostop} cluster{plural_and_verb} " + "auto{stop,down} scheduled. Refresh statuses with: " + f"{colorama.Style.BRIGHT}sky status --refresh" + f"{colorama.Style.RESET_ALL}" + ) if hints: - click.echo('\n' + '\n'.join(hints)) + click.echo("\n" + "\n".join(hints)) @cli.command() -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all information in full.') +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all information in full.", +) @usage_lib.entrypoint def cost_report(all: bool): # pylint: disable=redefined-builtin # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
@@ -1842,7 +2042,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin normal_cluster_records = [] controllers = dict() for cluster_record in cluster_records: - cluster_name = cluster_record['name'] + cluster_name = cluster_record["name"] controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controller_name = controller.value.name @@ -1854,111 +2054,139 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin normal_cluster_records.append(cluster_record) total_cost = status_utils.get_total_cost_of_displayed_records( - normal_cluster_records, all) + normal_cluster_records, all + ) status_utils.show_cost_report_table(normal_cluster_records, all) for controller_name, cluster_record in controllers.items(): status_utils.show_cost_report_table( - [cluster_record], all, controller_name=controller_name.capitalize()) - total_cost += cluster_record['total_cost'] + [cluster_record], all, controller_name=controller_name.capitalize() + ) + total_cost += cluster_record["total_cost"] - click.echo(f'\n{colorama.Style.BRIGHT}' - f'Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}') + click.echo( + f"\n{colorama.Style.BRIGHT}" + f"Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}" + ) if not all: click.secho( - f'Showing up to {status_utils.NUM_COST_REPORT_LINES} ' - 'most recent clusters. ' - 'To see all clusters in history, ' - 'pass the --all flag.', - fg='yellow') + f"Showing up to {status_utils.NUM_COST_REPORT_LINES} " + "most recent clusters. " + "To see all clusters in history, " + "pass the --all flag.", + fg="yellow", + ) click.secho( - 'This feature is experimental. ' - 'Costs for clusters with auto{stop,down} ' - 'scheduled may not be accurate.', - fg='yellow') + "This feature is experimental. " + "Costs for clusters with auto{stop,down} " + "scheduled may not be accurate.", + fg="yellow", + ) @cli.command() -@click.option('--all-users', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all users\' information in full.') -@click.option('--skip-finished', - '-s', - default=False, - is_flag=True, - required=False, - help='Show only pending/running jobs\' information.') -@click.argument('clusters', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_cluster_name)) +@click.option( + "--all-users", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all users' information in full.", +) +@click.option( + "--skip-finished", + "-s", + default=False, + is_flag=True, + required=False, + help="Show only pending/running jobs' information.", +) +@click.argument( + "clusters", + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_cluster_name), +) @usage_lib.entrypoint def queue(clusters: List[str], skip_finished: bool, all_users: bool): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
"""Show the job queue for cluster(s).""" - click.secho('Fetching and parsing job queue...', fg='yellow') + click.secho("Fetching and parsing job queue...", fg="yellow") if clusters: clusters = _get_glob_clusters(clusters) else: cluster_infos = global_user_state.get_clusters() - clusters = [c['name'] for c in cluster_infos] + clusters = [c["name"] for c in cluster_infos] unsupported_clusters = [] for cluster in clusters: try: job_table = core.queue(cluster, skip_finished, all_users) - except (exceptions.CommandError, ValueError, - exceptions.NotSupportedError, exceptions.ClusterNotUpError, - exceptions.CloudUserIdentityError, - exceptions.ClusterOwnerIdentityMismatchError) as e: + except ( + exceptions.CommandError, + ValueError, + exceptions.NotSupportedError, + exceptions.ClusterNotUpError, + exceptions.CloudUserIdentityError, + exceptions.ClusterOwnerIdentityMismatchError, + ) as e: if isinstance(e, exceptions.NotSupportedError): unsupported_clusters.append(cluster) - click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for ' - f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n' - f' {common_utils.format_exception(e)}') + click.echo( + f"{colorama.Fore.YELLOW}Failed to get the job queue for " + f"cluster {cluster!r}.{colorama.Style.RESET_ALL}\n" + f" {common_utils.format_exception(e)}" + ) continue job_table = job_lib.format_job_queue(job_table) - click.echo(f'\nJob queue of cluster {cluster}\n{job_table}') + click.echo(f"\nJob queue of cluster {cluster}\n{job_table}") if unsupported_clusters: click.secho( - f'Note: Job queues are not supported on clusters: ' + f"Note: Job queues are not supported on clusters: " f'{", ".join(unsupported_clusters)}', - fg='yellow') + fg="yellow", + ) @cli.command() @click.option( - '--sync-down', - '-s', + "--sync-down", + "-s", is_flag=True, default=False, - help='Sync down the logs of a job to the local machine. For a distributed' - ' job, a separate log file from each worker will be downloaded.') + help="Sync down the logs of a job to the local machine. For a distributed" + " job, a separate log file from each worker will be downloaded.", +) @click.option( - '--status', + "--status", is_flag=True, default=False, - help=('If specified, do not show logs but exit with a status code for the ' - 'job\'s status: 0 for succeeded, or 1 for all other statuses.')) + help=( + "If specified, do not show logs but exit with a status code for the " + "job's status: 0 for succeeded, or 1 for all other statuses." + ), +) @click.option( - '--follow/--no-follow', + "--follow/--no-follow", is_flag=True, default=True, - help=('Follow the logs of a job. ' - 'If --no-follow is specified, print the log so far and exit. ' - '[default: --follow]')) -@click.argument('cluster', - required=True, - type=str, - **_get_shell_complete_args(_complete_cluster_name)) -@click.argument('job_ids', type=str, nargs=-1) + help=( + "Follow the logs of a job. " + "If --no-follow is specified, print the log so far and exit. " + "[default: --follow]" + ), +) +@click.argument( + "cluster", + required=True, + type=str, + **_get_shell_complete_args(_complete_cluster_name), +) +@click.argument("job_ids", type=str, nargs=-1) # TODO(zhwu): support logs by job name @usage_lib.entrypoint def logs( @@ -1986,13 +2214,15 @@ def logs( """ if sync_down and status: raise click.UsageError( - 'Both --sync_down and --status are specified ' - '(ambiguous). To fix: specify at most one of them.') + "Both --sync_down and --status are specified " + "(ambiguous). To fix: specify at most one of them." 
+ ) if len(job_ids) > 1 and not sync_down: raise click.UsageError( f'Cannot stream logs of multiple jobs (IDs: {", ".join(job_ids)}).' - '\nPass -s/--sync-down to download the logs instead.') + "\nPass -s/--sync-down to download the logs instead." + ) job_ids = None if not job_ids else job_ids @@ -2008,8 +2238,9 @@ def logs( # in core.tail_logs. job_id = job_ids[0] if not job_id.isdigit(): - raise click.UsageError(f'Invalid job ID {job_id}. ' - 'Job ID must be integers.') + raise click.UsageError( + f"Invalid job ID {job_id}. " "Job ID must be integers." + ) job_ids_to_query = [int(job_id)] else: # job_ids is either None or empty list, so it is safe to cast it here. @@ -2020,42 +2251,50 @@ def logs( # If job_ids is None and no job has been submitted to the cluster, # it will return {None: None}. if job_id is None: - click.secho(f'No job found on cluster {cluster!r}.', fg='red') + click.secho(f"No job found on cluster {cluster!r}.", fg="red") sys.exit(1) job_status = list(job_statuses.values())[0] - job_status_str = job_status.value if job_status is not None else 'None' - click.echo(f'Job {job_id}: {job_status_str}') + job_status_str = job_status.value if job_status is not None else "None" + click.echo(f"Job {job_id}: {job_status_str}") if job_status == job_lib.JobStatus.SUCCEEDED: return else: if job_status is None: - id_str = '' if job_id is None else f'{job_id} ' - click.secho(f'Job {id_str}not found', fg='red') + id_str = "" if job_id is None else f"{job_id} " + click.secho(f"Job {id_str}not found", fg="red") sys.exit(1) core.tail_logs(cluster, job_id, follow) @cli.command() -@click.argument('cluster', - required=True, - type=str, - **_get_shell_complete_args(_complete_cluster_name)) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Cancel all jobs on the specified cluster.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') -@click.argument('jobs', required=False, type=int, nargs=-1) +@click.argument( + "cluster", + required=True, + type=str, + **_get_shell_complete_args(_complete_cluster_name), +) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Cancel all jobs on the specified cluster.", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) +@click.argument("jobs", required=False, type=int, nargs=-1) @usage_lib.entrypoint -def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disable=redefined-builtin, redefined-outer-name +def cancel( + cluster: str, all: bool, jobs: List[int], yes: bool +): # pylint: disable=redefined-builtin, redefined-outer-name # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Cancel job(s). @@ -2079,26 +2318,30 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa job_identity_str = None job_ids_to_cancel = None if not jobs and not all: - click.echo(f'{colorama.Fore.YELLOW}No job IDs or --all provided; ' - 'cancelling the latest running job.' - f'{colorama.Style.RESET_ALL}') - job_identity_str = 'the latest running job' + click.echo( + f"{colorama.Fore.YELLOW}No job IDs or --all provided; " + "cancelling the latest running job." + f"{colorama.Style.RESET_ALL}" + ) + job_identity_str = "the latest running job" else: # Cancelling specific jobs or --all. 
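+        # NOTE: `len(job_ids)` below measures the joined string rather than
+        # the number of jobs, so a single multi-digit ID (e.g. "12") is still
+        # rendered as "jobs 12"; `len(jobs)` would give exact pluralization.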
- job_ids = ' '.join(map(str, jobs)) - plural = 's' if len(job_ids) > 1 else '' - job_identity_str = f'job{plural} {job_ids}' + job_ids = " ".join(map(str, jobs)) + plural = "s" if len(job_ids) > 1 else "" + job_identity_str = f"job{plural} {job_ids}" job_ids_to_cancel = jobs if all: - job_identity_str = 'all jobs' + job_identity_str = "all jobs" job_ids_to_cancel = None - job_identity_str += f' on cluster {cluster!r}' + job_identity_str += f" on cluster {cluster!r}" if not yes: - click.confirm(f'Cancelling {job_identity_str}. Proceed?', - default=True, - abort=True, - show_default=True) + click.confirm( + f"Cancelling {job_identity_str}. Proceed?", + default=True, + abort=True, + show_default=True, + ) try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) @@ -2115,21 +2358,23 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa @cli.command(cls=_DocumentedCodeCommand) -@click.argument('clusters', - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name)) -@click.option('--all', - '-a', - default=None, - is_flag=True, - help='Stop all existing clusters.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.argument( + "clusters", + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name), +) +@click.option( + "--all", "-a", default=None, is_flag=True, help="Stop all existing clusters." +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint def stop( clusters: List[str], @@ -2165,49 +2410,58 @@ def stop( sky stop -a """ - _down_or_stop_clusters(clusters, - apply_to_all=all, - down=False, - no_confirm=yes) + _down_or_stop_clusters(clusters, apply_to_all=all, down=False, no_confirm=yes) @cli.command(cls=_DocumentedCodeCommand) -@click.argument('clusters', - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name)) -@click.option('--all', - '-a', - default=None, - is_flag=True, - help='Apply this command to all existing clusters.') -@click.option('--idle-minutes', - '-i', - type=int, - default=None, - required=False, - help=('Set the idle minutes before autostopping the cluster. ' - 'See the doc above for detailed semantics.')) +@click.argument( + "clusters", + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name), +) @click.option( - '--cancel', + "--all", + "-a", + default=None, + is_flag=True, + help="Apply this command to all existing clusters.", +) +@click.option( + "--idle-minutes", + "-i", + type=int, + default=None, + required=False, + help=( + "Set the idle minutes before autostopping the cluster. " + "See the doc above for detailed semantics." + ), +) +@click.option( + "--cancel", default=False, is_flag=True, required=False, - help='Cancel any currently active auto{stop,down} setting for the ' - 'cluster. No-op if there is no active setting.') + help="Cancel any currently active auto{stop,down} setting for the " + "cluster. 
No-op if there is no active setting.", +) @click.option( - '--down', + "--down", default=False, is_flag=True, required=False, - help='Use autodown (tear down the cluster; non-restartable), instead ' - 'of autostop (restartable).') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') + help="Use autodown (tear down the cluster; non-restartable), instead " + "of autostop (restartable).", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint def autostop( clusters: List[str], @@ -2260,89 +2514,108 @@ def autostop( """ if cancel and idle_minutes is not None: raise click.UsageError( - 'Only one of --idle-minutes and --cancel should be specified. ' - f'cancel: {cancel}, idle_minutes: {idle_minutes}') + "Only one of --idle-minutes and --cancel should be specified. " + f"cancel: {cancel}, idle_minutes: {idle_minutes}" + ) if cancel: idle_minutes = -1 elif idle_minutes is None: idle_minutes = 5 - _down_or_stop_clusters(clusters, - apply_to_all=all, - down=down, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes) + _down_or_stop_clusters( + clusters, + apply_to_all=all, + down=down, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes, + ) @cli.command(cls=_DocumentedCodeCommand) -@click.argument('clusters', - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name)) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Start all existing clusters.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.argument( + "clusters", + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name), +) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Start all existing clusters.", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @click.option( - '--idle-minutes-to-autostop', - '-i', + "--idle-minutes-to-autostop", + "-i", default=None, type=int, required=False, - help=('Automatically stop the cluster after this many minutes ' - 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' - 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' - 'are found in the job queue. ' - 'Setting this flag is equivalent to ' - 'running ``sky launch -d ...`` and then ``sky autostop -i ``' - '. If not set, the cluster will not be autostopped.')) + help=( + "Automatically stop the cluster after this many minutes " + "of idleness, i.e., no running or pending jobs in the cluster's job " + "queue. Idleness gets reset whenever setting-up/running/pending jobs " + "are found in the job queue. " + "Setting this flag is equivalent to " + "running ``sky launch -d ...`` and then ``sky autostop -i ``" + ". If not set, the cluster will not be autostopped." + ), +) @click.option( - '--down', + "--down", default=False, is_flag=True, required=False, - help= - ('Autodown the cluster: tear down the cluster after specified minutes of ' - 'idle time after all jobs finish (successfully or abnormally). Requires ' - '--idle-minutes-to-autostop to be set.'), + help=( + "Autodown the cluster: tear down the cluster after specified minutes of " + "idle time after all jobs finish (successfully or abnormally). Requires " + "--idle-minutes-to-autostop to be set." 
+ ), ) @click.option( - '--retry-until-up', - '-r', + "--retry-until-up", + "-r", default=False, is_flag=True, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. # pylint: disable=bad-docstring-quotes - help=('Retry provisioning infinitely until the cluster is up, ' - 'if we fail to start the cluster due to unavailability errors.'), + help=( + "Retry provisioning infinitely until the cluster is up, " + "if we fail to start the cluster due to unavailability errors." + ), ) @click.option( - '--force', - '-f', + "--force", + "-f", default=False, is_flag=True, required=False, - help=('Force start the cluster even if it is already UP. Useful for ' - 'upgrading the SkyPilot runtime on the cluster.')) + help=( + "Force start the cluster even if it is already UP. Useful for " + "upgrading the SkyPilot runtime on the cluster." + ), +) @usage_lib.entrypoint # pylint: disable=redefined-builtin def start( - clusters: List[str], - all: bool, - yes: bool, - idle_minutes_to_autostop: Optional[int], - down: bool, # pylint: disable=redefined-outer-name - retry_until_up: bool, - force: bool): + clusters: List[str], + all: bool, + yes: bool, + idle_minutes_to_autostop: Optional[int], + down: bool, # pylint: disable=redefined-outer-name + retry_until_up: bool, + force: bool, +): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Restart cluster(s). @@ -2373,43 +2646,48 @@ def start( """ if down and idle_minutes_to_autostop is None: raise click.UsageError( - '--idle-minutes-to-autostop must be set if --down is set.') + "--idle-minutes-to-autostop must be set if --down is set." + ) to_start = [] if not clusters and not all: # UX: frequently users may have only 1 cluster. In this case, be smart # and default to that unique choice. - all_cluster_names = global_user_state.get_cluster_names_start_with('') + all_cluster_names = global_user_state.get_cluster_names_start_with("") if len(all_cluster_names) <= 1: clusters = all_cluster_names else: raise click.UsageError( - '`sky start` requires either a cluster name or glob ' - '(see `sky status`), or the -a/--all flag.') + "`sky start` requires either a cluster name or glob " + "(see `sky status`), or the -a/--all flag." + ) if all: if len(clusters) > 0: - click.echo('Both --all and cluster(s) specified for sky start. ' - 'Letting --all take effect.') + click.echo( + "Both --all and cluster(s) specified for sky start. " + "Letting --all take effect." + ) # Get all clusters that are not controllers. clusters = [ - cluster['name'] + cluster["name"] for cluster in global_user_state.get_clusters() - if controller_utils.Controllers.from_name(cluster['name']) is None + if controller_utils.Controllers.from_name(cluster["name"]) is None ] if not clusters: - click.echo('Cluster(s) not found (tip: see `sky status`). Do you ' - 'mean to use `sky launch` to provision a new cluster?') + click.echo( + "Cluster(s) not found (tip: see `sky status`). Do you " + "mean to use `sky launch` to provision a new cluster?" + ) return else: # Get GLOB cluster names clusters = _get_glob_clusters(clusters) for name in clusters: - cluster_status, _ = backend_utils.refresh_cluster_status_handle( - name) + cluster_status, _ = backend_utils.refresh_cluster_status_handle(name) # A cluster may have one of the following states: # # STOPPED - ok to restart @@ -2433,7 +2711,7 @@ def start( # INIT state cluster due to head_ip not being cached). 
# # This can be replicated by adding `exit 1` to Task.setup. - if (not force and cluster_status == status_lib.ClusterStatus.UP): + if not force and cluster_status == status_lib.ClusterStatus.UP: # An UP cluster; skipping 'sky start' because: # 1. For a really up cluster, this has no effects (ray up -y # --no-restart) anyway. @@ -2444,12 +2722,13 @@ def start( # zombied (remains as stopped in the cloud's UI). # # This is dangerous and unwanted behavior! - click.echo(f'Cluster {name} already has status UP.') + click.echo(f"Cluster {name} already has status UP.") continue assert force or cluster_status in ( status_lib.ClusterStatus.INIT, - status_lib.ClusterStatus.STOPPED), cluster_status + status_lib.ClusterStatus.STOPPED, + ), cluster_status to_start.append(name) if not to_start: return @@ -2463,74 +2742,83 @@ def start( normal_clusters.append(name) if controllers and normal_clusters: # Keep this behavior the same as _down_or_stop_clusters(). - raise click.UsageError('Starting controllers with other cluster(s) ' - 'is currently not supported.\n' - 'Please start the former independently.') + raise click.UsageError( + "Starting controllers with other cluster(s) " + "is currently not supported.\n" + "Please start the former independently." + ) if controllers: bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD if len(controllers) != 1: raise click.UsageError( - 'Starting multiple controllers is currently not supported.\n' - 'Please start them independently.') + "Starting multiple controllers is currently not supported.\n" + "Please start them independently." + ) if idle_minutes_to_autostop is not None: raise click.UsageError( - 'Autostop options are currently not allowed when starting the ' - 'controllers. Use the default autostop settings by directly ' - f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}') + "Autostop options are currently not allowed when starting the " + "controllers. Use the default autostop settings by directly " + f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}' + ) if not yes: - cluster_str = 'clusters' if len(to_start) > 1 else 'cluster' - cluster_list = ', '.join(to_start) + cluster_str = "clusters" if len(to_start) > 1 else "cluster" + cluster_list = ", ".join(to_start) click.confirm( - f'Restarting {len(to_start)} {cluster_str}: ' - f'{cluster_list}. Proceed?', + f"Restarting {len(to_start)} {cluster_str}: " f"{cluster_list}. 
Proceed?", default=True, abort=True, - show_default=True) + show_default=True, + ) for name in to_start: try: - core.start(name, - idle_minutes_to_autostop, - retry_until_up, - down=down, - force=force) - except (exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError) as e: + core.start( + name, idle_minutes_to_autostop, retry_until_up, down=down, force=force + ) + except ( + exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError, + ) as e: click.echo(str(e)) else: - click.secho(f'Cluster {name} started.', fg='green') + click.secho(f"Cluster {name} started.", fg="green") @cli.command(cls=_DocumentedCodeCommand) -@click.argument('clusters', - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name)) -@click.option('--all', - '-a', - default=None, - is_flag=True, - help='Tear down all existing clusters.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.argument( + "clusters", + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name), +) +@click.option( + "--all", "-a", default=None, is_flag=True, help="Tear down all existing clusters." +) @click.option( - '--purge', - '-p', + "--yes", + "-y", is_flag=True, default=False, required=False, - help=('(Advanced) Forcefully remove the cluster(s) from ' - 'SkyPilot\'s cluster table, even if the actual cluster termination ' - 'failed on the cloud. WARNING: This flag should only be set sparingly' - ' in certain manual troubleshooting scenarios; with it set, it is the' - ' user\'s responsibility to ensure there are no leaked instances and ' - 'related resources.')) + help="Skip confirmation prompt.", +) +@click.option( + "--purge", + "-p", + is_flag=True, + default=False, + required=False, + help=( + "(Advanced) Forcefully remove the cluster(s) from " + "SkyPilot's cluster table, even if the actual cluster termination " + "failed on the cloud. WARNING: This flag should only be set sparingly" + " in certain manual troubleshooting scenarios; with it set, it is the" + " user's responsibility to ensure there are no leaked instances and " + "related resources." + ), +) @usage_lib.entrypoint def down( clusters: List[str], @@ -2566,11 +2854,9 @@ def down( sky down -a """ - _down_or_stop_clusters(clusters, - apply_to_all=all, - down=True, - no_confirm=yes, - purge=purge) + _down_or_stop_clusters( + clusters, apply_to_all=all, down=True, no_confirm=yes, purge=purge + ) def _hint_or_raise_for_down_jobs_controller(controller_name: str): @@ -2588,43 +2874,43 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str): controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name - with rich_utils.safe_status( - '[bold cyan]Checking for in-progress managed jobs[/]'): + with rich_utils.safe_status("[bold cyan]Checking for in-progress managed jobs[/]"): try: - managed_jobs_ = managed_jobs.queue(refresh=False, - skip_finished=True) + managed_jobs_ = managed_jobs.queue(refresh=False, skip_finished=True) except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - controller.value. 
- decline_down_when_failed_to_fetch_status_hint) + controller.value.decline_down_when_failed_to_fetch_status_hint + ) if e.cluster_status is None: - click.echo( - 'Managed jobs controller has already been torn down.') + click.echo("Managed jobs controller has already been torn down.") sys.exit(0) # At this point, the managed jobs are failed to be fetched due to # the controller being STOPPED or being firstly launched, i.e., # there is no in-prgress managed jobs. managed_jobs_ = [] - msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' - 'jobs controller. Please be aware of the following:' - f'{colorama.Style.RESET_ALL}' - '\n * All logs and status information of the managed ' - 'jobs (output of `sky jobs queue`) will be lost.') + msg = ( + f"{colorama.Fore.YELLOW}WARNING: Tearing down the managed " + "jobs controller. Please be aware of the following:" + f"{colorama.Style.RESET_ALL}" + "\n * All logs and status information of the managed " + "jobs (output of `sky jobs queue`) will be lost." + ) click.echo(msg) if managed_jobs_: job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False) msg = controller.value.decline_down_for_dirty_controller_hint # Add prefix to each line to align with the bullet point. - msg += '\n'.join( - [' ' + line for line in job_table.split('\n') if line != '']) + msg += "\n".join([" " + line for line in job_table.split("\n") if line != ""]) with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError(msg) else: - click.echo(' * No in-progress managed jobs found. It should be safe to ' - 'terminate (see caveats above).') + click.echo( + " * No in-progress managed jobs found. It should be safe to " + "terminate (see caveats above)." + ) def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): @@ -2641,17 +2927,17 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): """ controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name - with rich_utils.safe_status('[bold cyan]Checking for live services[/]'): + with rich_utils.safe_status("[bold cyan]Checking for live services[/]"): try: services = serve_lib.status() except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - controller.value. - decline_down_when_failed_to_fetch_status_hint) + controller.value.decline_down_when_failed_to_fetch_status_hint + ) if e.cluster_status is None: - click.echo('Serve controller has already been torn down.') + click.echo("Serve controller has already been torn down.") sys.exit(0) # At this point, the services are failed to be fetched due to the # controller being STOPPED or being firstly launched, i.e., there is @@ -2659,31 +2945,34 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): services = [] if services: - service_names = [service['name'] for service in services] + service_names = [service["name"] for service in services] with ux_utils.print_exception_no_traceback(): - msg = ( - controller.value.decline_down_for_dirty_controller_hint.format( - service_names=', '.join(service_names))) + msg = controller.value.decline_down_for_dirty_controller_hint.format( + service_names=", ".join(service_names) + ) raise exceptions.NotSupportedError(msg) # Do nothing for STOPPED state, as it is safe to terminate the cluster. 
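+    # In summary: a connection error declines the teardown, a controller that
+    # was never launched exits early with code 0, live services raise
+    # NotSupportedError, and only a STOPPED or idle controller reaches this
+    # point.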
- click.echo(f'Terminate sky serve controller: {controller_name}.') + click.echo(f"Terminate sky serve controller: {controller_name}.") _CONTROLLER_TO_HINT_OR_RAISE = { - controller_utils.Controllers.JOBS_CONTROLLER: - (_hint_or_raise_for_down_jobs_controller), - controller_utils.Controllers.SKY_SERVE_CONTROLLER: - (_hint_or_raise_for_down_sky_serve_controller), + controller_utils.Controllers.JOBS_CONTROLLER: ( + _hint_or_raise_for_down_jobs_controller + ), + controller_utils.Controllers.SKY_SERVE_CONTROLLER: ( + _hint_or_raise_for_down_sky_serve_controller + ), } def _down_or_stop_clusters( - names: List[str], - apply_to_all: Optional[bool], - down: bool, # pylint: disable=redefined-outer-name - no_confirm: bool, - purge: bool = False, - idle_minutes_to_autostop: Optional[int] = None) -> None: + names: List[str], + apply_to_all: Optional[bool], + down: bool, # pylint: disable=redefined-outer-name + no_confirm: bool, + purge: bool = False, + idle_minutes_to_autostop: Optional[int] = None, +) -> None: """Tears down or (auto-)stops a cluster (or all clusters). Controllers (jobs controller and sky serve controller) can only be @@ -2691,40 +2980,43 @@ def _down_or_stop_clusters( via glob). """ if down: - command = 'down' + command = "down" elif idle_minutes_to_autostop is not None: - command = 'autostop' + command = "autostop" else: - command = 'stop' + command = "stop" if not names and apply_to_all is None: # UX: frequently users may have only 1 cluster. In this case, 'sky # stop/down' without args should be smart and default to that unique # choice. - all_cluster_names = global_user_state.get_cluster_names_start_with('') + all_cluster_names = global_user_state.get_cluster_names_start_with("") if len(all_cluster_names) <= 1: names = all_cluster_names else: raise click.UsageError( - f'`sky {command}` requires either a cluster name or glob ' - '(see `sky status`), or the -a/--all flag.') + f"`sky {command}` requires either a cluster name or glob " + "(see `sky status`), or the -a/--all flag." + ) - operation = 'Terminating' if down else 'Stopping' + operation = "Terminating" if down else "Stopping" if idle_minutes_to_autostop is not None: is_cancel = idle_minutes_to_autostop < 0 - verb = 'Cancelling' if is_cancel else 'Scheduling' - option_str = 'down' if down else 'stop' + verb = "Cancelling" if is_cancel else "Scheduling" + option_str = "down" if down else "stop" if is_cancel: - option_str = '{stop,down}' - operation = f'{verb} auto{option_str} on' + option_str = "{stop,down}" + operation = f"{verb} auto{option_str} on" if len(names) > 0: controllers = [ - name for name in names + name + for name in names if controller_utils.Controllers.from_name(name) is not None ] - controllers_str = ', '.join(map(repr, controllers)) + controllers_str = ", ".join(map(repr, controllers)) names = [ - name for name in _get_glob_clusters(names) + name + for name in _get_glob_clusters(names) if controller_utils.Controllers.from_name(name) is None ] @@ -2732,25 +3024,27 @@ def _down_or_stop_clusters( # normal clusters. if controllers: if len(names) != 0: - names_str = ', '.join(map(repr, names)) + names_str = ", ".join(map(repr, names)) raise click.UsageError( - f'{operation} controller(s) ' - f'{controllers_str} with other cluster(s) ' - f'{names_str} is currently not supported.\n' - f'Please omit the controller(s) {controllers}.') + f"{operation} controller(s) " + f"{controllers_str} with other cluster(s) " + f"{names_str} is currently not supported.\n" + f"Please omit the controller(s) {controllers}." 
+ ) if len(controllers) > 1: raise click.UsageError( - f'{operation} multiple controllers ' - f'{controllers_str} is currently not supported.\n' - f'Please specify only one controller.') + f"{operation} multiple controllers " + f"{controllers_str} is currently not supported.\n" + f"Please specify only one controller." + ) controller_name = controllers[0] if not down: raise click.UsageError( - f'{operation} controller(s) ' - f'{controllers_str} is currently not supported.') + f"{operation} controller(s) " + f"{controllers_str} is currently not supported." + ) else: - controller = controller_utils.Controllers.from_name( - controller_name) + controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] try: @@ -2762,21 +3056,30 @@ def _down_or_stop_clusters( # managed job or service. We should make this check atomic # with the termination. hint_or_raise(controller_name) - except (exceptions.ClusterOwnerIdentityMismatchError, - RuntimeError) as e: + except ( + exceptions.ClusterOwnerIdentityMismatchError, + RuntimeError, + ) as e: if purge: click.echo(common_utils.format_exception(e)) else: raise - confirm_str = 'delete' - input_prefix = ('Since --purge is set, errors will be ignored ' - 'and controller will be removed from ' - 'local state.\n') if purge else '' + confirm_str = "delete" + input_prefix = ( + ( + "Since --purge is set, errors will be ignored " + "and controller will be removed from " + "local state.\n" + ) + if purge + else "" + ) user_input = click.prompt( - f'{input_prefix}' - f'To proceed, please type {colorama.Style.BRIGHT}' - f'{confirm_str!r}{colorama.Style.RESET_ALL}', - type=str) + f"{input_prefix}" + f"To proceed, please type {colorama.Style.BRIGHT}" + f"{confirm_str!r}{colorama.Style.RESET_ALL}", + type=str, + ) if user_input != confirm_str: raise click.Abort() no_confirm = True @@ -2786,14 +3089,15 @@ def _down_or_stop_clusters( all_clusters = global_user_state.get_clusters() if len(names) > 0: click.echo( - f'Both --all and cluster(s) specified for `sky {command}`. ' - 'Letting --all take effect.') + f"Both --all and cluster(s) specified for `sky {command}`. " + "Letting --all take effect." + ) # We should not remove controllers when --all is specified. # Otherwise, it would be very easy to accidentally delete a controller. names = [ - record['name'] + record["name"] for record in all_clusters - if controller_utils.Controllers.from_name(record['name']) is None + if controller_utils.Controllers.from_name(record["name"]) is None ] clusters = [] @@ -2808,51 +3112,54 @@ def _down_or_stop_clusters( usage_lib.record_cluster_name_for_current_operation(clusters) if not clusters: - click.echo('Cluster(s) not found (tip: see `sky status`).') + click.echo("Cluster(s) not found (tip: see `sky status`).") return if not no_confirm and len(clusters) > 0: - cluster_str = 'clusters' if len(clusters) > 1 else 'cluster' - cluster_list = ', '.join(clusters) + cluster_str = "clusters" if len(clusters) > 1 else "cluster" + cluster_list = ", ".join(clusters) click.confirm( - f'{operation} {len(clusters)} {cluster_str}: ' - f'{cluster_list}. Proceed?', + f"{operation} {len(clusters)} {cluster_str}: " f"{cluster_list}. 
Proceed?", default=True, abort=True, - show_default=True) + show_default=True, + ) - plural = 's' if len(clusters) > 1 else '' - progress = rich_progress.Progress(transient=True, - redirect_stdout=False, - redirect_stderr=False) + plural = "s" if len(clusters) > 1 else "" + progress = rich_progress.Progress( + transient=True, redirect_stdout=False, redirect_stderr=False + ) task = progress.add_task( - f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]', - total=len(clusters)) + f"[bold cyan]{operation} {len(clusters)} cluster{plural}[/]", + total=len(clusters), + ) def _down_or_stop(name: str): success_progress = False if idle_minutes_to_autostop is not None: try: core.autostop(name, idle_minutes_to_autostop, down) - except (exceptions.NotSupportedError, - exceptions.ClusterNotUpError) as e: + except (exceptions.NotSupportedError, exceptions.ClusterNotUpError) as e: message = str(e) else: # no exception raised success_progress = True - message = (f'{colorama.Fore.GREEN}{operation} ' - f'cluster {name!r}...done{colorama.Style.RESET_ALL}') + message = ( + f"{colorama.Fore.GREEN}{operation} " + f"cluster {name!r}...done{colorama.Style.RESET_ALL}" + ) if idle_minutes_to_autostop >= 0: - option_str = 'down' if down else 'stop' - passive_str = 'downed' if down else 'stopped' - plural = 's' if idle_minutes_to_autostop != 1 else '' + option_str = "down" if down else "stop" + passive_str = "downed" if down else "stopped" + plural = "s" if idle_minutes_to_autostop != 1 else "" message += ( - f'\n The cluster will be auto{passive_str} after ' - f'{idle_minutes_to_autostop} minute{plural} of ' - 'idleness.' - f'\n To cancel the auto{option_str}, run: ' - f'{colorama.Style.BRIGHT}' - f'sky autostop {name} --cancel' - f'{colorama.Style.RESET_ALL}') + f"\n The cluster will be auto{passive_str} after " + f"{idle_minutes_to_autostop} minute{plural} of " + "idleness." + f"\n To cancel the auto{option_str}, run: " + f"{colorama.Style.BRIGHT}" + f"sky autostop {name} --cancel" + f"{colorama.Style.RESET_ALL}" + ) else: try: if down: @@ -2861,20 +3168,26 @@ def _down_or_stop(name: str): core.stop(name, purge=purge) except RuntimeError as e: message = ( - f'{colorama.Fore.RED}{operation} cluster {name}...failed. ' - f'{colorama.Style.RESET_ALL}' - f'\nReason: {common_utils.format_exception(e)}.') - except (exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError) as e: + f"{colorama.Fore.RED}{operation} cluster {name}...failed. " + f"{colorama.Style.RESET_ALL}" + f"\nReason: {common_utils.format_exception(e)}." + ) + except ( + exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError, + ) as e: message = str(e) else: # no exception raised message = ( - f'{colorama.Fore.GREEN}{operation} cluster {name}...done.' - f'{colorama.Style.RESET_ALL}') + f"{colorama.Fore.GREEN}{operation} cluster {name}...done." 
+ f"{colorama.Style.RESET_ALL}" + ) if not down: - message += ('\n To restart the cluster, run: ' - f'{colorama.Style.BRIGHT}sky start {name}' - f'{colorama.Style.RESET_ALL}') + message += ( + "\n To restart the cluster, run: " + f"{colorama.Style.BRIGHT}sky start {name}" + f"{colorama.Style.RESET_ALL}" + ) success_progress = True progress.stop() @@ -2891,12 +3204,14 @@ def _down_or_stop(name: str): @cli.command(cls=_DocumentedCodeCommand) -@click.argument('clouds', required=False, type=str, nargs=-1) -@click.option('--verbose', - '-v', - is_flag=True, - default=False, - help='Show the activated account for each cloud.') +@click.argument("clouds", required=False, type=str, nargs=-1) +@click.option( + "--verbose", + "-v", + is_flag=True, + default=False, + help="Show the activated account for each cloud.", +) @usage_lib.entrypoint def check(clouds: Tuple[str], verbose: bool): """Check which clouds are available to use. @@ -2925,38 +3240,37 @@ def check(clouds: Tuple[str], verbose: bool): @cli.command() -@click.argument('accelerator_str', required=False) -@click.option('--all', - '-a', - is_flag=True, - default=False, - help='Show details of all GPU/TPU/accelerator offerings.') -@click.option('--cloud', - default=None, - type=str, - help='Cloud provider to query.') +@click.argument("accelerator_str", required=False) @click.option( - '--region', + "--all", + "-a", + is_flag=True, + default=False, + help="Show details of all GPU/TPU/accelerator offerings.", +) +@click.option("--cloud", default=None, type=str, help="Cloud provider to query.") +@click.option( + "--region", required=False, type=str, - help= - ('The region to use. If not specified, shows accelerators from all regions.' - ), + help=("The region to use. If not specified, shows accelerators from all regions."), ) @click.option( - '--all-regions', + "--all-regions", is_flag=True, default=False, - help='Show pricing and instance details for a specified accelerator across ' - 'all regions and clouds.') + help="Show pricing and instance details for a specified accelerator across " + "all regions and clouds.", +) @service_catalog.fallback_to_default_catalog @usage_lib.entrypoint def show_gpus( - accelerator_str: Optional[str], - all: bool, # pylint: disable=redefined-builtin - cloud: Optional[str], - region: Optional[str], - all_regions: Optional[bool]): + accelerator_str: Optional[str], + all: bool, # pylint: disable=redefined-builtin + cloud: Optional[str], + region: Optional[str], + all_regions: Optional[bool], +): """Show supported GPU/TPU/accelerators and their prices. The names and counts shown can be set in the ``accelerators`` field in task @@ -3002,102 +3316,110 @@ def show_gpus( # validation for the --region flag if region is not None and cloud is None: raise click.UsageError( - 'The --region flag is only valid when the --cloud flag is set.') + "The --region flag is only valid when the --cloud flag is set." + ) # validation for the --all-regions flag if all_regions and accelerator_str is None: raise click.UsageError( - 'The --all-regions flag is only valid when an accelerator ' - 'is specified.') + "The --all-regions flag is only valid when an accelerator " "is specified." + ) if all_regions and region is not None: raise click.UsageError( - '--all-regions and --region flags cannot be used simultaneously.') + "--all-regions and --region flags cannot be used simultaneously." + ) # This will validate 'cloud' and raise if not found. 
cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud) service_catalog.validate_region_zone(region, None, clouds=cloud) show_all = all if show_all and accelerator_str is not None: - raise click.UsageError('--all is only allowed without a GPU name.') + raise click.UsageError("--all is only allowed without a GPU name.") # Kubernetes specific bools cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None kubernetes_is_enabled = sky_clouds.cloud_in_iterable( - sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) + sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds() + ) if cloud_is_kubernetes and region is not None: raise click.UsageError( - 'The --region flag cannot be set with --cloud kubernetes.') + "The --region flag cannot be set with --cloud kubernetes." + ) def _list_to_str(lst): - return ', '.join([str(e) for e in lst]) + return ", ".join([str(e) for e in lst]) def _get_kubernetes_realtime_gpu_table( - name_filter: Optional[str] = None, - quantity_filter: Optional[int] = None): + name_filter: Optional[str] = None, quantity_filter: Optional[int] = None + ): if quantity_filter: - qty_header = 'QTY_FILTER' - free_header = 'FILTERED_FREE_GPUS' + qty_header = "QTY_FILTER" + free_header = "FILTERED_FREE_GPUS" else: - qty_header = 'QTY_PER_NODE' - free_header = 'TOTAL_FREE_GPUS' + qty_header = "QTY_PER_NODE" + free_header = "TOTAL_FREE_GPUS" realtime_gpu_table = log_utils.create_table( - ['GPU', qty_header, 'TOTAL_GPUS', free_header]) + ["GPU", qty_header, "TOTAL_GPUS", free_header] + ) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, - clouds='kubernetes', + clouds="kubernetes", name_filter=name_filter, region_filter=region, quantity_filter=quantity_filter, - case_sensitive=False) - assert (set(counts.keys()) == set(capacity.keys()) == set( - available.keys())), (f'Keys of counts ({list(counts.keys())}), ' - f'capacity ({list(capacity.keys())}), ' - f'and available ({list(available.keys())}) ' - 'must be same.') + case_sensitive=False, + ) + assert set(counts.keys()) == set(capacity.keys()) == set(available.keys()), ( + f"Keys of counts ({list(counts.keys())}), " + f"capacity ({list(capacity.keys())}), " + f"and available ({list(available.keys())}) " + "must be same." + ) if len(counts) == 0: - err_msg = 'No GPUs found in Kubernetes cluster. ' - debug_msg = 'To further debug, run: sky check ' + err_msg = "No GPUs found in Kubernetes cluster. " + debug_msg = "To further debug, run: sky check " if name_filter is not None: - gpu_info_msg = f' {name_filter!r}' + gpu_info_msg = f" {name_filter!r}" if quantity_filter is not None: - gpu_info_msg += (' with requested quantity' - f' {quantity_filter}') - err_msg = (f'Resources{gpu_info_msg} not found ' - 'in Kubernetes cluster. ') - debug_msg = ('To show available accelerators on kubernetes,' - ' run: sky show-gpus --cloud kubernetes ') - full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + - debug_msg) + gpu_info_msg += " with requested quantity" f" {quantity_filter}" + err_msg = ( + f"Resources{gpu_info_msg} not found " "in Kubernetes cluster. 
" + ) + debug_msg = ( + "To show available accelerators on kubernetes," + " run: sky show-gpus --cloud kubernetes " + ) + full_err_msg = err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): - realtime_gpu_table.add_row([ - gpu, - _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] - ]) + realtime_gpu_table.add_row( + [gpu, _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]] + ) return realtime_gpu_table def _get_kubernetes_node_info_table(): node_table = log_utils.create_table( - ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS']) + ["NODE_NAME", "GPU_NAME", "TOTAL_GPUS", "FREE_GPUS"] + ) node_info_dict = kubernetes_utils.get_kubernetes_node_info() for node_name, node_info in node_info_dict.items(): - node_table.add_row([ - node_name, node_info.gpu_type, - node_info.total['nvidia.com/gpu'], - node_info.free['nvidia.com/gpu'] - ]) + node_table.add_row( + [ + node_name, + node_info.gpu_type, + node_info.total["nvidia.com/gpu"], + node_info.free["nvidia.com/gpu"], + ] + ) return node_table def _output(): - gpu_table = log_utils.create_table( - ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) - tpu_table = log_utils.create_table( - ['GOOGLE_TPU', 'AVAILABLE_QUANTITIES']) - other_table = log_utils.create_table( - ['OTHER_GPU', 'AVAILABLE_QUANTITIES']) + gpu_table = log_utils.create_table(["COMMON_GPU", "AVAILABLE_QUANTITIES"]) + tpu_table = log_utils.create_table(["GOOGLE_TPU", "AVAILABLE_QUANTITIES"]) + other_table = log_utils.create_table(["OTHER_GPU", "AVAILABLE_QUANTITIES"]) name, quantity = None, None @@ -3107,10 +3429,10 @@ def _output(): clouds_to_list = cloud if cloud is None: clouds_to_list = [ - c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes' + c for c in service_catalog.ALL_CLOUDS if c != "kubernetes" ] - k8s_messages = '' + k8s_messages = "" if accelerator_str is None: # Collect k8s related messages in k8s_messages and print them at end print_section_titles = False @@ -3124,27 +3446,32 @@ def _output(): except ValueError as e: if not cloud_is_kubernetes: # Make it a note if cloud is not kubernetes - k8s_messages += 'Note: ' + k8s_messages += "Note: " k8s_messages += str(e) else: print_section_titles = True - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') + yield ( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n" + ) yield from k8s_realtime_table.get_string() k8s_node_table = _get_kubernetes_node_info_table() - yield '\n\n' - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes per node GPU availability' - f'{colorama.Style.RESET_ALL}\n') + yield "\n\n" + yield ( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Kubernetes per node GPU availability" + f"{colorama.Style.RESET_ALL}\n" + ) yield from k8s_node_table.get_string() if kubernetes_autoscaling: - k8s_messages += ( - '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) + k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: - yield ('Kubernetes is not enabled. To fix, run: ' - 'sky check kubernetes ') + yield ( + "Kubernetes is not enabled. To fix, run: " + "sky check kubernetes " + ) yield k8s_messages return @@ -3152,7 +3479,7 @@ def _output(): # long and the user may not scroll to the end. 
         if show_all and k8s_messages:
             yield k8s_messages
-            yield '\n\n'
+            yield "\n\n"

         result = service_catalog.list_accelerator_counts(
             gpus_only=True,
@@ -3162,9 +3489,11 @@ def _output():

         if print_section_titles:
             # If section titles were printed above, print again here
-            yield '\n\n'
-            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
+            yield "\n\n"
+            yield (
+                f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}"
+                f"Cloud GPUs{colorama.Style.RESET_ALL}\n"
+            )

         # "Common" GPUs
         for gpu in service_catalog.get_common_gpus():
@@ -3177,89 +3506,101 @@ def _output():
             if tpu in result:
                 tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
         if len(tpu_table.get_string()) > 0:
-            yield '\n\n'
+            yield "\n\n"
             yield from tpu_table.get_string()

         # Handle Other GPUs
         if show_all or cloud is not None:
-            yield '\n\n'
+            yield "\n\n"
             for gpu, qty in sorted(result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
-            yield '\n\n'
+            yield "\n\n"

         # Handle hints and messages
-        if not show_all and cloud is None:
-            yield ('\n\nHint: use -a/--all to see all accelerators '
-                   '(including non-common ones) and pricing.')
-
-        # Handle k8 messages if present
-        if k8s_messages:
-            yield '\n'
-            yield k8s_messages
-
-        # Return if we're not showing all or if a specific cloud was queried
-        if not show_all or cloud is not None:
-            return
+        if not show_all:
+            if cloud is None:
+                yield (
+                    "\n\nHint: use -a/--all to see all accelerators "
+                    "(including non-common ones) and pricing."
+                )
+                # Handle k8s messages if present
+                if k8s_messages:
+                    yield "\n"
+                    yield k8s_messages
+                return
+            else:
+                # A specific cloud was queried: show a brief hint and stop.
+                yield ("Hint: use -a/--all to see all accelerators " "and pricing.")
+                return
     else:
         # Parse accelerator string
-        accelerator_split = accelerator_str.split(':')
+        accelerator_split = accelerator_str.split(":")
         if len(accelerator_split) > 2:
             raise click.UsageError(
-                f'Invalid accelerator string {accelerator_str}. '
-                'Expected format: <name>[:<quantity>].')
+                f"Invalid accelerator string {accelerator_str}. "
+                "Expected format: <name>[:<quantity>]."
+            )
         if len(accelerator_split) == 2:
             name = accelerator_split[0]
             # Check if quantity is valid
             try:
                 quantity = int(accelerator_split[1])
                 if quantity <= 0:
-                    raise ValueError(
-                        'Quantity cannot be non-positive integer.')
+                    raise ValueError("Quantity must be a positive integer.")
             except ValueError as invalid_quantity:
                 raise click.UsageError(
-                    f'Invalid accelerator quantity {accelerator_split[1]}. '
-                    'Expected a positive integer.') from invalid_quantity
+                    f"Invalid accelerator quantity {accelerator_split[1]}. "
+                    "Expected a positive integer."
+ ) from invalid_quantity else: name, quantity = accelerator_str, None print_section_titles = False - if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and - not show_all): + if ( + kubernetes_is_enabled + and (cloud is None or cloud_is_kubernetes) + and not show_all + ): # Print section title if not showing all and instead a specific # accelerator is requested print_section_titles = True - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') + yield ( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n" + ) # TODO(romilb): Show filtered per node GPU availability here as well try: k8s_realtime_table = _get_kubernetes_realtime_gpu_table( - name_filter=name, quantity_filter=quantity) + name_filter=name, quantity_filter=quantity + ) yield from k8s_realtime_table.get_string() except ValueError as e: # In the case of a specific accelerator, show the error message # immediately (e.g., "Resources H100 not found ...") yield str(e) if kubernetes_autoscaling: - k8s_messages += ('\n' + - kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) + k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE yield k8s_messages if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: - yield ('Kubernetes is not enabled. To fix, run: ' - 'sky check kubernetes ') + yield ( + "Kubernetes is not enabled. To fix, run: " "sky check kubernetes " + ) return # For clouds other than Kubernetes, get the accelerator details # Case-sensitive - result = service_catalog.list_accelerators(gpus_only=True, - name_filter=name, - quantity_filter=quantity, - region_filter=region, - clouds=clouds_to_list, - case_sensitive=False, - all_regions=all_regions) + result = service_catalog.list_accelerators( + gpus_only=True, + name_filter=name, + quantity_filter=quantity, + region_filter=region, + clouds=clouds_to_list, + case_sensitive=False, + all_regions=all_regions, + ) # Import here to save module load speed. # pylint: disable=import-outside-toplevel,line-too-long from sky.clouds.service_catalog import common @@ -3272,73 +3613,79 @@ def _output(): for i, (gpu, items) in enumerate(result.items()): df = pd.DataFrame([t._asdict() for t in items]) # Determine the minimum prices for each cloud. - min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'), - min_spot_price=('spot_price', - 'min')) - df = df.merge(min_price_df, on='cloud') + min_price_df = df.groupby("cloud").agg( + min_price=("price", "min"), min_spot_price=("spot_price", "min") + ) + df = df.merge(min_price_df, on="cloud") # Sort within each cloud by price. - df = df.groupby('cloud', group_keys=False).apply( - lambda x: x.sort_values(by=['price', 'spot_price'])) + df = df.groupby("cloud", group_keys=False).apply( + lambda x: x.sort_values(by=["price", "spot_price"]) + ) # Sort across groups (clouds). 
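+            # Illustrative, with hypothetical prices: if GCP's cheapest V100
+            # offering is $2.48/hr and AWS's cheapest is $3.06/hr, every GCP
+            # row gets min_price=2.48 and every AWS row 3.06, so the
+            # cross-group sort below lists all GCP rows before any AWS row.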
- df = df.sort_values(by=['min_price', 'min_spot_price']) - df = df.drop(columns=['min_price', 'min_spot_price']) + df = df.sort_values(by=["min_price", "min_spot_price"]) + df = df.drop(columns=["min_price", "min_spot_price"]) sorted_dataclasses = [ - common.InstanceTypeInfo(*row) - for row in df.to_records(index=False) + common.InstanceTypeInfo(*row) for row in df.to_records(index=False) ] new_result[gpu] = sorted_dataclasses result = new_result if print_section_titles and not show_all: - yield '\n\n' - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Cloud GPUs{colorama.Style.RESET_ALL}\n') + yield "\n\n" + yield ( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Cloud GPUs{colorama.Style.RESET_ALL}\n" + ) if len(result) == 0: - quantity_str = (f' with requested quantity {quantity}' - if quantity else '') - cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' - yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' - yield 'To show available accelerators, run: sky show-gpus --all' + quantity_str = f" with requested quantity {quantity}" if quantity else "" + cloud_str = f" on {cloud_obj}." if cloud else " in cloud catalogs." + yield f"Resources '{name}'{quantity_str} not found{cloud_str} " + yield "To show available accelerators, run: sky show-gpus --all" return for i, (gpu, items) in enumerate(result.items()): accelerator_table_headers = [ - 'GPU', - 'QTY', - 'CLOUD', - 'INSTANCE_TYPE', - 'DEVICE_MEM', - 'vCPUs', - 'HOST_MEM', - 'HOURLY_PRICE', - 'HOURLY_SPOT_PRICE', + "GPU", + "QTY", + "CLOUD", + "INSTANCE_TYPE", + "DEVICE_MEM", + "vCPUs", + "HOST_MEM", + "HOURLY_PRICE", + "HOURLY_SPOT_PRICE", ] if not show_all: - accelerator_table_headers.append('REGION') - accelerator_table = log_utils.create_table( - accelerator_table_headers) + accelerator_table_headers.append("REGION") + accelerator_table = log_utils.create_table(accelerator_table_headers) for item in items: - instance_type_str = item.instance_type if not pd.isna( - item.instance_type) else '(attachable)' + instance_type_str = ( + item.instance_type + if not pd.isna(item.instance_type) + else "(attachable)" + ) cpu_count = item.cpu_count - if not pd.isna(cpu_count) and isinstance( - cpu_count, (float, int)): + if not pd.isna(cpu_count) and isinstance(cpu_count, (float, int)): if int(cpu_count) == cpu_count: cpu_str = str(int(cpu_count)) else: - cpu_str = f'{cpu_count:.1f}' + cpu_str = f"{cpu_count:.1f}" else: - cpu_str = '-' - device_memory_str = (f'{item.device_memory:.0f}GB' if - not pd.isna(item.device_memory) else '-') - host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( - item.memory) else '-' - price_str = f'$ {item.price:.3f}' if not pd.isna( - item.price) else '-' - spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( - item.spot_price) else '-' - region_str = item.region if not pd.isna(item.region) else '-' + cpu_str = "-" + device_memory_str = ( + f"{item.device_memory:.0f}GB" + if not pd.isna(item.device_memory) + else "-" + ) + host_memory_str = ( + f"{item.memory:.0f}GB" if not pd.isna(item.memory) else "-" + ) + price_str = f"$ {item.price:.3f}" if not pd.isna(item.price) else "-" + spot_price_str = ( + f"$ {item.spot_price:.3f}" if not pd.isna(item.spot_price) else "-" + ) + region_str = item.region if not pd.isna(item.region) else "-" accelerator_table_vals = [ item.accelerator_name, item.accelerator_count, @@ -3355,7 +3702,7 @@ def _output(): accelerator_table.add_row(accelerator_table_vals) if i != 0: - yield '\n\n' + yield "\n\n" yield from 
accelerator_table.get_string() if show_all: @@ -3372,13 +3719,15 @@ def storage(): pass -@storage.command('ls', cls=_DocumentedCodeCommand) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all information in full.') +@storage.command("ls", cls=_DocumentedCodeCommand) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all information in full.", +) @usage_lib.entrypoint # pylint: disable=redefined-builtin def storage_ls(all: bool): @@ -3388,26 +3737,34 @@ def storage_ls(all: bool): click.echo(storage_table) -@storage.command('delete', cls=_DocumentedCodeCommand) -@click.argument('names', - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_storage_name)) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Delete all storage objects.') -@click.option('--yes', - '-y', - default=False, - is_flag=True, - required=False, - help='Skip confirmation prompt.') +@storage.command("delete", cls=_DocumentedCodeCommand) +@click.argument( + "names", + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_storage_name), +) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Delete all storage objects.", +) +@click.option( + "--yes", + "-y", + default=False, + is_flag=True, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint -def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin +def storage_delete( + names: List[str], all: bool, yes: bool +): # pylint: disable=redefined-builtin """Delete storage objects. Examples: @@ -3424,25 +3781,25 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r sky storage delete -a """ if sum([len(names) > 0, all]) != 1: - raise click.UsageError('Either --all or a name must be specified.') + raise click.UsageError("Either --all or a name must be specified.") if all: storages = sky.storage_ls() if not storages: - click.echo('No storage(s) to delete.') + click.echo("No storage(s) to delete.") return - names = [s['name'] for s in storages] + names = [s["name"] for s in storages] else: names = _get_glob_storages(names) if names: if not yes: - storage_names = ', '.join(names) - storage_str = 'storages' if len(names) > 1 else 'storage' + storage_names = ", ".join(names) + storage_str = "storages" if len(names) > 1 else "storage" click.confirm( - f'Deleting {len(names)} {storage_str}: ' - f'{storage_names}. Proceed?', + f"Deleting {len(names)} {storage_str}: " f"{storage_names}. Proceed?", default=True, abort=True, - show_default=True) + show_default=True, + ) subprocess_utils.run_in_parallel(sky.storage_delete, names) @@ -3459,49 +3816,62 @@ def jobs(): pass -@jobs.command('launch', cls=_DocumentedCodeCommand) -@click.argument('entrypoint', - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) +@jobs.command("launch", cls=_DocumentedCodeCommand) +@click.argument( + "entrypoint", + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) # TODO(zhwu): Add --dryrun option to test the launch command. 
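+# A minimal sketch of the idiom behind _add_click_options, assuming the helper
+# defined earlier in this file follows the usual click pattern of folding a
+# shared option list onto a command:
+#
+#     def _add_click_options(options):
+#         def _add_options(func):
+#             for option in reversed(options):
+#                 func = option(func)
+#             return func
+#         return _add_options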
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) -@click.option('--cluster', - '-c', - default=None, - type=str, - hidden=True, - help=('Alias for --name, the name of the spot job.')) -@click.option('--job-recovery', - default=None, - type=str, - help='Recovery strategy to use for managed jobs.') @click.option( - '--detach-run', - '-d', + "--cluster", + "-c", + default=None, + type=str, + hidden=True, + help=("Alias for --name, the name of the spot job."), +) +@click.option( + "--job-recovery", + default=None, + type=str, + help="Recovery strategy to use for managed jobs.", +) +@click.option( + "--detach-run", + "-d", default=False, is_flag=True, - help=('If True, as soon as a job is submitted, return from this call ' - 'and do not stream execution logs.')) + help=( + "If True, as soon as a job is submitted, return from this call " + "and do not stream execution logs." + ), +) @click.option( - '--retry-until-up/--no-retry-until-up', - '-r/-no-r', + "--retry-until-up/--no-retry-until-up", + "-r/-no-r", default=None, is_flag=True, required=False, help=( - '(Default: True; this flag is deprecated and will be removed in a ' - 'future release.) Whether to retry provisioning infinitely until the ' - 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes - 'applies to launching all managed jobs (both the initial and ' - 'any recovery attempts), not the jobs controller.')) -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') + "(Default: True; this flag is deprecated and will be removed in a " + "future release.) Whether to retry provisioning infinitely until the " + "cluster is up, if unavailability errors are encountered. This " # pylint: disable=bad-docstring-quotes + "applies to launching all managed jobs (both the initial and " + "any recovery attempts), not the jobs controller." + ), +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3545,8 +3915,10 @@ def jobs_launch( """ if cluster is not None: if name is not None and name != cluster: - raise click.UsageError('Cannot specify both --name and --cluster. ' - 'Use one of the flags as they are alias.') + raise click.UsageError( + "Cannot specify both --name and --cluster. " + "Use one of the flags as they are alias." + ) name = cluster env = _merge_env_vars(env_file, env) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -3572,14 +3944,15 @@ def jobs_launch( # Deprecation. We set the default behavior to be retry until up, and the # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0. if retry_until_up is not None: - flag_str = '--retry-until-up' + flag_str = "--retry-until-up" if not retry_until_up: - flag_str = '--no-retry-until-up' + flag_str = "--no-retry-until-up" click.secho( - f'Flag {flag_str} is deprecated and will be removed in a ' - 'future release (managed jobs will always be retried). ' - 'Please file an issue if this does not work for you.', - fg='yellow') + f"Flag {flag_str} is deprecated and will be removed in a " + "future release (managed jobs will always be retried). 
" + "Please file an issue if this does not work for you.", + fg="yellow", + ) else: retry_until_up = True @@ -3597,44 +3970,46 @@ def jobs_launch( dag_utils.maybe_infer_and_fill_dag_and_task_names(dag) dag_utils.fill_default_config_in_dag_for_job_launch(dag) - click.secho(f'Managed job {dag.name!r} will be launched on (estimated):', - fg='yellow') + click.secho( + f"Managed job {dag.name!r} will be launched on (estimated):", fg="yellow" + ) dag = sky.optimize(dag) if not yes: - prompt = f'Launching a managed job {dag.name!r}. Proceed?' + prompt = f"Launching a managed job {dag.name!r}. Proceed?" if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) common_utils.check_cluster_name_is_valid(name) - managed_jobs.launch(dag, - name, - detach_run=detach_run, - retry_until_up=retry_until_up) + managed_jobs.launch(dag, name, detach_run=detach_run, retry_until_up=retry_until_up) -@jobs.command('queue', cls=_DocumentedCodeCommand) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all information in full.') +@jobs.command("queue", cls=_DocumentedCodeCommand) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all information in full.", +) +@click.option( + "--refresh", + "-r", + default=False, + is_flag=True, + required=False, + help="Query the latest statuses, restarting the jobs controller if stopped.", +) @click.option( - '--refresh', - '-r', + "--skip-finished", + "-s", default=False, is_flag=True, required=False, - help='Query the latest statuses, restarting the jobs controller if stopped.' + help="Show only pending/running jobs' information.", ) -@click.option('--skip-finished', - '-s', - default=False, - is_flag=True, - required=False, - help='Show only pending/running jobs\' information.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_queue(all: bool, refresh: bool, skip_finished: bool): @@ -3691,40 +4066,46 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool): watch -n60 sky jobs queue """ - click.secho('Fetching managed job statuses...', fg='yellow') - with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): - _, msg = _get_managed_jobs(refresh=refresh, - skip_finished=skip_finished, - show_all=all, - is_called_by_user=True) + click.secho("Fetching managed job statuses...", fg="yellow") + with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): + _, msg = _get_managed_jobs( + refresh=refresh, + skip_finished=skip_finished, + show_all=all, + is_called_by_user=True, + ) if not skip_finished: - in_progress_only_hint = '' + in_progress_only_hint = "" else: - in_progress_only_hint = ' (showing in-progress jobs only)' - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Managed jobs{colorama.Style.RESET_ALL}' - f'{in_progress_only_hint}\n{msg}') - - -@jobs.command('cancel', cls=_DocumentedCodeCommand) -@click.option('--name', - '-n', - required=False, - type=str, - help='Managed job name to cancel.') -@click.argument('job_ids', default=None, type=int, required=False, nargs=-1) -@click.option('--all', - '-a', - is_flag=True, - default=False, - required=False, - help='Cancel all managed jobs.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') + in_progress_only_hint = " (showing in-progress jobs only)" + click.echo( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Managed jobs{colorama.Style.RESET_ALL}" + f"{in_progress_only_hint}\n{msg}" + ) 
+ + +@jobs.command("cancel", cls=_DocumentedCodeCommand) +@click.option( + "--name", "-n", required=False, type=str, help="Managed job name to cancel." +) +@click.argument("job_ids", default=None, type=int, required=False, nargs=-1) +@click.option( + "--all", + "-a", + is_flag=True, + default=False, + required=False, + help="Cancel all managed jobs.", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): @@ -3745,73 +4126,83 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): """ backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, - stopped_message='All managed jobs should have finished.', - exit_if_not_accessible=True) + stopped_message="All managed jobs should have finished.", + exit_if_not_accessible=True, + ) - job_id_str = ','.join(map(str, job_ids)) + job_id_str = ",".join(map(str, job_ids)) if sum([len(job_ids) > 0, name is not None, all]) != 1: - argument_str = f'--job-ids {job_id_str}' if len(job_ids) > 0 else '' - argument_str += f' --name {name}' if name is not None else '' - argument_str += ' --all' if all else '' + argument_str = f"--job-ids {job_id_str}" if len(job_ids) > 0 else "" + argument_str += f" --name {name}" if name is not None else "" + argument_str += " --all" if all else "" raise click.UsageError( - 'Can only specify one of JOB_IDS or --name or --all. ' - f'Provided {argument_str!r}.') + "Can only specify one of JOB_IDS or --name or --all. " + f"Provided {argument_str!r}." + ) if not yes: - job_identity_str = (f'managed jobs with IDs {job_id_str}' - if job_ids else repr(name)) + job_identity_str = ( + f"managed jobs with IDs {job_id_str}" if job_ids else repr(name) + ) if all: - job_identity_str = 'all managed jobs' - click.confirm(f'Cancelling {job_identity_str}. Proceed?', - default=True, - abort=True, - show_default=True) + job_identity_str = "all managed jobs" + click.confirm( + f"Cancelling {job_identity_str}. Proceed?", + default=True, + abort=True, + show_default=True, + ) managed_jobs.cancel(job_ids=job_ids, name=name, all=all) -@jobs.command('logs', cls=_DocumentedCodeCommand) -@click.option('--name', - '-n', - required=False, - type=str, - help='Managed job name.') +@jobs.command("logs", cls=_DocumentedCodeCommand) +@click.option("--name", "-n", required=False, type=str, help="Managed job name.") @click.option( - '--follow/--no-follow', + "--follow/--no-follow", is_flag=True, default=True, - help=('Follow the logs of the job. [default: --follow] ' - 'If --no-follow is specified, print the log so far and exit.')) + help=( + "Follow the logs of the job. [default: --follow] " + "If --no-follow is specified, print the log so far and exit." + ), +) @click.option( - '--controller', + "--controller", is_flag=True, default=False, - help=('Show the controller logs of this job; useful for debugging ' - 'launching/recoveries, etc.')) -@click.argument('job_id', required=False, type=int) + help=( + "Show the controller logs of this job; useful for debugging " + "launching/recoveries, etc." 
+ ), +) +@click.argument("job_id", required=False, type=int) @usage_lib.entrypoint -def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool, - controller: bool): +def jobs_logs( + name: Optional[str], job_id: Optional[int], follow: bool, controller: bool +): """Tail the log of a managed job.""" try: - managed_jobs.tail_logs(name=name, - job_id=job_id, - follow=follow, - controller=controller) + managed_jobs.tail_logs( + name=name, job_id=job_id, follow=follow, controller=controller + ) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise -@jobs.command('dashboard', cls=_DocumentedCodeCommand) +@jobs.command("dashboard", cls=_DocumentedCodeCommand) @click.option( - '--port', - '-p', + "--port", + "-p", default=None, type=int, required=False, - help=('Local port to use for the dashboard. If None, a free port is ' - 'automatically chosen.')) + help=( + "Local port to use for the dashboard. If None, a free port is " + "automatically chosen." + ), +) @usage_lib.entrypoint def jobs_dashboard(port: Optional[int]): """Opens a dashboard for managed jobs (needs controller to be UP).""" @@ -3820,14 +4211,17 @@ def jobs_dashboard(port: Optional[int]): # see if the controller is UP first, which is slow; (2) not have to run SSH # port forwarding first (we'd just launch a local dashboard which would make # REST API calls to the controller dashboard server). - click.secho('Checking if jobs controller is up...', fg='yellow') - hint = ('Dashboard is not available if jobs controller is not up. Run a ' - 'managed job first.') + click.secho("Checking if jobs controller is up...", fg="yellow") + hint = ( + "Dashboard is not available if jobs controller is not up. Run a " + "managed job first." + ) backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, stopped_message=hint, non_existent_message=hint, - exit_if_not_accessible=True) + exit_if_not_accessible=True, + ) # SSH forward a free local port to remote's dashboard port. remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT @@ -3836,18 +4230,20 @@ def jobs_dashboard(port: Optional[int]): else: free_port = port ssh_command = ( - f'ssh -qNL {free_port}:localhost:{remote_port} ' - f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}') - click.echo('Forwarding port: ', nl=False) - click.secho(f'{ssh_command}', dim=True) + f"ssh -qNL {free_port}:localhost:{remote_port} " + f"{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}" + ) + click.echo("Forwarding port: ", nl=False) + click.secho(f"{ssh_command}", dim=True) - with subprocess.Popen(ssh_command, shell=True, - start_new_session=True) as ssh_process: + with subprocess.Popen( + ssh_command, shell=True, start_new_session=True + ) as ssh_process: time.sleep(3) # Added delay for ssh_command to initialize. - webbrowser.open(f'http://localhost:{free_port}') + webbrowser.open(f"http://localhost:{free_port}") click.secho( - f'Dashboard is now available at: http://127.0.0.1:{free_port}', - fg='green') + f"Dashboard is now available at: http://127.0.0.1:{free_port}", fg="green" + ) try: ssh_process.wait() except KeyboardInterrupt: @@ -3859,7 +4255,7 @@ def jobs_dashboard(port: Optional[int]): # This happens if jobs controller is auto-stopped. pass finally: - click.echo('Exiting.') + click.echo("Exiting.") # TODO(zhwu): Backward compatibility for the old `sky spot launch` command. 
@@ -3871,10 +4267,9 @@ def spot(): pass -_add_command_alias(jobs, - jobs_launch, - new_group=spot, - override_command_argument={'use_spot': True}) +_add_command_alias( + jobs, jobs_launch, new_group=spot, override_command_argument={"use_spot": True} +) _add_command_alias(jobs, jobs_queue, new_group=spot) _add_command_alias(jobs, jobs_logs, new_group=spot) _add_command_alias(jobs, jobs_cancel, new_group=spot) @@ -3909,9 +4304,9 @@ def _generate_task_with_service( not_supported_cmd: str, ) -> sky.Task: """Generate a task with service section from a service YAML file.""" - is_yaml, _ = _check_yaml(''.join(service_yaml_args)) + is_yaml, _ = _check_yaml("".join(service_yaml_args)) if not is_yaml: - raise click.UsageError('SERVICE_YAML must be a valid YAML file.') + raise click.UsageError("SERVICE_YAML must be a valid YAML file.") env = _merge_env_vars(env_file, env) # We keep nargs=-1 in service_yaml argument to reuse this function. task = _make_task_or_dag_from_entrypoint_with_overrides( @@ -3933,31 +4328,36 @@ def _generate_task_with_service( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - entrypoint_name='Service', + entrypoint_name="Service", ) if isinstance(task, sky.Dag): raise click.UsageError( - _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd)) + _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd) + ) if task.service is None: with ux_utils.print_exception_no_traceback(): - raise ValueError('Service section not found in the YAML file. ' - 'To fix, add a valid `service` field.') + raise ValueError( + "Service section not found in the YAML file. " + "To fix, add a valid `service` field." + ) service_port: Optional[int] = None for requested_resources in list(task.resources): - if requested_resources.ports is None or len( - requested_resources.ports) != 1: + if requested_resources.ports is None or len(requested_resources.ports) != 1: with ux_utils.print_exception_no_traceback(): raise ValueError( - 'Must only specify one port in resources. Each replica ' - 'will use the port specified as application ingress port.') + "Must only specify one port in resources. Each replica " + "will use the port specified as application ingress port." + ) service_port_str = requested_resources.ports[0] if not service_port_str.isdigit(): # For the case when the user specified a port range like 10000-10010 with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Port {service_port_str!r} is not a valid ' - 'port number. Please specify a single port ' - f'instead. Got: {service_port_str!r}') + raise ValueError( + f"Port {service_port_str!r} is not a valid " + "port number. Please specify a single port " + f"instead. Got: {service_port_str!r}" + ) # We request all the replicas using the same port for now, but it # should be fine to allow different replicas to use different ports # in the future. @@ -3966,31 +4366,39 @@ def _generate_task_with_service( service_port = resource_port if service_port != resource_port: with ux_utils.print_exception_no_traceback(): - raise ValueError(f'Got multiple ports: {service_port} and ' - f'{resource_port} in different resources. ' - 'Please specify single port instead.') + raise ValueError( + f"Got multiple ports: {service_port} and " + f"{resource_port} in different resources. " + "Please specify single port instead." 
+ ) return task -@serve.command('up', cls=_DocumentedCodeCommand) -@click.argument('service_yaml', - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) -@click.option('--service-name', - '-n', - default=None, - type=str, - help='A service name. Unique for each service. If not provided, ' - 'a unique name is autogenerated.') +@serve.command("up", cls=_DocumentedCodeCommand) +@click.argument( + "service_yaml", + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) +@click.option( + "--service-name", + "-n", + default=None, + type=str, + help="A service name. Unique for each service. If not provided, " + "a unique name is autogenerated.", +) @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @timeline.event @usage_lib.entrypoint def serve_up( @@ -4064,19 +4472,18 @@ def serve_up( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd='sky serve up', + not_supported_cmd="sky serve up", ) - click.secho('Service Spec:', fg='cyan') + click.secho("Service Spec:", fg="cyan") click.echo(task.service) - click.secho('Each replica will use the following resources (estimated):', - fg='cyan') + click.secho("Each replica will use the following resources (estimated):", fg="cyan") with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - prompt = f'Launching a new service {service_name!r}. Proceed?' + prompt = f"Launching a new service {service_name!r}. Proceed?" if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) @@ -4086,28 +4493,35 @@ def serve_up( # TODO(MaoZiming): Update Doc. # TODO(MaoZiming): Expose mix replica traffic option to user. # Currently, we do not mix traffic from old and new replicas. -@serve.command('update', cls=_DocumentedCodeCommand) -@click.argument('service_name', required=True, type=str) -@click.argument('service_yaml', - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) +@serve.command("update", cls=_DocumentedCodeCommand) +@click.argument("service_name", required=True, type=str) +@click.argument( + "service_yaml", + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option('--mode', - default=serve_lib.DEFAULT_UPDATE_MODE.value, - type=click.Choice([m.value for m in serve_lib.UpdateMode], - case_sensitive=False), - required=False, - help=('Update mode. If "rolling", SkyServe will update the ' - 'service with rolling update. If "blue_green", SkyServe ' - 'will update the service with blue-green update. ')) -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.option( + "--mode", + default=serve_lib.DEFAULT_UPDATE_MODE.value, + type=click.Choice([m.value for m in serve_lib.UpdateMode], case_sensitive=False), + required=False, + help=( + 'Update mode. If "rolling", SkyServe will update the ' + 'service with rolling update. If "blue_green", SkyServe ' + "will update the service with blue-green update. 
" + ), +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @timeline.event @usage_lib.entrypoint def serve_update( @@ -4180,39 +4594,44 @@ def serve_update( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd='sky serve update', + not_supported_cmd="sky serve update", ) - click.secho('Service Spec:', fg='cyan') + click.secho("Service Spec:", fg="cyan") click.echo(task.service) - click.secho('New replica will use the following resources (estimated):', - fg='cyan') + click.secho("New replica will use the following resources (estimated):", fg="cyan") with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - click.confirm(f'Updating service {service_name!r}. Proceed?', - default=True, - abort=True, - show_default=True) + click.confirm( + f"Updating service {service_name!r}. Proceed?", + default=True, + abort=True, + show_default=True, + ) serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode)) -@serve.command('status', cls=_DocumentedCodeCommand) -@click.option('--all', - '-a', - default=False, - is_flag=True, - required=False, - help='Show all information in full.') -@click.option('--endpoint', - default=False, - is_flag=True, - required=False, - help='Show service endpoint.') -@click.argument('service_names', required=False, type=str, nargs=-1) +@serve.command("status", cls=_DocumentedCodeCommand) +@click.option( + "--all", + "-a", + default=False, + is_flag=True, + required=False, + help="Show all information in full.", +) +@click.option( + "--endpoint", + default=False, + is_flag=True, + required=False, + help="Show service endpoint.", +) +@click.argument("service_names", required=False, type=str, nargs=-1) @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, endpoint: bool, service_names: List[str]): @@ -4308,36 +4727,39 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): sky serve status my-service """ # This won't pollute the output of --endpoint. - with rich_utils.safe_status('[cyan]Checking services[/]'): - _, msg = _get_services(service_names, - show_all=all, - show_endpoint=endpoint, - is_called_by_user=True) + with rich_utils.safe_status("[cyan]Checking services[/]"): + _, msg = _get_services( + service_names, show_all=all, show_endpoint=endpoint, is_called_by_user=True + ) if not endpoint: - click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Services{colorama.Style.RESET_ALL}') + click.echo( + f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" + f"Services{colorama.Style.RESET_ALL}" + ) click.echo(msg) -@serve.command('down', cls=_DocumentedCodeCommand) -@click.argument('service_names', required=False, type=str, nargs=-1) -@click.option('--all', - '-a', - default=False, - is_flag=True, - help='Tear down all services.') -@click.option('--purge', - '-p', - default=False, - is_flag=True, - help='Tear down services in failed status.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@serve.command("down", cls=_DocumentedCodeCommand) +@click.argument("service_names", required=False, type=str, nargs=-1) +@click.option( + "--all", "-a", default=False, is_flag=True, help="Tear down all services." 
+) +@click.option( + "--purge", + "-p", + default=False, + is_flag=True, + help="Tear down services in failed status.", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) # pylint: disable=redefined-builtin def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): """Teardown service(s). @@ -4368,50 +4790,62 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): sky serve down failed-service --purge """ if sum([len(service_names) > 0, all]) != 1: - argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( - service_names) > 0 else '' - argument_str += ' --all' if all else '' + argument_str = ( + f'SERVICE_NAMES={",".join(service_names)}' if len(service_names) > 0 else "" + ) + argument_str += " --all" if all else "" raise click.UsageError( - 'Can only specify one of SERVICE_NAMES or --all. ' - f'Provided {argument_str!r}.') + "Can only specify one of SERVICE_NAMES or --all. " + f"Provided {argument_str!r}." + ) backend_utils.is_controller_accessible( controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message='All services should have been terminated.', - exit_if_not_accessible=True) + stopped_message="All services should have been terminated.", + exit_if_not_accessible=True, + ) if not yes: - quoted_service_names = [f'{name!r}' for name in service_names] + quoted_service_names = [f"{name!r}" for name in service_names] service_identity_str = f'service(s) {", ".join(quoted_service_names)}' if all: - service_identity_str = 'all services' - click.confirm(f'Terminating {service_identity_str}. Proceed?', - default=True, - abort=True, - show_default=True) + service_identity_str = "all services" + click.confirm( + f"Terminating {service_identity_str}. Proceed?", + default=True, + abort=True, + show_default=True, + ) serve_lib.down(service_names=service_names, all=all, purge=purge) -@serve.command('logs', cls=_DocumentedCodeCommand) +@serve.command("logs", cls=_DocumentedCodeCommand) @click.option( - '--follow/--no-follow', + "--follow/--no-follow", is_flag=True, default=True, - help=('Follow the logs of the job. [default: --follow] ' - 'If --no-follow is specified, print the log so far and exit.')) -@click.option('--controller', - is_flag=True, - default=False, - required=False, - help='Show the controller logs of this service.') -@click.option('--load-balancer', - is_flag=True, - default=False, - required=False, - help='Show the load balancer logs of this service.') -@click.argument('service_name', required=True, type=str) -@click.argument('replica_id', required=False, type=int) + help=( + "Follow the logs of the job. [default: --follow] " + "If --no-follow is specified, print the log so far and exit." + ), +) +@click.option( + "--controller", + is_flag=True, + default=False, + required=False, + help="Show the controller logs of this service.", +) +@click.option( + "--load-balancer", + is_flag=True, + default=False, + required=False, + help="Show the load balancer logs of this service.", +) +@click.argument("service_name", required=True, type=str) +@click.argument("replica_id", required=False, type=int) @usage_lib.entrypoint # TODO(tian): Add default argument for this CLI if none of the flags are # specified. 
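+# Exactly one log target may be chosen. serve_logs below counts the three
+# selectors with plain bool arithmetic (True == 1): passing, e.g., both
+# --controller and a REPLICA_ID makes the sum 2 and is rejected.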
@@ -4438,13 +4872,16 @@ def serve_logs( sky serve logs [SERVICE_NAME] 1 """ have_replica_id = replica_id is not None - num_flags = (controller + load_balancer + have_replica_id) + num_flags = controller + load_balancer + have_replica_id if num_flags > 1: - raise click.UsageError('At most one of --controller, --load-balancer, ' - '[REPLICA_ID] can be specified.') + raise click.UsageError( + "At most one of --controller, --load-balancer, " + "[REPLICA_ID] can be specified." + ) if num_flags == 0: - raise click.UsageError('One of --controller, --load-balancer, ' - '[REPLICA_ID] must be specified.') + raise click.UsageError( + "One of --controller, --load-balancer, " "[REPLICA_ID] must be specified." + ) if controller: target_component = serve_lib.ServiceComponent.CONTROLLER elif load_balancer: @@ -4454,10 +4891,9 @@ def serve_logs( assert replica_id is not None target_component = serve_lib.ServiceComponent.REPLICA try: - serve_lib.tail_logs(service_name, - target=target_component, - replica_id=replica_id, - follow=follow) + serve_lib.tail_logs( + service_name, target=target_component, replica_id=replica_id, follow=follow + ) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise @@ -4485,71 +4921,84 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]: """ config = common_utils.read_yaml(os.path.expanduser(yaml_path)) if not isinstance(config, dict): - raise ValueError(f'Invalid YAML file: {yaml_path}. ' - 'The YAML file should be parsed into a dictionary.') - if config.get('resources') is None: + raise ValueError( + f"Invalid YAML file: {yaml_path}. " + "The YAML file should be parsed into a dictionary." + ) + if config.get("resources") is None: return None - resources = config['resources'] + resources = config["resources"] if not isinstance(resources, dict): - raise ValueError(f'Invalid resources configuration in {yaml_path}. ' - 'Resources must be a dictionary.') - if resources.get('candidates') is None: + raise ValueError( + f"Invalid resources configuration in {yaml_path}. " + "Resources must be a dictionary." + ) + if resources.get("candidates") is None: return None - candidates = resources['candidates'] + candidates = resources["candidates"] if not isinstance(candidates, list): - raise ValueError('Resource candidates must be a list of dictionaries.') + raise ValueError("Resource candidates must be a list of dictionaries.") for candidate in candidates: if not isinstance(candidate, dict): - raise ValueError('Each resource candidate must be a dictionary.') + raise ValueError("Each resource candidate must be a dictionary.") return candidates -@bench.command('launch', cls=_DocumentedCodeCommand) -@click.argument('entrypoint', - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name)) -@click.option('--benchmark', - '-b', - required=True, - type=str, - help='Benchmark name.') +@bench.command("launch", cls=_DocumentedCodeCommand) +@click.argument( + "entrypoint", + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name), +) +@click.option("--benchmark", "-b", required=True, type=str, help="Benchmark name.") @_add_click_options(_TASK_OPTIONS_WITH_NAME) -@click.option('--gpus', - required=False, - type=str, - help=('Comma-separated list of GPUs to run benchmark on. ' - 'Example values: "T4:4,V100:8" (without blank spaces).')) @click.option( - '--ports', + "--gpus", + required=False, + type=str, + help=( + "Comma-separated list of GPUs to run benchmark on. 
" + 'Example values: "T4:4,V100:8" (without blank spaces).' + ), +) +@click.option( + "--ports", required=False, type=str, multiple=True, - help=('Ports to open on the cluster. ' - 'If specified, overrides the "ports" config in the YAML. '), + help=( + "Ports to open on the cluster. " + 'If specified, overrides the "ports" config in the YAML. ' + ), ) @click.option( - '--idle-minutes-to-autostop', - '-i', + "--idle-minutes-to-autostop", + "-i", default=None, type=int, required=False, - help=('Automatically stop the cluster after this many minutes ' - 'of idleness after setup/file_mounts. This is equivalent to ' - 'running `sky launch -d ...` and then `sky autostop -i `. ' - 'If not set, the cluster will not be autostopped.')) + help=( + "Automatically stop the cluster after this many minutes " + "of idleness after setup/file_mounts. This is equivalent to " + "running `sky launch -d ...` and then `sky autostop -i `. " + "If not set, the cluster will not be autostopped." + ), +) # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. # pylint: disable=bad-docstring-quotes -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint def benchmark_launch( entrypoint: str, @@ -4583,66 +5032,71 @@ def benchmark_launch( env = _merge_env_vars(env_file, env) record = benchmark_state.get_benchmark_from_name(benchmark) if record is not None: - raise click.BadParameter(f'Benchmark {benchmark} already exists. ' - 'To delete the previous benchmark result, ' - f'run `sky bench delete {benchmark}`.') + raise click.BadParameter( + f"Benchmark {benchmark} already exists. " + "To delete the previous benchmark result, " + f"run `sky bench delete {benchmark}`." + ) - entrypoint = ' '.join(entrypoint) + entrypoint = " ".join(entrypoint) if not entrypoint: - raise click.BadParameter('Please specify a task yaml to benchmark.') + raise click.BadParameter("Please specify a task yaml to benchmark.") is_yaml, config = _check_yaml(entrypoint) if not is_yaml: raise click.BadParameter( - 'Sky Benchmark does not support command line tasks. ' - 'Please provide a YAML file.') + "Sky Benchmark does not support command line tasks. " + "Please provide a YAML file." + ) assert config is not None, (is_yaml, config) - click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False) + click.secho("Benchmarking a task from YAML spec: ", fg="yellow", nl=False) click.secho(entrypoint, bold=True) candidates = _get_candidate_configs(entrypoint) # Check if the candidate configs are specified in both CLI and YAML. if candidates is not None: - message = ('is specified in both CLI and resources.candidates ' - 'in the YAML. Please specify only one of them.') + message = ( + "is specified in both CLI and resources.candidates " + "in the YAML. Please specify only one of them." 
+ ) if cloud is not None: - if any('cloud' in candidate for candidate in candidates): - raise click.BadParameter(f'cloud {message}') + if any("cloud" in candidate for candidate in candidates): + raise click.BadParameter(f"cloud {message}") if region is not None: - if any('region' in candidate for candidate in candidates): - raise click.BadParameter(f'region {message}') + if any("region" in candidate for candidate in candidates): + raise click.BadParameter(f"region {message}") if zone is not None: - if any('zone' in candidate for candidate in candidates): - raise click.BadParameter(f'zone {message}') + if any("zone" in candidate for candidate in candidates): + raise click.BadParameter(f"zone {message}") if gpus is not None: - if any('accelerators' in candidate for candidate in candidates): - raise click.BadParameter(f'gpus (accelerators) {message}') + if any("accelerators" in candidate for candidate in candidates): + raise click.BadParameter(f"gpus (accelerators) {message}") if use_spot is not None: - if any('use_spot' in candidate for candidate in candidates): - raise click.BadParameter(f'use_spot {message}') + if any("use_spot" in candidate for candidate in candidates): + raise click.BadParameter(f"use_spot {message}") if image_id is not None: - if any('image_id' in candidate for candidate in candidates): - raise click.BadParameter(f'image_id {message}') + if any("image_id" in candidate for candidate in candidates): + raise click.BadParameter(f"image_id {message}") if disk_size is not None: - if any('disk_size' in candidate for candidate in candidates): - raise click.BadParameter(f'disk_size {message}') + if any("disk_size" in candidate for candidate in candidates): + raise click.BadParameter(f"disk_size {message}") if disk_tier is not None: - if any('disk_tier' in candidate for candidate in candidates): - raise click.BadParameter(f'disk_tier {message}') + if any("disk_tier" in candidate for candidate in candidates): + raise click.BadParameter(f"disk_tier {message}") if ports: - if any('ports' in candidate for candidate in candidates): - raise click.BadParameter(f'ports {message}') + if any("ports" in candidate for candidate in candidates): + raise click.BadParameter(f"ports {message}") # The user can specify the benchmark candidates in either of the two ways: # 1. By specifying resources.candidates in the YAML. # 2. By specifying gpu types as a command line argument (--gpus). override_gpu = None if gpus is not None: - gpu_list = gpus.split(',') + gpu_list = gpus.split(",") gpu_list = [gpu.strip() for gpu in gpu_list] - if ' ' in gpus: - raise click.BadParameter('Remove blanks in --gpus.') + if " " in gpus: + raise click.BadParameter("Remove blanks in --gpus.") if len(gpu_list) == 1: override_gpu = gpu_list[0] @@ -4650,66 +5104,73 @@ def benchmark_launch( # If len(gpu_list) > 1, gpus is interpreted # as a list of benchmark candidates. if candidates is None: - candidates = [{'accelerators': gpu} for gpu in gpu_list] + candidates = [{"accelerators": gpu} for gpu in gpu_list] override_gpu = None else: - raise ValueError('Provide benchmark candidates in either ' - '--gpus or resources.candidates in the YAML.') + raise ValueError( + "Provide benchmark candidates in either " + "--gpus or resources.candidates in the YAML." 
+ ) if candidates is None: candidates = [{}] - if 'resources' not in config: - config['resources'] = {} - resources_config = config['resources'] + if "resources" not in config: + config["resources"] = {} + resources_config = config["resources"] # Override the yaml config with the command line arguments. if name is not None: - config['name'] = name + config["name"] = name if workdir is not None: - config['workdir'] = workdir + config["workdir"] = workdir if num_nodes is not None: - config['num_nodes'] = num_nodes - override_params = _parse_override_params(cloud=cloud, - region=region, - zone=zone, - gpus=override_gpu, - cpus=cpus, - memory=memory, - use_spot=use_spot, - image_id=image_id, - disk_size=disk_size, - disk_tier=disk_tier, - ports=ports) + config["num_nodes"] = num_nodes + override_params = _parse_override_params( + cloud=cloud, + region=region, + zone=zone, + gpus=override_gpu, + cpus=cpus, + memory=memory, + use_spot=use_spot, + image_id=image_id, + disk_size=disk_size, + disk_tier=disk_tier, + ports=ports, + ) _pop_and_ignore_fields_in_override_params( - override_params, field_to_ignore=['cpus', 'memory']) + override_params, field_to_ignore=["cpus", "memory"] + ) resources_config.update(override_params) - if 'cloud' in resources_config: - cloud = resources_config.pop('cloud') + if "cloud" in resources_config: + cloud = resources_config.pop("cloud") if cloud is not None: - resources_config['cloud'] = str(cloud) - if 'region' in resources_config: - if resources_config['region'] is None: - resources_config.pop('region') - if 'zone' in resources_config: - if resources_config['zone'] is None: - resources_config.pop('zone') - if 'accelerators' in resources_config: - if resources_config['accelerators'] is None: - resources_config.pop('accelerators') - if 'image_id' in resources_config: - if resources_config['image_id'] is None: - resources_config.pop('image_id') + resources_config["cloud"] = str(cloud) + if "region" in resources_config: + if resources_config["region"] is None: + resources_config.pop("region") + if "zone" in resources_config: + if resources_config["zone"] is None: + resources_config.pop("zone") + if "accelerators" in resources_config: + if resources_config["accelerators"] is None: + resources_config.pop("accelerators") + if "image_id" in resources_config: + if resources_config["image_id"] is None: + resources_config.pop("image_id") # Fully generate the benchmark candidate configs. clusters, candidate_configs = benchmark_utils.generate_benchmark_configs( - benchmark, config, candidates) + benchmark, config, candidates + ) # Show the benchmarking VM instances selected by the optimizer. # This also detects the case where the user requested infeasible resources. - benchmark_utils.print_benchmark_clusters(benchmark, clusters, config, - candidate_configs) + benchmark_utils.print_benchmark_clusters( + benchmark, clusters, config, candidate_configs + ) if not yes: - plural = 's' if len(candidates) > 1 else '' - prompt = f'Launching {len(candidates)} cluster{plural}. Proceed?' + plural = "s" if len(candidates) > 1 else "" + prompt = f"Launching {len(candidates)} cluster{plural}. Proceed?" click.confirm(prompt, default=True, abort=True, show_default=True) # Configs that are only accepted by the CLI. @@ -4718,96 +5179,96 @@ def benchmark_launch( # the serverless execution. 
if idle_minutes_to_autostop is None: idle_minutes_to_autostop = 5 - commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop + commandline_args["idle-minutes-to-autostop"] = idle_minutes_to_autostop if len(env) > 0: - commandline_args['env'] = [f'{k}={v}' for k, v in env] + commandline_args["env"] = [f"{k}={v}" for k, v in env] # Launch the benchmarking clusters in detach mode in parallel. benchmark_created = benchmark_utils.launch_benchmark_clusters( - benchmark, clusters, candidate_configs, commandline_args) + benchmark, clusters, candidate_configs, commandline_args + ) # If at least one cluster is created, print the following messages. if benchmark_created: logger.info( - f'\n{colorama.Fore.CYAN}Benchmark name: ' - f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}' - '\nTo see the benchmark results: ' - f'{backend_utils.BOLD}sky bench show ' - f'{benchmark}{backend_utils.RESET_BOLD}' - '\nTo teardown the clusters: ' - f'{backend_utils.BOLD}sky bench down ' - f'{benchmark}{backend_utils.RESET_BOLD}') - subprocess_utils.run('sky bench ls') + f"\n{colorama.Fore.CYAN}Benchmark name: " + f"{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}" + "\nTo see the benchmark results: " + f"{backend_utils.BOLD}sky bench show " + f"{benchmark}{backend_utils.RESET_BOLD}" + "\nTo teardown the clusters: " + f"{backend_utils.BOLD}sky bench down " + f"{benchmark}{backend_utils.RESET_BOLD}" + ) + subprocess_utils.run("sky bench ls") else: - logger.error('No benchmarking clusters are created.') - subprocess_utils.run('sky status') + logger.error("No benchmarking clusters are created.") + subprocess_utils.run("sky status") -@bench.command('ls', cls=_DocumentedCodeCommand) +@bench.command("ls", cls=_DocumentedCodeCommand) @usage_lib.entrypoint def benchmark_ls() -> None: """List the benchmark history.""" benchmarks = benchmark_state.get_benchmarks() columns = [ - 'BENCHMARK', - 'TASK', - 'LAUNCHED', + "BENCHMARK", + "TASK", + "LAUNCHED", ] max_num_candidates = 1 for benchmark in benchmarks: - benchmark_results = benchmark_state.get_benchmark_results( - benchmark['name']) + benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"]) num_candidates = len(benchmark_results) if num_candidates > max_num_candidates: max_num_candidates = num_candidates if max_num_candidates == 1: - columns += ['CANDIDATE'] + columns += ["CANDIDATE"] else: - columns += [f'CANDIDATE {i}' for i in range(1, max_num_candidates + 1)] + columns += [f"CANDIDATE {i}" for i in range(1, max_num_candidates + 1)] benchmark_table = log_utils.create_table(columns) for benchmark in benchmarks: - if benchmark['task'] is not None: - task = benchmark['task'] + if benchmark["task"] is not None: + task = benchmark["task"] else: - task = '-' + task = "-" row = [ # BENCHMARK - benchmark['name'], + benchmark["name"], # TASK task, # LAUNCHED - datetime.datetime.fromtimestamp(benchmark['launched_at']), + datetime.datetime.fromtimestamp(benchmark["launched_at"]), ] - benchmark_results = benchmark_state.get_benchmark_results( - benchmark['name']) + benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"]) # RESOURCES for b in benchmark_results: - num_nodes = b['num_nodes'] - resources = b['resources'] - postfix_spot = '[Spot]' if resources.use_spot else '' + num_nodes = b["num_nodes"] + resources = b["resources"] + postfix_spot = "[Spot]" if resources.use_spot else "" instance_type = resources.instance_type + postfix_spot if resources.accelerators is None: - accelerators = '' + 
accelerators = "" else: accelerator, count = list(resources.accelerators.items())[0] - accelerators = f' ({accelerator}:{count})' + accelerators = f" ({accelerator}:{count})" # For brevity, skip the cloud names. - resources_str = f'{num_nodes}x {instance_type}{accelerators}' + resources_str = f"{num_nodes}x {instance_type}{accelerators}" row.append(resources_str) - row += [''] * (max_num_candidates - len(benchmark_results)) + row += [""] * (max_num_candidates - len(benchmark_results)) benchmark_table.add_row(row) if benchmarks: click.echo(benchmark_table) else: - click.echo('No benchmark history found.') + click.echo("No benchmark history found.") -@bench.command('show', cls=_DocumentedCodeCommand) -@click.argument('benchmark', required=True, type=str) +@bench.command("show", cls=_DocumentedCodeCommand) +@click.argument("benchmark", required=True, type=str) # TODO(woosuk): Add --all option to show all the collected information # (e.g., setup time, warmup steps, total steps, etc.). @usage_lib.entrypoint @@ -4815,79 +5276,81 @@ def benchmark_show(benchmark: str) -> None: """Show a benchmark report.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: - raise click.BadParameter(f'Benchmark {benchmark} does not exist.') + raise click.BadParameter(f"Benchmark {benchmark} does not exist.") benchmark_utils.update_benchmark_state(benchmark) click.echo( - textwrap.dedent("""\ + textwrap.dedent( + """\ Legend: - #STEPS: Number of steps taken. - SEC/STEP, $/STEP: Average time (cost) per step. - EST(hr), EST($): Estimated total time (cost) to complete the benchmark. - """)) + """ + ) + ) columns = [ - 'CLUSTER', - 'RESOURCES', - 'STATUS', - 'DURATION', - 'SPENT($)', - '#STEPS', - 'SEC/STEP', - '$/STEP', - 'EST(hr)', - 'EST($)', + "CLUSTER", + "RESOURCES", + "STATUS", + "DURATION", + "SPENT($)", + "#STEPS", + "SEC/STEP", + "$/STEP", + "EST(hr)", + "EST($)", ] cluster_table = log_utils.create_table(columns) rows = [] benchmark_results = benchmark_state.get_benchmark_results(benchmark) for result in benchmark_results: - num_nodes = result['num_nodes'] - resources = result['resources'] + num_nodes = result["num_nodes"] + resources = result["resources"] row = [ # CLUSTER - result['cluster'], + result["cluster"], # RESOURCES - f'{num_nodes}x {resources}', + f"{num_nodes}x {resources}", # STATUS - result['status'].value, + result["status"].value, ] - record = result['record'] - if (record is None or record.start_time is None or - record.last_time is None): - row += ['-'] * (len(columns) - len(row)) + record = result["record"] + if record is None or record.start_time is None or record.last_time is None: + row += ["-"] * (len(columns) - len(row)) rows.append(row) continue - duration_str = log_utils.readable_time_duration(record.start_time, - record.last_time, - absolute=True) + duration_str = log_utils.readable_time_duration( + record.start_time, record.last_time, absolute=True + ) duration = record.last_time - record.start_time spent = num_nodes * resources.get_cost(duration) - spent_str = f'{spent:.4f}' + spent_str = f"{spent:.4f}" num_steps = record.num_steps_so_far if num_steps is None: - num_steps = '-' + num_steps = "-" seconds_per_step = record.seconds_per_step if seconds_per_step is None: - seconds_per_step_str = '-' - cost_per_step_str = '-' + seconds_per_step_str = "-" + cost_per_step_str = "-" else: - seconds_per_step_str = f'{seconds_per_step:.4f}' + seconds_per_step_str = f"{seconds_per_step:.4f}" cost_per_step = num_nodes * resources.get_cost(seconds_per_step) - 
cost_per_step_str = f'{cost_per_step:.6f}' + cost_per_step_str = f"{cost_per_step:.6f}" total_time = record.estimated_total_seconds if total_time is None: - total_time_str = '-' - total_cost_str = '-' + total_time_str = "-" + total_cost_str = "-" else: - total_time_str = f'{total_time / 3600:.2f}' + total_time_str = f"{total_time / 3600:.2f}" total_cost = num_nodes * resources.get_cost(total_time) - total_cost_str = f'{total_cost:.2f}' + total_cost_str = f"{total_cost:.2f}" row += [ # DURATION @@ -4911,45 +5374,51 @@ def benchmark_show(benchmark: str) -> None: click.echo(cluster_table) finished = [ - row for row in rows - if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value + row for row in rows if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value ] - if any(row[5] == '-' for row in finished): + if any(row[5] == "-" for row in finished): # No #STEPS. SkyCallback was unused. click.secho( - 'SkyCallback logs are not found in this benchmark. ' - 'Consider using SkyCallback to get more detailed information ' - 'in real time.', - fg='yellow') - elif any(row[6] != '-' and row[-1] == '-' for row in rows): + "SkyCallback logs are not found in this benchmark. " + "Consider using SkyCallback to get more detailed information " + "in real time.", + fg="yellow", + ) + elif any(row[6] != "-" and row[-1] == "-" for row in rows): # No EST($). total_steps is not specified and cannot be inferred. click.secho( - 'Cannot estimate total time and cost because ' - 'the total number of steps cannot be inferred by SkyCallback. ' - 'To get the estimation, specify the total number of steps in ' - 'either `sky_callback.init` or `Sky*Callback`.', - fg='yellow') + "Cannot estimate total time and cost because " + "the total number of steps cannot be inferred by SkyCallback. " + "To get the estimation, specify the total number of steps in " + "either `sky_callback.init` or `Sky*Callback`.", + fg="yellow", + ) -@bench.command('down', cls=_DocumentedCodeCommand) -@click.argument('benchmark', required=True, type=str) +@bench.command("down", cls=_DocumentedCodeCommand) +@click.argument("benchmark", required=True, type=str) @click.option( - '--exclude', - '-e', - 'clusters_to_exclude', + "--exclude", + "-e", + "clusters_to_exclude", required=False, type=str, multiple=True, - help=('Cluster name(s) to exclude from termination. ' - 'Typically, you might want to see the benchmark results in ' - '`sky bench show` and exclude a "winner" cluster from termination ' - 'to finish the running task.')) -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') + help=( + "Cluster name(s) to exclude from termination. " + "Typically, you might want to see the benchmark results in " + '`sky bench show` and exclude a "winner" cluster from termination ' + "to finish the running task." 
+ ), +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint def benchmark_down( benchmark: str, @@ -4959,7 +5428,7 @@ def benchmark_down( """Tear down all clusters belonging to a benchmark.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: - raise click.BadParameter(f'Benchmark {benchmark} does not exist.') + raise click.BadParameter(f"Benchmark {benchmark} does not exist.") clusters = benchmark_state.get_benchmark_clusters(benchmark) to_stop: List[str] = [] @@ -4970,66 +5439,71 @@ def benchmark_down( continue to_stop.append(cluster) - _down_or_stop_clusters(to_stop, - apply_to_all=False, - down=True, - no_confirm=yes) - - -@bench.command('delete', cls=_DocumentedCodeCommand) -@click.argument('benchmarks', required=False, type=str, nargs=-1) -@click.option('--all', - '-a', - default=None, - is_flag=True, - help='Delete all benchmark reports from the history.') -@click.option('--yes', - '-y', - is_flag=True, - default=False, - required=False, - help='Skip confirmation prompt.') + _down_or_stop_clusters(to_stop, apply_to_all=False, down=True, no_confirm=yes) + + +@bench.command("delete", cls=_DocumentedCodeCommand) +@click.argument("benchmarks", required=False, type=str, nargs=-1) +@click.option( + "--all", + "-a", + default=None, + is_flag=True, + help="Delete all benchmark reports from the history.", +) +@click.option( + "--yes", + "-y", + is_flag=True, + default=False, + required=False, + help="Skip confirmation prompt.", +) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], - yes: bool) -> None: +def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], yes: bool) -> None: """Delete benchmark reports from the history.""" if not benchmarks and all is None: raise click.BadParameter( - 'Either specify benchmarks or use --all to delete all benchmarks.') + "Either specify benchmarks or use --all to delete all benchmarks." + ) to_delete = [] if len(benchmarks) > 0: for benchmark in benchmarks: record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: - print(f'Benchmark {benchmark} not found.') + print(f"Benchmark {benchmark} not found.") else: to_delete.append(record) if all: to_delete = benchmark_state.get_benchmarks() if len(benchmarks) > 0: - print('Both --all and benchmark(s) specified ' - 'for sky bench delete. Letting --all take effect.') + print( + "Both --all and benchmark(s) specified " + "for sky bench delete. Letting --all take effect." + ) - to_delete = [r['name'] for r in to_delete] + to_delete = [r["name"] for r in to_delete] if not to_delete: return - benchmark_list = ', '.join(to_delete) - plural = 's' if len(to_delete) > 1 else '' + benchmark_list = ", ".join(to_delete) + plural = "s" if len(to_delete) > 1 else "" if not yes: click.confirm( - f'Deleting the benchmark{plural}: {benchmark_list}. Proceed?', + f"Deleting the benchmark{plural}: {benchmark_list}. 
Proceed?", default=True, abort=True, - show_default=True) + show_default=True, + ) - progress = rich_progress.Progress(transient=True, - redirect_stdout=False, - redirect_stderr=False) + progress = rich_progress.Progress( + transient=True, redirect_stdout=False, redirect_stderr=False + ) task = progress.add_task( - f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ', - total=len(to_delete)) + f"[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ", + total=len(to_delete), + ) def _delete_benchmark(benchmark: str) -> None: clusters = benchmark_state.get_benchmark_clusters(benchmark) @@ -5040,25 +5514,27 @@ def _delete_benchmark(benchmark: str) -> None: num_clusters = len([r for r in records if r is not None]) if num_clusters > 0: - plural = 's' if num_clusters > 1 else '' - message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} ' - f'has {num_clusters} un-terminated cluster{plural}. ' - f'Terminate the cluster{plural} with ' - f'{backend_utils.BOLD} sky bench down {benchmark} ' - f'{backend_utils.RESET_BOLD} ' - 'before deleting the benchmark report.') + plural = "s" if num_clusters > 1 else "" + message = ( + f"{colorama.Fore.YELLOW}Benchmark {benchmark} " + f"has {num_clusters} un-terminated cluster{plural}. " + f"Terminate the cluster{plural} with " + f"{backend_utils.BOLD} sky bench down {benchmark} " + f"{backend_utils.RESET_BOLD} " + "before deleting the benchmark report." + ) success = False else: - bucket_name = benchmark_state.get_benchmark_from_name( - benchmark)['bucket'] + bucket_name = benchmark_state.get_benchmark_from_name(benchmark)["bucket"] handle = global_user_state.get_handle_from_storage_name(bucket_name) assert handle is not None, bucket_name bucket_type = list(handle.sky_stores.keys())[0] - benchmark_utils.remove_benchmark_logs(benchmark, bucket_name, - bucket_type) + benchmark_utils.remove_benchmark_logs(benchmark, bucket_name, bucket_type) benchmark_state.delete_benchmark(benchmark) - message = (f'{colorama.Fore.GREEN}Benchmark report for ' - f'{benchmark} deleted.{colorama.Style.RESET_ALL}') + message = ( + f"{colorama.Fore.GREEN}Benchmark report for " + f"{benchmark} deleted.{colorama.Style.RESET_ALL}" + ) success = True progress.stop() @@ -5079,12 +5555,13 @@ def local(): pass -@click.option('--gpus/--no-gpus', - default=True, - is_flag=True, - help='Launch cluster without GPU support even ' - 'if GPUs are detected on the host.') -@local.command('up', cls=_DocumentedCodeCommand) +@click.option( + "--gpus/--no-gpus", + default=True, + is_flag=True, + help="Launch cluster without GPU support even " "if GPUs are detected on the host.", +) +@local.command("up", cls=_DocumentedCodeCommand) @usage_lib.entrypoint def local_up(gpus: bool): """Creates a local cluster.""" @@ -5095,36 +5572,39 @@ def local_up(gpus: bool): gpus = gpus and local_gpus_available # Check if ~/.kube/config exists: - if os.path.exists(os.path.expanduser('~/.kube/config')): + if os.path.exists(os.path.expanduser("~/.kube/config")): curr_context = kubernetes_utils.get_current_kube_config_context_name() - skypilot_context = 'kind-skypilot' + skypilot_context = "kind-skypilot" if curr_context is not None and curr_context != skypilot_context: click.echo( - f'Current context in kube config: {curr_context}' - '\nWill automatically switch to kind-skypilot after the local ' - 'cluster is created.') - message_str = 'Creating local cluster{}...' 
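
# [Illustrative aside, not part of the patch] The check above warns and then
# switches contexts deliberately, rather than clobbering the user's active
# kube context. A rough standalone equivalent using the official `kubernetes`
# client (the patch itself goes through SkyPilot's kubernetes_utils wrapper):
from kubernetes import config
from kubernetes.config.config_exception import ConfigException

try:
    _, active = config.list_kube_config_contexts()
    current_context = active["name"]
except ConfigException:
    current_context = None  # no ~/.kube/config, or no current-context set
if current_context is not None and current_context != "kind-skypilot":
    print(f"Current context in kube config: {current_context}")
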
- message_str = message_str.format((' with GPU support (this may take up ' - 'to 15 minutes)') if gpus else '') + f"Current context in kube config: {curr_context}" + "\nWill automatically switch to kind-skypilot after the local " + "cluster is created." + ) + message_str = "Creating local cluster{}..." + message_str = message_str.format( + (" with GPU support (this may take up " "to 15 minutes)") if gpus else "" + ) path_to_package = os.path.dirname(os.path.dirname(__file__)) - up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes', - 'create_cluster.sh') + up_script_path = os.path.join( + path_to_package, "sky/utils/kubernetes", "create_cluster.sh" + ) # Get directory of script and run it from there cwd = os.path.dirname(os.path.abspath(up_script_path)) - run_command = up_script_path + ' --gpus' if gpus else up_script_path + run_command = up_script_path + " --gpus" if gpus else up_script_path run_command = shlex.split(run_command) # Setup logging paths run_timestamp = backend_utils.get_run_timestamp() - log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, - 'local_up.log') - tail_cmd = 'tail -n100 -f ' + log_path + log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_up.log") + tail_cmd = "tail -n100 -f " + log_path click.echo(message_str) style = colorama.Style - click.echo('To view detailed progress: ' - f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}') + click.echo( + "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}" + ) returncode, _, stderr = log_lib.run_with_log( cmd=run_command, @@ -5132,138 +5612,155 @@ def local_up(gpus: bool): require_outputs=True, stream_logs=False, line_processor=log_utils.SkyLocalUpLineProcessor(), - cwd=cwd) + cwd=cwd, + ) # Kind always writes to stderr even if it succeeds. # If the failure happens after the cluster is created, we need # to strip all stderr of "No kind clusters found.", which is # printed when querying with kind get clusters. - stderr = stderr.replace('No kind clusters found.\n', '') + stderr = stderr.replace("No kind clusters found.\n", "") if returncode == 0: cluster_created = True elif returncode == 100: - click.echo(f'{colorama.Fore.GREEN}Local cluster already ' - f'exists.{style.RESET_ALL}\n' - 'If you want to delete it instead, run: sky local down') + click.echo( + f"{colorama.Fore.GREEN}Local cluster already " + f"exists.{style.RESET_ALL}\n" + "If you want to delete it instead, run: sky local down" + ) else: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - 'Failed to create local cluster. ' - f'Full log: {log_path}' - f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}') + "Failed to create local cluster. 
" + f"Full log: {log_path}" + f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}" + ) # Run sky check - with rich_utils.safe_status('[bold cyan]Running sky check...'): - sky_check.check(clouds=['kubernetes'], quiet=True) + with rich_utils.safe_status("[bold cyan]Running sky check..."): + sky_check.check(clouds=["kubernetes"], quiet=True) if cluster_created: # Prepare completion message which shows CPU and GPU count # Get number of CPUs p = subprocess_utils.run( - 'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'', - capture_output=True) - num_cpus = int(p.stdout.decode('utf-8')) + "kubectl get nodes -o jsonpath='{.items[0].status.capacity.cpu}'", + capture_output=True, + ) + num_cpus = int(p.stdout.decode("utf-8")) # GPU count/type parsing - gpu_message = '' - gpu_hint = '' + gpu_message = "" + gpu_hint = "" if gpus: # Get GPU model by querying the node labels - label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.') - gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long + label_name_escaped = "skypilot.co/accelerator".replace(".", "\\.") + gpu_type_cmd = f"kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels['{label_name_escaped}']}}\"" # pylint: disable=line-too-long try: # Run the command and capture the output - gpu_count_output = subprocess.check_output(gpu_type_cmd, - shell=True, - text=True) - gpu_type_str = gpu_count_output.strip() + ' ' + gpu_count_output = subprocess.check_output( + gpu_type_cmd, shell=True, text=True + ) + gpu_type_str = gpu_count_output.strip() + " " except subprocess.CalledProcessError as e: - output = str(e.output.decode('utf-8')) - logger.warning(f'Failed to get GPU type: {output}') - gpu_type_str = '' + output = str(e.output.decode("utf-8")) + logger.warning(f"Failed to get GPU type: {output}") + gpu_type_str = "" # Get number of GPUs (sum of nvidia.com/gpu resources) - gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\'' # pylint: disable=line-too-long + gpu_count_command = "kubectl get nodes -o=jsonpath='{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}' | awk '{sum += $1} END {print sum}'" # pylint: disable=line-too-long try: # Run the command and capture the output - gpu_count_output = subprocess.check_output(gpu_count_command, - shell=True, - text=True) - gpu_count = gpu_count_output.strip( - ) # Remove any extra whitespace - gpu_message = f' and {gpu_count} {gpu_type_str}GPUs' + gpu_count_output = subprocess.check_output( + gpu_count_command, shell=True, text=True + ) + gpu_count = gpu_count_output.strip() # Remove any extra whitespace + gpu_message = f" and {gpu_count} {gpu_type_str}GPUs" except subprocess.CalledProcessError as e: - output = str(e.output.decode('utf-8')) - logger.warning(f'Failed to get GPU count: {output}') - gpu_message = f' with {gpu_type_str}GPU support' + output = str(e.output.decode("utf-8")) + logger.warning(f"Failed to get GPU count: {output}") + gpu_message = f" with {gpu_type_str}GPU support" gpu_hint = ( - '\nHint: To see the list of GPUs in the cluster, ' - 'run \'sky show-gpus --cloud kubernetes\'') if gpus else '' + ( + "\nHint: To see the list of GPUs in the cluster, " + "run 'sky show-gpus --cloud kubernetes'" + ) + if gpus + else "" + ) if num_cpus < 2: - click.echo('Warning: Local cluster has less than 2 CPUs. 
'
-                   'This may cause issues with running tasks.')
+        click.echo(
+            "Warning: Local cluster has less than 2 CPUs. "
+            "This may cause issues with running tasks."
+        )
         click.echo(
-            f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
-            'successfully with '
-            f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
-            'now run tasks locally.'
-            '\nHint: To change the number of CPUs, change your docker '
-            'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
-            f'{gpu_hint}')
+            f"\n{colorama.Fore.GREEN}Local Kubernetes cluster created "
+            "successfully with "
+            f"{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can "
+            "now run tasks locally."
+            "\nHint: To change the number of CPUs, change your docker "
+            "runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info."  # pylint: disable=line-too-long
+            f"{gpu_hint}"
+        )


-@local.command('down', cls=_DocumentedCodeCommand)
+@local.command("down", cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def local_down():
     """Deletes a local cluster."""
     cluster_removed = False

     path_to_package = os.path.dirname(os.path.dirname(__file__))
-    down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
-                                    'delete_cluster.sh')
+    down_script_path = os.path.join(
+        path_to_package, "sky/utils/kubernetes", "delete_cluster.sh"
+    )

     cwd = os.path.dirname(os.path.abspath(down_script_path))
     run_command = shlex.split(down_script_path)

     # Setup logging paths
     run_timestamp = backend_utils.get_run_timestamp()
-    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
-                            'local_down.log')
-    tail_cmd = 'tail -n100 -f ' + log_path
+    log_path = os.path.join(
+        constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_down.log"
+    )
+    tail_cmd = "tail -n100 -f " + log_path

-    with rich_utils.safe_status('[bold cyan]Removing local cluster...'):
+    with rich_utils.safe_status("[bold cyan]Removing local cluster..."):
         style = colorama.Style
-        click.echo('To view detailed progress: '
-                   f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
-        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
-                                                          log_path=log_path,
-                                                          require_outputs=True,
-                                                          stream_logs=False,
-                                                          cwd=cwd)
-        stderr = stderr.replace('No kind clusters found.\n', '')
+        click.echo(
+            "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}"
+        )
+        returncode, stdout, stderr = log_lib.run_with_log(
+            cmd=run_command,
+            log_path=log_path,
+            require_outputs=True,
+            stream_logs=False,
+            cwd=cwd,
+        )
+        stderr = stderr.replace("No kind clusters found.\n", "")

         if returncode == 0:
             cluster_removed = True
         elif returncode == 100:
-            click.echo('\nLocal cluster does not exist.')
+            click.echo("\nLocal cluster does not exist.")
         else:
             with ux_utils.print_exception_no_traceback():
                 raise RuntimeError(
-                    'Failed to create local cluster. '
-                    f'Stdout: {stdout}'
-                    f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
+                    "Failed to delete local cluster. 
" + f"Stdout: {stdout}" + f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}" + ) if cluster_removed: # Run sky check - with rich_utils.safe_status('[bold cyan]Running sky check...'): - sky_check.check(clouds=['kubernetes'], quiet=True) - click.echo( - f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}') + with rich_utils.safe_status("[bold cyan]Running sky check..."): + sky_check.check(clouds=["kubernetes"], quiet=True) + click.echo(f"{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}") def main(): return cli() -if __name__ == '__main__': +if __name__ == "__main__": main() From a7f97fe3814038c059d700efbeda170f2840ecfa Mon Sep 17 00:00:00 2001 From: wizenheimer Date: Tue, 17 Sep 2024 00:09:16 +0530 Subject: [PATCH 5/5] fix: resolve black formatting --- sky/cli.py | 4215 +++++++++++++++++++++++----------------------------- 1 file changed, 1860 insertions(+), 2355 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 69064475ea7..3e5f551d0ee 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -23,7 +23,6 @@ listed in "sky --help". Take care to put logically connected commands close to each other. """ - import copy import datetime import functools @@ -82,10 +81,10 @@ if typing.TYPE_CHECKING: from sky.backends import backend as backend_lib -pd = adaptors_common.LazyImport("pandas") +pd = adaptors_common.LazyImport('pandas') logger = sky_logging.init_logger(__name__) -_CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) _CLUSTER_FLAG_HELP = """\ A cluster name. If provided, either reuse an existing cluster with that name or @@ -97,19 +96,15 @@ _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = ( - "{cluster_num} cluster{plural} {verb}. Please specify {cause} " - "cluster to show its {property}.\nUsage: `sky status --{flag} `" -) + '{cluster_num} cluster{plural} {verb}. Please specify {cause} ' + 'cluster to show its {property}.\nUsage: `sky status --{flag} `') -_ENDPOINTS_RETRY_MESSAGE = ( - "If the cluster was recently started, " "please retry after a while." -) +_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' + 'please retry after a while.') -_DAG_NOT_SUPPORTED_MESSAGE = ( - "YAML specifies a DAG which is only supported by " - "`sky jobs launch`. `{command}` supports a " - "single task only." -) +_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' + '`sky jobs launch`. 
`{command}` supports a ' + 'single task only.') def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: @@ -118,7 +113,7 @@ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: for cluster in clusters: glob_cluster = global_user_state.get_glob_cluster_names(cluster) if len(glob_cluster) == 0 and not silent: - click.echo(f"Cluster {cluster} not found.") + click.echo(f'Cluster {cluster} not found.') glob_clusters.extend(glob_cluster) return list(set(glob_clusters)) @@ -129,153 +124,121 @@ def _get_glob_storages(storages: List[str]) -> List[str]: for storage_object in storages: glob_storage = global_user_state.get_glob_storage_name(storage_object) if len(glob_storage) == 0: - click.echo(f"Storage {storage_object} not found.") + click.echo(f'Storage {storage_object} not found.') glob_storages.extend(glob_storage) return list(set(glob_storages)) def _parse_env_var(env_var: str) -> Tuple[str, str]: """Parse env vars into a (KEY, VAL) pair.""" - if "=" not in env_var: + if '=' not in env_var: value = os.environ.get(env_var) if value is None: - raise click.UsageError(f"{env_var} is not set in local environment.") + raise click.UsageError( + f'{env_var} is not set in local environment.') return (env_var, value) - ret = tuple(env_var.split("=", 1)) + ret = tuple(env_var.split('=', 1)) if len(ret) != 2: raise click.UsageError( - f"Invalid env var: {env_var}. Must be in the form of KEY=VAL " "or KEY." - ) + f'Invalid env var: {env_var}. Must be in the form of KEY=VAL ' + 'or KEY.') return ret[0], ret[1] -def _merge_env_vars( - env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]] -) -> List[Tuple[str, str]]: +def _merge_env_vars(env_dict: Optional[Dict[str, str]], + env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]: """Merges all values from env_list into env_dict.""" if not env_dict: return env_list - for key, value in env_list: + for (key, value) in env_list: env_dict[key] = value return list(env_dict.items()) _TASK_OPTIONS = [ click.option( - "--workdir", + '--workdir', required=False, type=click.Path(exists=True, file_okay=False), - help=( - "If specified, sync this dir to the remote working directory, " - "where the task will be invoked. " - 'Overrides the "workdir" config in the YAML if both are supplied.' - ), - ), + help=('If specified, sync this dir to the remote working directory, ' + 'where the task will be invoked. ' + 'Overrides the "workdir" config in the YAML if both are supplied.' + )), click.option( - "--cloud", + '--cloud', required=False, type=str, - help=( - 'The cloud to use. If specified, overrides the "resources.cloud" ' - 'config. Passing "none" resets the config.' - ), - ), + help=('The cloud to use. If specified, overrides the "resources.cloud" ' + 'config. Passing "none" resets the config.')), click.option( - "--region", + '--region', required=False, type=str, - help=( - "The region to use. If specified, overrides the " - '"resources.region" config. Passing "none" resets the config.' - ), - ), + help=('The region to use. If specified, overrides the ' + '"resources.region" config. Passing "none" resets the config.')), click.option( - "--zone", + '--zone', required=False, type=str, - help=( - "The zone to use. If specified, overrides the " - '"resources.zone" config. Passing "none" resets the config.' - ), - ), + help=('The zone to use. If specified, overrides the ' + '"resources.zone" config. 
Passing "none" resets the config.')), click.option( - "--num-nodes", + '--num-nodes', required=False, type=int, - help=( - "Number of nodes to execute the task on. " - 'Overrides the "num_nodes" config in the YAML if both are ' - "supplied." - ), - ), + help=('Number of nodes to execute the task on. ' + 'Overrides the "num_nodes" config in the YAML if both are ' + 'supplied.')), click.option( - "--cpus", + '--cpus', default=None, type=str, required=False, - help=( - "Number of vCPUs each instance must have (e.g., " - "``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). " - "This is used to automatically select the instance type." - ), - ), + help=('Number of vCPUs each instance must have (e.g., ' + '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). ' + 'This is used to automatically select the instance type.')), click.option( - "--memory", + '--memory', default=None, type=str, required=False, help=( - "Amount of memory each instance must have in GB (e.g., " - "``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))" - ), - ), - click.option( - "--disk-size", - default=None, - type=int, - required=False, - help=("OS disk size in GBs."), - ), - click.option( - "--disk-tier", - default=None, - type=click.Choice( - resources_utils.DiskTier.supported_tiers(), case_sensitive=False - ), - required=False, - help=resources_utils.DiskTier.cli_help_message(), - ), - click.option( - "--use-spot/--no-use-spot", - required=False, - default=None, - help=( - "Whether to request spot instances. If specified, overrides the " - '"resources.use_spot" config.' - ), - ), + 'Amount of memory each instance must have in GB (e.g., ' + '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))' + )), + click.option('--disk-size', + default=None, + type=int, + required=False, + help=('OS disk size in GBs.')), + click.option('--disk-tier', + default=None, + type=click.Choice(resources_utils.DiskTier.supported_tiers(), + case_sensitive=False), + required=False, + help=resources_utils.DiskTier.cli_help_message()), click.option( - "--image-id", + '--use-spot/--no-use-spot', required=False, default=None, - help=( - "Custom image id for launching the instances. " - 'Passing "none" resets the config.' - ), - ), - click.option( - "--env-file", - required=False, - type=dotenv.dotenv_values, - help="""\ + help=('Whether to request spot instances. If specified, overrides the ' + '"resources.use_spot" config.')), + click.option('--image-id', + required=False, + default=None, + help=('Custom image id for launching the instances. ' + 'Passing "none" resets the config.')), + click.option('--env-file', + required=False, + type=dotenv.dotenv_values, + help="""\ Path to a dotenv file with environment variables to set on the remote node. If any values from ``--env-file`` conflict with values set by - ``--env``, the ``--env`` value will be preferred.""", - ), + ``--env``, the ``--env`` value will be preferred."""), click.option( - "--env", + '--env', required=False, type=_parse_env_var, multiple=True, @@ -293,92 +256,79 @@ def _merge_env_vars( 3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the same value of ``$MY_ENV3`` in the local environment.""", - ), + ) ] _TASK_OPTIONS_WITH_NAME = [ - click.option( - "--name", - "-n", - required=False, - type=str, - help=( - 'Task name. Overrides the "name" ' - "config in the YAML if both are supplied." - ), - ), + click.option('--name', + '-n', + required=False, + type=str, + help=('Task name. 
Overrides the "name" ' + 'config in the YAML if both are supplied.')), ] + _TASK_OPTIONS _EXTRA_RESOURCES_OPTIONS = [ click.option( - "--gpus", + '--gpus', required=False, type=str, - help=( - "Type and number of GPUs to use. Example values: " - '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' - "(fractional counts are supported by the scheduling framework). " - "If a new cluster is being launched by this command, this is the " - "resources to provision. If an existing cluster is being reused, this" - " is seen as the task demand, which must fit the cluster's total " - "resources and is used for scheduling the task. " - 'Overrides the "accelerators" ' - "config in the YAML if both are supplied. " - 'Passing "none" resets the config.' - ), - ), + help= + ('Type and number of GPUs to use. Example values: ' + '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' + '(fractional counts are supported by the scheduling framework). ' + 'If a new cluster is being launched by this command, this is the ' + 'resources to provision. If an existing cluster is being reused, this' + ' is seen as the task demand, which must fit the cluster\'s total ' + 'resources and is used for scheduling the task. ' + 'Overrides the "accelerators" ' + 'config in the YAML if both are supplied. ' + 'Passing "none" resets the config.')), click.option( - "--instance-type", - "-t", + '--instance-type', + '-t', required=False, type=str, - help=( - "The instance type to use. If specified, overrides the " - '"resources.instance_type" config. Passing "none" resets the ' - "config." - ), + help=('The instance type to use. If specified, overrides the ' + '"resources.instance_type" config. Passing "none" resets the ' + 'config.'), ), click.option( - "--ports", + '--ports', required=False, type=str, multiple=True, - help=( - "Ports to open on the cluster. " - 'If specified, overrides the "ports" config in the YAML. ' - ), + help=('Ports to open on the cluster. ' + 'If specified, overrides the "ports" config in the YAML. '), ), ] -def _complete_cluster_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_cluster_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for cluster names.""" del ctx, param # Unused. return global_user_state.get_cluster_names_start_with(incomplete) -def _complete_storage_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_storage_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for storage names.""" del ctx, param # Unused. return global_user_state.get_storage_names_start_with(incomplete) -def _complete_file_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_file_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for file names. Returns a special completion marker that tells click to use the shell's default file completion. """ del ctx, param # Unused. 
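
# [Illustrative aside, not part of the patch] Returning
# CompletionItem(..., type='file') from a click completion callback
# (click >= 8.x) tells the shell to fall back to its native filename
# completion. A self-contained sketch of wiring such a callback to an
# argument:
import click
from click.shell_completion import CompletionItem

def _complete_path(ctx, param, incomplete):
    # Defer to bash/zsh/fish file completion instead of listing files here.
    return [CompletionItem(incomplete, type='file')]

@click.command()
@click.argument('entrypoint', shell_complete=_complete_path)
def demo(entrypoint):
    click.echo(entrypoint)
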
- return [click.shell_completion.CompletionItem(incomplete, type="file")] + return [click.shell_completion.CompletionItem(incomplete, type='file')] def _get_click_major_version(): - return int(click.__version__.split(".", maxsplit=1)[0]) + return int(click.__version__.split('.', maxsplit=1)[0]) def _get_shell_complete_args(complete_fn): @@ -388,49 +338,49 @@ def _get_shell_complete_args(complete_fn): return {} -_RELOAD_ZSH_CMD = "source ~/.zshrc" -_RELOAD_FISH_CMD = "source ~/.config/fish/config.fish" -_RELOAD_BASH_CMD = "source ~/.bashrc" +_RELOAD_ZSH_CMD = 'source ~/.zshrc' +_RELOAD_FISH_CMD = 'source ~/.config/fish/config.fish' +_RELOAD_BASH_CMD = 'source ~/.bashrc' -def _install_shell_completion(ctx: click.Context, param: click.Parameter, value: str): +def _install_shell_completion(ctx: click.Context, param: click.Parameter, + value: str): """A callback for installing shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == "auto": - if "SHELL" not in os.environ: + if value == 'auto': + if 'SHELL' not in os.environ: click.secho( - "Cannot auto-detect shell. Please specify shell explicitly.", fg="red" - ) + 'Cannot auto-detect shell. Please specify shell explicitly.', + fg='red') ctx.exit() else: - value = os.path.basename(os.environ["SHELL"]) + value = os.path.basename(os.environ['SHELL']) - zshrc_diff = "\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh" - bashrc_diff = "\n# For SkyPilot shell completion" "\n. ~/.sky/.sky-complete.bash" + zshrc_diff = '\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh' + bashrc_diff = ('\n# For SkyPilot shell completion' + '\n. ~/.sky/.sky-complete.bash') - if value == "bash": + if value == 'bash': install_cmd = f'_SKY_COMPLETE=bash_source sky > \ ~/.sky/.sky-complete.bash && \ echo "{bashrc_diff}" >> ~/.bashrc' - cmd = ( - f'(grep -q "SkyPilot" ~/.bashrc) || ' - f"([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || " - f'(echo "Bash must be version 4 or above." && exit 1))' - ) + cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || ' + f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || ' + f'(echo "Bash must be version 4 or above." 
&& exit 1))') reload_cmd = _RELOAD_BASH_CMD - elif value == "fish": - cmd = "_SKY_COMPLETE=fish_source sky > \ - ~/.config/fish/completions/sky.fish" + elif value == 'fish': + cmd = '_SKY_COMPLETE=fish_source sky > \ + ~/.config/fish/completions/sky.fish' reload_cmd = _RELOAD_FISH_CMD - elif value == "zsh": + elif value == 'zsh': install_cmd = f'_SKY_COMPLETE=zsh_source sky > \ ~/.sky/.sky-complete.zsh && \ echo "{zshrc_diff}" >> ~/.zshrc' @@ -439,48 +389,51 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, value: reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f"Unsupported shell: {value}", fg="red") + click.secho(f'Unsupported shell: {value}', fg='red') ctx.exit() try: - subprocess.run(cmd, shell=True, check=True, executable=shutil.which("bash")) - click.secho(f"Shell completion installed for {value}", fg="green") + subprocess.run(cmd, + shell=True, + check=True, + executable=shutil.which('bash')) + click.secho(f'Shell completion installed for {value}', fg='green') click.echo( - "Completion will take effect once you restart the terminal: " - + click.style(f"{reload_cmd}", bold=True) - ) + 'Completion will take effect once you restart the terminal: ' + + click.style(f'{reload_cmd}', bold=True)) except subprocess.CalledProcessError as e: - click.secho(f"> Installation failed with code {e.returncode}", fg="red") + click.secho(f'> Installation failed with code {e.returncode}', fg='red') ctx.exit() -def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, value: str): +def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, + value: str): """A callback for uninstalling shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == "auto": - if "SHELL" not in os.environ: + if value == 'auto': + if 'SHELL' not in os.environ: click.secho( - "Cannot auto-detect shell. Please specify shell explicitly.", fg="red" - ) + 'Cannot auto-detect shell. 
Please specify shell explicitly.', + fg='red') ctx.exit() else: - value = os.path.basename(os.environ["SHELL"]) + value = os.path.basename(os.environ['SHELL']) - if value == "bash": + if value == 'bash': cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.bashrc && \ sed -i"" -e "/sky-complete.bash/d" ~/.bashrc && \ rm -f ~/.sky/.sky-complete.bash' reload_cmd = _RELOAD_BASH_CMD - elif value == "fish": - cmd = "rm -f ~/.config/fish/completions/sky.fish" + elif value == 'fish': + cmd = 'rm -f ~/.config/fish/completions/sky.fish' reload_cmd = _RELOAD_FISH_CMD - elif value == "zsh": + elif value == 'zsh': cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.zshrc && \ sed -i"" -e "/sky-complete.zsh/d" ~/.zshrc && \ rm -f ~/.sky/.sky-complete.zsh' @@ -488,18 +441,17 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, valu reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f"Unsupported shell: {value}", fg="red") + click.secho(f'Unsupported shell: {value}', fg='red') ctx.exit() try: subprocess.run(cmd, shell=True, check=True) - click.secho(f"Shell completion uninstalled for {value}", fg="green") - click.echo( - "Changes will take effect once you restart the terminal: " - + click.style(f"{reload_cmd}", bold=True) - ) + click.secho(f'Shell completion uninstalled for {value}', fg='green') + click.echo('Changes will take effect once you restart the terminal: ' + + click.style(f'{reload_cmd}', bold=True)) except subprocess.CalledProcessError as e: - click.secho(f"> Uninstallation failed with code {e.returncode}", fg="red") + click.secho(f'> Uninstallation failed with code {e.returncode}', + fg='red') ctx.exit() @@ -515,72 +467,71 @@ def _add_options(func): def _parse_override_params( - cloud: Optional[str] = None, - region: Optional[str] = None, - zone: Optional[str] = None, - gpus: Optional[str] = None, - cpus: Optional[str] = None, - memory: Optional[str] = None, - instance_type: Optional[str] = None, - use_spot: Optional[bool] = None, - image_id: Optional[str] = None, - disk_size: Optional[int] = None, - disk_tier: Optional[str] = None, - ports: Optional[Tuple[str]] = None, -) -> Dict[str, Any]: + cloud: Optional[str] = None, + region: Optional[str] = None, + zone: Optional[str] = None, + gpus: Optional[str] = None, + cpus: Optional[str] = None, + memory: Optional[str] = None, + instance_type: Optional[str] = None, + use_spot: Optional[bool] = None, + image_id: Optional[str] = None, + disk_size: Optional[int] = None, + disk_tier: Optional[str] = None, + ports: Optional[Tuple[str]] = None) -> Dict[str, Any]: """Parses the override parameters into a dictionary.""" override_params: Dict[str, Any] = {} if cloud is not None: - if cloud.lower() == "none": - override_params["cloud"] = None + if cloud.lower() == 'none': + override_params['cloud'] = None else: - override_params["cloud"] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) + override_params['cloud'] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) if region is not None: - if region.lower() == "none": - override_params["region"] = None + if region.lower() == 'none': + override_params['region'] = None else: - override_params["region"] = region + override_params['region'] = region if zone is not None: - if zone.lower() == "none": - override_params["zone"] = None + if zone.lower() == 'none': + override_params['zone'] = None else: - override_params["zone"] = zone + override_params['zone'] = zone if gpus is not None: - if gpus.lower() == "none": - override_params["accelerators"] = None + if gpus.lower() == 'none': + 
override_params['accelerators'] = None else: - override_params["accelerators"] = gpus + override_params['accelerators'] = gpus if cpus is not None: - if cpus.lower() == "none": - override_params["cpus"] = None + if cpus.lower() == 'none': + override_params['cpus'] = None else: - override_params["cpus"] = cpus + override_params['cpus'] = cpus if memory is not None: - if memory.lower() == "none": - override_params["memory"] = None + if memory.lower() == 'none': + override_params['memory'] = None else: - override_params["memory"] = memory + override_params['memory'] = memory if instance_type is not None: - if instance_type.lower() == "none": - override_params["instance_type"] = None + if instance_type.lower() == 'none': + override_params['instance_type'] = None else: - override_params["instance_type"] = instance_type + override_params['instance_type'] = instance_type if use_spot is not None: - override_params["use_spot"] = use_spot + override_params['use_spot'] = use_spot if image_id is not None: - if image_id.lower() == "none": - override_params["image_id"] = None + if image_id.lower() == 'none': + override_params['image_id'] = None else: - override_params["image_id"] = image_id + override_params['image_id'] = image_id if disk_size is not None: - override_params["disk_size"] = disk_size + override_params['disk_size'] = disk_size if disk_tier is not None: - if disk_tier.lower() == "none": - override_params["disk_tier"] = None + if disk_tier.lower() == 'none': + override_params['disk_tier'] = None else: - override_params["disk_tier"] = disk_tier + override_params['disk_tier'] = disk_tier if ports: - override_params["ports"] = ports + override_params['ports'] = ports return override_params @@ -603,12 +554,11 @@ def _launch_with_confirm( if cluster is None: cluster = backend_utils.generate_cluster_name() - clone_source_str = "" + clone_source_str = '' if clone_disk_from is not None: - clone_source_str = f" from the disk of {clone_disk_from!r}" + clone_source_str = f' from the disk of {clone_disk_from!r}' task, _ = backend_utils.check_can_clone_disk_and_override_task( - clone_disk_from, cluster, task - ) + clone_disk_from, cluster, task) with sky.Dag() as dag: dag.add(task) @@ -618,15 +568,13 @@ def _launch_with_confirm( # Show the optimize log before the prompt if the cluster does not exist. try: sky_check.get_cached_enabled_clouds_or_refresh( - raise_if_no_cloud_access=True - ) + raise_if_no_cloud_access=True) except exceptions.NoCloudAccessError as e: # Catch the exception where the public cloud is not enabled, and # make it yellow for better visibility. with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f"{colorama.Fore.YELLOW}{e}" f"{colorama.Style.RESET_ALL}" - ) from e + raise RuntimeError(f'{colorama.Fore.YELLOW}{e}' + f'{colorama.Style.RESET_ALL}') from e dag = sky.optimize(dag) task = dag.tasks[0] @@ -639,18 +587,18 @@ def _launch_with_confirm( # it exists but is STOPPED. prompt = None if maybe_status is None: - cluster_str = "" if cluster is None else f" {cluster!r}" + cluster_str = '' if cluster is None else f' {cluster!r}' prompt = ( - f"Launching a new cluster{cluster_str}{clone_source_str}. " "Proceed?" - ) + f'Launching a new cluster{cluster_str}{clone_source_str}. ' + 'Proceed?') elif maybe_status == status_lib.ClusterStatus.STOPPED: - prompt = f"Restarting the stopped cluster {cluster!r}. Proceed?" + prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?' 
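
# [Illustrative aside, not part of the patch] The branch restored above only
# prompts when launching would create or restart a cluster; an UP cluster is
# reused without confirmation. The same decision table in isolation (plain
# strings stand in for status_lib.ClusterStatus values):
def _confirm_prompt(maybe_status, cluster):
    if maybe_status is None:
        return f'Launching a new cluster {cluster!r}. Proceed?'
    if maybe_status == 'STOPPED':
        return f'Restarting the stopped cluster {cluster!r}. Proceed?'
    return None  # cluster is UP: reuse it silently

assert _confirm_prompt(None, 'dev') == "Launching a new cluster 'dev'. Proceed?"
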
if prompt is not None: confirm_shown = True click.confirm(prompt, default=True, abort=True, show_default=True) if not confirm_shown: - click.secho(f"Running task on cluster {cluster}...", fg="yellow") + click.secho(f'Running task on cluster {cluster}...', fg='yellow') sky.launch( dag, @@ -678,12 +626,12 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: config: Optional[List[Dict[str, Any]]] = None result = None shell_splits = shlex.split(entrypoint) - yaml_file_provided = len(shell_splits) == 1 and ( - shell_splits[0].endswith("yaml") or shell_splits[0].endswith(".yml") - ) - invalid_reason = "" + yaml_file_provided = (len(shell_splits) == 1 and + (shell_splits[0].endswith('yaml') or + shell_splits[0].endswith('.yml'))) + invalid_reason = '' try: - with open(entrypoint, "r", encoding="utf-8") as f: + with open(entrypoint, 'r', encoding='utf-8') as f: try: config = list(yaml.safe_load_all(f)) if config: @@ -698,43 +646,36 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: except yaml.YAMLError as e: if yaml_file_provided: logger.debug(e) - detailed_error = f"\nYAML Error: {e}\n" - invalid_reason = ( - "contains an invalid configuration. " - "Please check syntax.\n" - f"{detailed_error}" - ) + detailed_error = f'\nYAML Error: {e}\n' + invalid_reason = ('contains an invalid configuration. ' + 'Please check syntax.\n' + f'{detailed_error}') is_yaml = False except OSError: if yaml_file_provided: entry_point_path = os.path.expanduser(entrypoint) if not os.path.exists(entry_point_path): - invalid_reason = ( - "does not exist. Please check if the path" " is correct." - ) + invalid_reason = ('does not exist. Please check if the path' + ' is correct.') elif not os.path.isfile(entry_point_path): - invalid_reason = ( - "is not a file. Please check if the path" " is correct." - ) + invalid_reason = ('is not a file. Please check if the path' + ' is correct.') else: - invalid_reason = ( - "yaml.safe_load() failed. Please check if the" " path is correct." - ) + invalid_reason = ('yaml.safe_load() failed. Please check if the' + ' path is correct.') is_yaml = False if not is_yaml: if yaml_file_provided: click.confirm( - f"{entrypoint!r} looks like a yaml path but {invalid_reason}\n" - "It will be treated as a command to be run remotely. Continue?", - abort=True, - ) + f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n' + 'It will be treated as a command to be run remotely. Continue?', + abort=True) return is_yaml, result def _pop_and_ignore_fields_in_override_params( - params: Dict[str, Any], field_to_ignore: List[str] -) -> None: + params: Dict[str, Any], field_to_ignore: List[str]) -> None: """Pops and ignores fields in override params. Args: @@ -748,15 +689,14 @@ def _pop_and_ignore_fields_in_override_params( for field in field_to_ignore: field_value = params.pop(field, None) if field_value is not None: - click.secho( - f"Override param {field}={field_value} is ignored.", fg="yellow" - ) + click.secho(f'Override param {field}={field_value} is ignored.', + fg='yellow') def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Tuple[str, ...], *, - entrypoint_name: str = "Task", + entrypoint_name: str = 'Task', name: Optional[str] = None, workdir: Optional[str] = None, cloud: Optional[str] = None, @@ -783,37 +723,40 @@ def _make_task_or_dag_from_entrypoint_with_overrides( A dag iff the entrypoint is YAML and contains more than 1 task. Otherwise, a task. 
""" - entrypoint = " ".join(entrypoint) + entrypoint = ' '.join(entrypoint) is_yaml, _ = _check_yaml(entrypoint) entrypoint: Optional[str] if is_yaml: # Treat entrypoint as a yaml. - click.secho(f"{entrypoint_name} from YAML spec: ", fg="yellow", nl=False) + click.secho(f'{entrypoint_name} from YAML spec: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) else: if not entrypoint: entrypoint = None else: # Treat entrypoint as a bash command. - click.secho(f"{entrypoint_name} from command: ", fg="yellow", nl=False) + click.secho(f'{entrypoint_name} from command: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) - override_params = _parse_override_params( - cloud=cloud, - region=region, - zone=zone, - gpus=gpus, - cpus=cpus, - memory=memory, - instance_type=instance_type, - use_spot=use_spot, - image_id=image_id, - disk_size=disk_size, - disk_tier=disk_tier, - ports=ports, - ) + override_params = _parse_override_params(cloud=cloud, + region=region, + zone=zone, + gpus=gpus, + cpus=cpus, + memory=memory, + instance_type=instance_type, + use_spot=use_spot, + image_id=image_id, + disk_size=disk_size, + disk_tier=disk_tier, + ports=ports) if field_to_ignore is not None: - _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore) + _pop_and_ignore_fields_in_override_params(override_params, + field_to_ignore) if is_yaml: assert entrypoint is not None @@ -825,17 +768,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # override params. if override_params: click.secho( - f"WARNING: override params {override_params} are ignored, " - "since the yaml file contains multiple tasks.", - fg="yellow", - ) + f'WARNING: override params {override_params} are ignored, ' + 'since the yaml file contains multiple tasks.', + fg='yellow') return dag - assert ( - len(dag.tasks) == 1 - ), f"If you see this, please file an issue; tasks: {dag.tasks}" + assert len(dag.tasks) == 1, ( + f'If you see this, please file an issue; tasks: {dag.tasks}') task = dag.tasks[0] else: - task = sky.Task(name="sky-cmd", run=entrypoint) + task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) # env update has been done for DAG in load_chain_dag_from_yaml for YAML. task.update_envs(env) @@ -846,7 +787,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # job launch specific. if job_recovery is not None: - override_params["job_recovery"] = job_recovery + override_params['job_recovery'] = job_recovery task.set_resources_override(override_params) @@ -866,7 +807,7 @@ class _NaturalOrderGroup(click.Group): def list_commands(self, ctx): return self.commands.keys() - @usage_lib.entrypoint("sky.cli", fallback=True) + @usage_lib.entrypoint('sky.cli', fallback=True) def invoke(self, ctx): return super().invoke(ctx) @@ -878,38 +819,36 @@ class _DocumentedCodeCommand(click.Command): def get_help(self, ctx): help_str = ctx.command.help - ctx.command.help = help_str.replace(".. code-block:: bash\n", "\b") + ctx.command.help = help_str.replace('.. 
code-block:: bash\n', '\b') return super().get_help(ctx) def _with_deprecation_warning( - f, - original_name: str, - alias_name: str, - override_command_argument: Optional[Dict[str, Any]] = None, -): + f, + original_name: str, + alias_name: str, + override_command_argument: Optional[Dict[str, Any]] = None): @functools.wraps(f) def wrapper(self, *args, **kwargs): - override_str = "" + override_str = '' if override_command_argument is not None: overrides = [] for k, v in override_command_argument.items(): if isinstance(v, bool): if v: - overrides.append(f"--{k}") + overrides.append(f'--{k}') else: - overrides.append(f"--no-{k}") + overrides.append(f'--no-{k}') else: overrides.append(f'--{k.replace("_", "-")}={v}') - override_str = " with additional arguments " + " ".join(overrides) + override_str = ' with additional arguments ' + ' '.join(overrides) click.secho( - f"WARNING: `{alias_name}` has been renamed to `{original_name}` " - f"and will be removed in a future release. Please use the " - f"latter{override_str} instead.\n", + f'WARNING: `{alias_name}` has been renamed to `{original_name}` ' + f'and will be removed in a future release. Please use the ' + f'latter{override_str} instead.\n', err=True, - fg="yellow", - ) + fg='yellow') return f(self, *args, **kwargs) return wrapper @@ -918,7 +857,7 @@ def wrapper(self, *args, **kwargs): def _override_arguments(callback, override_command_argument: Dict[str, Any]): def wrapper(*args, **kwargs): - logger.info(f"Overriding arguments: {override_command_argument}") + logger.info(f'Overriding arguments: {override_command_argument}') kwargs.update(override_command_argument) return callback(*args, **kwargs) @@ -940,194 +879,161 @@ def _add_command_alias( if new_command_name is None: new_command_name = command.name if new_group == group and new_command_name == command.name: - raise ValueError("Cannot add an alias to the same command.") + raise ValueError('Cannot add an alias to the same command.') new_command = copy.deepcopy(command) new_command.hidden = hidden new_command.name = new_command_name if override_command_argument: - new_command.callback = _override_arguments( - new_command.callback, override_command_argument - ) + new_command.callback = _override_arguments(new_command.callback, + override_command_argument) - orig = f"sky {group.name} {command.name}" - alias = f"sky {new_group.name} {new_command_name}" + orig = f'sky {group.name} {command.name}' + alias = f'sky {new_group.name} {new_command_name}' if with_warning: new_command.invoke = _with_deprecation_warning( new_command.invoke, orig, alias, - override_command_argument=override_command_argument, - ) + override_command_argument=override_command_argument) new_group.add_command(new_command, name=new_command_name) -def _deprecate_and_hide_command(group, command_to_deprecate, alternative_command): +def _deprecate_and_hide_command(group, command_to_deprecate, + alternative_command): """Hide a command and show a deprecation note, hinting the alternative.""" command_to_deprecate.hidden = True if group is not None: - orig = f"sky {group.name} {command_to_deprecate.name}" + orig = f'sky {group.name} {command_to_deprecate.name}' else: - orig = f"sky {command_to_deprecate.name}" + orig = f'sky {command_to_deprecate.name}' command_to_deprecate.invoke = _with_deprecation_warning( - command_to_deprecate.invoke, alternative_command, orig - ) + command_to_deprecate.invoke, alternative_command, orig) @click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS) -@click.option( - 
"--install-shell-completion", - type=click.Choice(["bash", "zsh", "fish", "auto"]), - callback=_install_shell_completion, - expose_value=False, - is_eager=True, - help="Install shell completion for the specified shell.", -) -@click.option( - "--uninstall-shell-completion", - type=click.Choice(["bash", "zsh", "fish", "auto"]), - callback=_uninstall_shell_completion, - expose_value=False, - is_eager=True, - help="Uninstall shell completion for the specified shell.", -) -@click.version_option(sky.__version__, "--version", "-v", prog_name="skypilot") -@click.version_option( - sky.__commit__, - "--commit", - "-c", - prog_name="skypilot", - message="%(prog)s, commit %(version)s", - help="Show the commit hash and exit", -) +@click.option('--install-shell-completion', + type=click.Choice(['bash', 'zsh', 'fish', 'auto']), + callback=_install_shell_completion, + expose_value=False, + is_eager=True, + help='Install shell completion for the specified shell.') +@click.option('--uninstall-shell-completion', + type=click.Choice(['bash', 'zsh', 'fish', 'auto']), + callback=_uninstall_shell_completion, + expose_value=False, + is_eager=True, + help='Uninstall shell completion for the specified shell.') +@click.version_option(sky.__version__, '--version', '-v', prog_name='skypilot') +@click.version_option(sky.__commit__, + '--commit', + '-c', + prog_name='skypilot', + message='%(prog)s, commit %(version)s', + help='Show the commit hash and exit') def cli(): pass @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "entrypoint", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@click.argument('entrypoint', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--cluster', + '-c', + default=None, + type=str, + **_get_shell_complete_args(_complete_cluster_name), + help=_CLUSTER_FLAG_HELP) +@click.option('--dryrun', + default=False, + is_flag=True, + help='If True, do not actually run the job.') @click.option( - "--cluster", - "-c", - default=None, - type=str, - **_get_shell_complete_args(_complete_cluster_name), - help=_CLUSTER_FLAG_HELP, -) -@click.option( - "--dryrun", - default=False, - is_flag=True, - help="If True, do not actually run the job.", -) -@click.option( - "--detach-setup", - "-s", + '--detach-setup', + '-s', default=False, is_flag=True, - help=( - "If True, run setup in non-interactive mode as part of the job itself. " - "You can safely ctrl-c to detach from logging, and it will not interrupt " - "the setup process. To see the logs again after detaching, use `sky logs`." - " To cancel setup, cancel the job via `sky cancel`. Useful for long-" - "running setup commands." - ), -) + help= + ('If True, run setup in non-interactive mode as part of the job itself. ' + 'You can safely ctrl-c to detach from logging, and it will not interrupt ' + 'the setup process. To see the logs again after detaching, use `sky logs`.' + ' To cancel setup, cancel the job via `sky cancel`. Useful for long-' + 'running setup commands.')) @click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." 
- ), -) -@click.option( - "--docker", - "backend_name", - flag_value=backends.LocalDockerBackend.NAME, - default=False, - help="If used, runs locally inside a docker container.", -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) +@click.option('--docker', + 'backend_name', + flag_value=backends.LocalDockerBackend.NAME, + default=False, + help='If used, runs locally inside a docker container.') @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @click.option( - "--idle-minutes-to-autostop", - "-i", + '--idle-minutes-to-autostop', + '-i', default=None, type=int, required=False, - help=( - "Automatically stop the cluster after this many minutes " - "of idleness, i.e., no running or pending jobs in the cluster's job " - "queue. Idleness gets reset whenever setting-up/running/pending jobs " - "are found in the job queue. " - "Setting this flag is equivalent to " - "running ``sky launch -d ...`` and then ``sky autostop -i ``" - ". If not set, the cluster will not be autostopped." - ), -) + help=('Automatically stop the cluster after this many minutes ' + 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' + 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' + 'are found in the job queue. ' + 'Setting this flag is equivalent to ' + 'running ``sky launch -d ...`` and then ``sky autostop -i ``' + '. If not set, the cluster will not be autostopped.')) @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help=( - "Autodown the cluster: tear down the cluster after all jobs finish " - "(successfully or abnormally). If --idle-minutes-to-autostop is also set, " - "the cluster will be torn down after the specified idle time. " - "Note that if errors occur during provisioning/data syncing/setting up, " - "the cluster will not be torn down for debugging purposes." - ), + help= + ('Autodown the cluster: tear down the cluster after all jobs finish ' + '(successfully or abnormally). If --idle-minutes-to-autostop is also set, ' + 'the cluster will be torn down after the specified idle time. ' + 'Note that if errors occur during provisioning/data syncing/setting up, ' + 'the cluster will not be torn down for debugging purposes.'), ) @click.option( - "--retry-until-up", - "-r", + '--retry-until-up', + '-r', default=False, is_flag=True, required=False, - help=( - "Whether to retry provisioning infinitely until the cluster is up, " - "if we fail to launch the cluster on any possible region/cloud due " - "to unavailability errors." - ), + help=('Whether to retry provisioning infinitely until the cluster is up, ' + 'if we fail to launch the cluster on any possible region/cloud due ' + 'to unavailability errors.'), ) @click.option( - "--yes", - "-y", + '--yes', + '-y', is_flag=True, default=False, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. 
# pylint: disable=bad-docstring-quotes - help="Skip confirmation prompt.", -) -@click.option( - "--no-setup", - is_flag=True, - default=False, - required=False, - help="Skip setup phase when (re-)launching cluster.", -) + help='Skip confirmation prompt.') +@click.option('--no-setup', + is_flag=True, + default=False, + required=False, + help='Skip setup phase when (re-)launching cluster.') @click.option( - "--clone-disk-from", - "--clone", + '--clone-disk-from', + '--clone', default=None, type=str, **_get_shell_complete_args(_complete_cluster_name), - help=( - "[Experimental] Clone disk from an existing cluster to launch " - "a new one. This is useful when the new cluster needs to have " - "the same data on the boot disk as an existing cluster." - ), -) + help=('[Experimental] Clone disk from an existing cluster to launch ' + 'a new one. This is useful when the new cluster needs to have ' + 'the same data on the boot disk as an existing cluster.')) @usage_lib.entrypoint def launch( entrypoint: Tuple[str, ...], @@ -1171,8 +1077,7 @@ def launch( # NOTE(dev): Keep the docstring consistent between the Python API and CLI. env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str="Launching tasks on it" - ) + cluster, operation_str='Launching tasks on it') if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1196,7 +1101,8 @@ def launch( ports=ports, ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError(_DAG_NOT_SUPPORTED_MESSAGE.format(command="sky launch")) + raise click.UsageError( + _DAG_NOT_SUPPORTED_MESSAGE.format(command='sky launch')) task = task_or_dag backend: backends.Backend @@ -1206,66 +1112,55 @@ def launch( backend = backends.CloudVmRayBackend() else: with ux_utils.print_exception_no_traceback(): - raise ValueError(f"{backend_name} backend is not supported.") + raise ValueError(f'{backend_name} backend is not supported.') if task.service is not None: logger.info( - f"{colorama.Fore.YELLOW}Service section will be ignored when using " - f"`sky launch`. {colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}" - "To spin up a service, use SkyServe CLI: " - f"{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up" - f"{colorama.Style.RESET_ALL}" - ) - - _launch_with_confirm( - task, - backend, - cluster, - dryrun=dryrun, - detach_setup=detach_setup, - detach_run=detach_run, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes_to_autostop, - down=down, - retry_until_up=retry_until_up, - no_setup=no_setup, - clone_disk_from=clone_disk_from, - ) + f'{colorama.Fore.YELLOW}Service section will be ignored when using ' + f'`sky launch`. 
{colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}' + 'To spin up a service, use SkyServe CLI: ' + f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up' + f'{colorama.Style.RESET_ALL}') + + _launch_with_confirm(task, + backend, + cluster, + dryrun=dryrun, + detach_setup=detach_setup, + detach_run=detach_run, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + down=down, + retry_until_up=retry_until_up, + no_setup=no_setup, + clone_disk_from=clone_disk_from) @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "cluster", - required=False, - type=str, - **_get_shell_complete_args(_complete_cluster_name), -) +@click.argument('cluster', + required=False, + type=str, + **_get_shell_complete_args(_complete_cluster_name)) @click.option( - "--cluster", - "-c", - "cluster_option", + '--cluster', + '-c', + 'cluster_option', hidden=True, type=str, - help="This is the same as the positional argument, just for consistency.", - **_get_shell_complete_args(_complete_cluster_name), -) -@click.argument( - "entrypoint", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) + help='This is the same as the positional argument, just for consistency.', + **_get_shell_complete_args(_complete_cluster_name)) +@click.argument('entrypoint', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) @click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." - ), -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @usage_lib.entrypoint # pylint: disable=redefined-builtin @@ -1351,24 +1246,23 @@ def exec( """ if cluster_option is None and cluster is None: - raise click.UsageError("Missing argument '[CLUSTER]' and " "'[ENTRYPOINT]...'") + raise click.UsageError('Missing argument \'[CLUSTER]\' and ' + '\'[ENTRYPOINT]...\'') if cluster_option is not None: if cluster is not None: entrypoint = (cluster,) + entrypoint cluster = cluster_option if not entrypoint: - raise click.UsageError("Missing argument '[ENTRYPOINT]...'") + raise click.UsageError('Missing argument \'[ENTRYPOINT]...\'') assert cluster is not None, (cluster, cluster_option, entrypoint) env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str="Executing task on it" - ) + cluster, operation_str='Executing task on it') handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: - raise click.BadParameter( - f"Cluster {cluster!r} not found. " "Use `sky launch` to provision first." - ) + raise click.BadParameter(f'Cluster {cluster!r} not found. ' + 'Use `sky launch` to provision first.') backend = backend_utils.get_backend_from_handle(handle) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -1389,26 +1283,24 @@ def exec( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - field_to_ignore=["cpus", "memory", "disk_size", "disk_tier", "ports"], + field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'], ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError( - "YAML specifies a DAG, while `sky exec` " "supports a single task only." 
-        )
+        raise click.UsageError('YAML specifies a DAG, while `sky exec` '
+                               'supports a single task only.')
     task = task_or_dag
 
-    click.secho(f"Executing task on cluster {cluster}...", fg="yellow")
+    click.secho(f'Executing task on cluster {cluster}...', fg='yellow')
     sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run)
 
 
 def _get_managed_jobs(
-    refresh: bool,
-    skip_finished: bool,
-    show_all: bool,
-    limit_num_jobs_to_show: bool = False,
-    is_called_by_user: bool = False,
-) -> Tuple[Optional[int], str]:
+        refresh: bool,
+        skip_finished: bool,
+        show_all: bool,
+        limit_num_jobs_to_show: bool = False,
+        is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
@@ -1434,35 +1326,30 @@
         usage_lib.messages.usage.set_internal()
         with sky_logging.silent():  # Make the call silent
-            managed_jobs_ = managed_jobs.queue(
-                refresh=refresh, skip_finished=skip_finished
-            )
-        num_in_progress_jobs = len(set(job["job_id"] for job in managed_jobs_))
+            managed_jobs_ = managed_jobs.queue(refresh=refresh,
+                                               skip_finished=skip_finished)
+        num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
     except exceptions.ClusterNotUpError as e:
         controller_status = e.cluster_status
         msg = str(e)
         if controller_status is None:
-            msg += (
-                f" (See: {colorama.Style.BRIGHT}sky jobs -h"
-                f"{colorama.Style.RESET_ALL})"
-            )
-        elif (
-            controller_status == status_lib.ClusterStatus.STOPPED and is_called_by_user
-        ):
-            msg += (
-                f" (See finished managed jobs: {colorama.Style.BRIGHT}"
-                f"sky jobs queue --refresh{colorama.Style.RESET_ALL})"
-            )
+            msg += (f' (See: {colorama.Style.BRIGHT}sky jobs -h'
+                    f'{colorama.Style.RESET_ALL})')
+        elif (controller_status == status_lib.ClusterStatus.STOPPED and
+              is_called_by_user):
+            msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}'
+                    f'sky jobs queue --refresh{colorama.Style.RESET_ALL})')
     except RuntimeError as e:
-        msg = ""
+        msg = ''
         try:
             # Check the controller status again, as the RuntimeError is likely
             # due to the controller being autostopped when querying the jobs.
             controller_type = controller_utils.Controllers.JOBS_CONTROLLER
             record = backend_utils.refresh_cluster_record(
-                controller_type.value.cluster_name, cluster_status_lock_timeout=0
-            )
-            if record is None or record["status"] == status_lib.ClusterStatus.STOPPED:
+                controller_type.value.cluster_name,
+                cluster_status_lock_timeout=0)
+            if (record is None or
+                record['status'] == status_lib.ClusterStatus.STOPPED):
                 msg = controller_type.value.default_hint_if_non_existent
         except Exception:  # pylint: disable=broad-except
             # This is a best effort to find the latest controller status to
@@ -1471,31 +1358,26 @@
             pass
         if not msg:
             msg = (
-                "Failed to query managed jobs due to connection "
-                "issues. Try again later. "
-                f"Details: {common_utils.format_exception(e, use_bracket=True)}"
+                'Failed to query managed jobs due to connection '
+                'issues. Try again later. 
'
+                f'Details: {common_utils.format_exception(e, use_bracket=True)}'
             )
     except Exception as e:  # pylint: disable=broad-except
-        msg = (
-            "Failed to query managed jobs: "
-            f"{common_utils.format_exception(e, use_bracket=True)}"
-        )
+        msg = ('Failed to query managed jobs: '
+               f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        max_jobs_to_show = (
-            _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS if limit_num_jobs_to_show else None
-        )
-        msg = managed_jobs.format_job_table(
-            managed_jobs_, show_all=show_all, max_jobs=max_jobs_to_show
-        )
+        max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
+                            if limit_num_jobs_to_show else None)
+        msg = managed_jobs.format_job_table(managed_jobs_,
+                                            show_all=show_all,
+                                            max_jobs=max_jobs_to_show)
     return num_in_progress_jobs, msg
 
 
-def _get_services(
-    service_names: Optional[List[str]],
-    show_all: bool,
-    show_endpoint: bool,
-    is_called_by_user: bool = False,
-) -> Tuple[Optional[int], str]:
+def _get_services(service_names: Optional[List[str]],
+                  show_all: bool,
+                  show_endpoint: bool,
+                  is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
     """Get service statuses.
 
     Args:
@@ -1524,21 +1406,20 @@
         controller_status = e.cluster_status
         msg = str(e)
         if controller_status is None:
-            msg += (
-                f" (See: {colorama.Style.BRIGHT}sky serve -h"
-                f"{colorama.Style.RESET_ALL})"
-            )
+            msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h'
+                    f'{colorama.Style.RESET_ALL})')
     except RuntimeError as e:
-        msg = ""
+        msg = ''
         try:
             # Check the controller status again, as the RuntimeError is likely
             # due to the controller being autostopped when querying the
             # services.
             controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
             record = backend_utils.refresh_cluster_record(
-                controller_type.value.cluster_name, cluster_status_lock_timeout=0
-            )
-            if record is None or record["status"] == status_lib.ClusterStatus.STOPPED:
+                controller_type.value.cluster_name,
+                cluster_status_lock_timeout=0)
+            if (record is None or
+                    record['status'] == status_lib.ClusterStatus.STOPPED):
                 msg = controller_type.value.default_hint_if_non_existent
         except Exception:  # pylint: disable=broad-except
             # This is a best effort to find the latest controller status to
@@ -1546,128 +1427,92 @@
             pass
         if not msg:
-            msg = (
-                "Failed to fetch service statuses due to connection issues. "
-                "Please try again later. Details: "
-                f"{common_utils.format_exception(e, use_bracket=True)}"
-            )
+            msg = ('Failed to fetch service statuses due to connection issues. '
+                   'Please try again later. Details: '
+                   f'{common_utils.format_exception(e, use_bracket=True)}')
     except Exception as e:  # pylint: disable=broad-except
-        msg = (
-            "Failed to fetch service statuses: "
-            f"{common_utils.format_exception(e, use_bracket=True)}"
-        )
+        msg = ('Failed to fetch service statuses: '
+               f'{common_utils.format_exception(e, use_bracket=True)}')
    else:
        if show_endpoint:
            if len(service_records) != 1:
-                plural = "s" if len(service_records) > 1 else ""
-                service_num = (
-                    str(len(service_records)) if len(service_records) > 0 else "No"
-                )
+                plural = 's' if len(service_records) > 1 else ''
+                service_num = (str(len(service_records))
+                               if len(service_records) > 0 else 'No')
                 raise click.UsageError(
-                    f"{service_num} service{plural} found. Please specify "
-                    "an existing service to show its endpoint. Usage: "
-                    "sky serve status --endpoint "
-                )
+                    f'{service_num} service{plural} found. Please specify '
+                    'an existing service to show its endpoint. 
Usage: '
+                    'sky serve status --endpoint ')
             msg = serve_lib.get_endpoint(service_records[0])
         else:
             msg = serve_lib.format_service_table(service_records, show_all)
-            service_not_found_msg = ""
+            service_not_found_msg = ''
             if service_names is not None:
                 for service_name in service_names:
-                    if not any(
-                        service_name == record["name"] for record in service_records
-                    ):
+                    if not any(service_name == record['name']
+                               for record in service_records):
                         service_not_found_msg += (
-                            f"\nService {service_name!r} not found."
-                        )
+                            f'\nService {service_name!r} not found.')
             if service_not_found_msg:
-                msg += f"\n{service_not_found_msg}"
+                msg += f'\n{service_not_found_msg}'
     return num_services, msg
 
 
 @cli.command()
+@click.option('--all',
+              '-a',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show all information in full.')
 @click.option(
-    "--all",
-    "-a",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Show all information in full.",
-)
-@click.option(
-    "--refresh",
-    "-r",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Query the latest cluster statuses from the cloud provider(s).",
-)
-@click.option(
-    "--ip",
-    default=False,
-    is_flag=True,
-    required=False,
-    help=(
-        "Get the IP address of the head node of a cluster. This "
-        "option will override all other options. For Kubernetes "
-        "clusters, the returned IP address is the internal IP "
-        "of the head pod, and may not be accessible from outside "
-        "the cluster."
-    ),
-)
-@click.option(
-    "--endpoints",
+    '--refresh',
+    '-r',
     default=False,
     is_flag=True,
     required=False,
-    help=(
-        "Get all exposed endpoints and corresponding URLs for a"
-        "cluster. This option will override all other options."
-    ),
-)
-@click.option(
-    "--endpoint",
-    required=False,
-    default=None,
-    type=int,
-    help=(
-        "Get the endpoint URL for the specified port number on the "
-        "cluster. This option will override all other options."
-    ),
-)
+    help='Query the latest cluster statuses from the cloud provider(s).')
+@click.option('--ip',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Get the IP address of the head node of a cluster. This '
+                    'option will override all other options. For Kubernetes '
+                    'clusters, the returned IP address is the internal IP '
+                    'of the head pod, and may not be accessible from outside '
+                    'the cluster.'))
+@click.option('--endpoints',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Get all exposed endpoints and corresponding URLs for a '
+                    'cluster. This option will override all other options.'))
+@click.option('--endpoint',
+              required=False,
+              default=None,
+              type=int,
+              help=('Get the endpoint URL for the specified port number on the '
+                    'cluster. 
This option will override all other options.'))
+@click.option('--show-managed-jobs/--no-show-managed-jobs',
+              default=True,
+              is_flag=True,
+              required=False,
+              help='Also show recent in-progress managed jobs, if any.')
+@click.option('--show-services/--no-show-services',
+              default=True,
+              is_flag=True,
+              required=False,
+              help='Also show sky serve services, if any.')
+@click.argument('clusters',
+                required=False,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_cluster_name))
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def status(
-    all: bool,
-    refresh: bool,
-    ip: bool,
-    endpoints: bool,
-    endpoint: Optional[int],
-    show_managed_jobs: bool,
-    show_services: bool,
-    clusters: List[str],
-):
+def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
+           endpoint: Optional[int], show_managed_jobs: bool,
+           show_services: bool, clusters: List[str]):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Show clusters.
 
@@ -1732,159 +1577,127 @@
     with multiprocessing.Pool(2) as pool:
         # Do not show job queue if user specifies clusters, or if user
         # specifies --ip or --endpoint(s).
-        show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
+        show_managed_jobs = show_managed_jobs and not any(
+            [clusters, ip, endpoints])
         show_endpoints = endpoints or endpoint is not None
         show_single_endpoint = endpoint is not None
         if show_managed_jobs:
             # Run managed job query in parallel to speed up the status query.
             managed_jobs_future = pool.apply_async(
                 _get_managed_jobs,
-                kwds=dict(
-                    refresh=False,
-                    skip_finished=True,
-                    show_all=False,
-                    limit_num_jobs_to_show=not all,
-                    is_called_by_user=False,
-                ),
-            )
+                kwds=dict(refresh=False,
+                          skip_finished=True,
+                          show_all=False,
+                          limit_num_jobs_to_show=not all,
+                          is_called_by_user=False))
 
         show_services = show_services and not clusters and not ip
         if show_services:
             # Run the sky serve service query in parallel to speed up the
             # status query.
-            services_future = pool.apply_async(
-                _get_services,
-                kwds=dict(
-                    service_names=None,
-                    show_all=False,
-                    show_endpoint=False,
-                    is_called_by_user=False,
-                ),
-            )
+            services_future = pool.apply_async(_get_services,
+                                               kwds=dict(
+                                                   service_names=None,
+                                                   show_all=False,
+                                                   show_endpoint=False,
+                                                   is_called_by_user=False))
         if ip or show_endpoints:
             if refresh:
                 raise click.UsageError(
-                    "Using --ip or --endpoint(s) with --refresh is not"
-                    "supported for now. To fix, refresh first, "
-                    "then query the IP or endpoint."
-                )
+                    'Using --ip or --endpoint(s) with --refresh is not '
+                    'supported for now. To fix, refresh first, '
+                    'then query the IP or endpoint.')
 
             if ip and show_endpoints:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        "Cannot specify both --ip and --endpoint(s) "
-                        "at the same time."
-                    )
+                        'Cannot specify both --ip and --endpoint(s) '
+                        'at the same time.')
             if endpoint is not None and endpoints:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        "Cannot specify both --endpoint and --endpoints "
-                        "at the same time." 
- ) + 'Cannot specify both --endpoint and --endpoints ' + 'at the same time.') if len(clusters) != 1: with ux_utils.print_exception_no_traceback(): - plural = "s" if len(clusters) > 1 else "" - cluster_num = str(len(clusters)) if len(clusters) > 0 else "No" - cause = "a single" if len(clusters) > 1 else "an existing" + plural = 's' if len(clusters) > 1 else '' + cluster_num = (str(len(clusters)) + if len(clusters) > 0 else 'No') + cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, - verb="specified", + verb='specified', cause=cause, - property="IP address" if ip else "endpoint(s)", - flag=( - "ip" - if ip - else ( - "endpoint port" - if show_single_endpoint - else "endpoints" - ) - ), - ) - ) + property='IP address' if ip else 'endpoint(s)', + flag='ip' if ip else + ('endpoint port' + if show_single_endpoint else 'endpoints'))) else: - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters" - f"{colorama.Style.RESET_ALL}" - ) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters' + f'{colorama.Style.RESET_ALL}') query_clusters: Optional[List[str]] = None if clusters: query_clusters = _get_glob_clusters(clusters, silent=ip) - cluster_records = core.status(cluster_names=query_clusters, refresh=refresh) + cluster_records = core.status(cluster_names=query_clusters, + refresh=refresh) if ip or show_endpoints: if len(cluster_records) != 1: with ux_utils.print_exception_no_traceback(): - plural = "s" if len(cluster_records) > 1 else "" - cluster_num = ( - str(len(cluster_records)) - if len(cluster_records) > 0 - else f"{clusters[0]!r}" - ) - verb = "found" if len(cluster_records) > 0 else "not found" - cause = "a single" if len(clusters) > 1 else "an existing" + plural = 's' if len(cluster_records) > 1 else '' + cluster_num = (str(len(cluster_records)) + if len(cluster_records) > 0 else + f'{clusters[0]!r}') + verb = 'found' if len(cluster_records) > 0 else 'not found' + cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, verb=verb, cause=cause, - property="IP address" if ip else "endpoint(s)", - flag=( - "ip" - if ip - else ( - "endpoint port" - if show_single_endpoint - else "endpoints" - ) - ), - ) - ) + property='IP address' if ip else 'endpoint(s)', + flag='ip' if ip else + ('endpoint port' + if show_single_endpoint else 'endpoints'))) cluster_record = cluster_records[0] - if cluster_record["status"] != status_lib.ClusterStatus.UP: + if cluster_record['status'] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Cluster {cluster_record["name"]!r} ' "is not in UP status." - ) - handle = cluster_record["handle"] + raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' + 'is not in UP status.') + handle = cluster_record['handle'] if not isinstance(handle, backends.CloudVmRayResourceHandle): with ux_utils.print_exception_no_traceback(): - raise ValueError( - "Querying IP address is not supported " "for local clusters." 
- ) + raise ValueError('Querying IP address is not supported ' + 'for local clusters.') head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = core.endpoints( - cluster_record["name"], endpoint - ).get(endpoint, None) + cluster_endpoint = core.endpoints(cluster_record['name'], + endpoint).get( + endpoint, None) if not cluster_endpoint: raise click.Abort( - f"Endpoint {endpoint} not found for cluster " - f'{cluster_record["name"]!r}.' - ) + f'Endpoint {endpoint} not found for cluster ' + f'{cluster_record["name"]!r}.') click.echo(cluster_endpoint) else: - cluster_endpoints = core.endpoints(cluster_record["name"]) + cluster_endpoints = core.endpoints(cluster_record['name']) assert isinstance(cluster_endpoints, dict) if not cluster_endpoints: - raise click.Abort( - f"No endpoint found for cluster " - f'{cluster_record["name"]!r}.' - ) + raise click.Abort(f'No endpoint found for cluster ' + f'{cluster_record["name"]!r}.') for port, port_endpoint in cluster_endpoints.items(): click.echo( - f"{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}" - f"{colorama.Style.RESET_ALL}: " - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"{port_endpoint}{colorama.Style.RESET_ALL}" - ) + f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' + f'{colorama.Style.RESET_ALL}: ' + f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'{port_endpoint}{colorama.Style.RESET_ALL}') return click.echo(head_ip) return @@ -1892,7 +1705,7 @@ def status( normal_clusters = [] controllers = [] for cluster_record in cluster_records: - cluster_name = cluster_record["name"] + cluster_name = cluster_record['name'] controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controllers.append(cluster_record) @@ -1901,8 +1714,7 @@ def status( num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - normal_clusters + controllers, all - ) + normal_clusters + controllers, all) def _try_get_future_result(future) -> Tuple[bool, Any]: result = None @@ -1916,69 +1728,61 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: managed_jobs_query_interrupted = False if show_managed_jobs: - click.echo( - f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Managed jobs{colorama.Style.RESET_ALL}" - ) - with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Managed jobs{colorama.Style.RESET_ALL}') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): managed_jobs_query_interrupted, result = _try_get_future_result( - managed_jobs_future - ) + managed_jobs_future) if managed_jobs_query_interrupted: # Set to -1, so that the controller is not considered # down, and the hint for showing sky jobs queue # will still be shown. num_in_progress_jobs = -1 - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: num_in_progress_jobs, msg = result click.echo(msg) if num_in_progress_jobs is not None: # jobs controller is UP. 
- job_info = "" + job_info = '' if num_in_progress_jobs > 0: - plural_and_verb = " is" + plural_and_verb = ' is' if num_in_progress_jobs > 1: - plural_and_verb = "s are" + plural_and_verb = 's are' job_info = ( - f"{num_in_progress_jobs} managed job{plural_and_verb} " - "in progress" - ) - if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS: + f'{num_in_progress_jobs} managed job{plural_and_verb} ' + 'in progress') + if (num_in_progress_jobs > + _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS): job_info += ( - f" ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest " - "ones shown)" - ) - job_info += ". " + f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest ' + 'ones shown)') + job_info += '. ' hints.append( - controller_utils.Controllers.JOBS_CONTROLLER.value.in_progress_hint.format( - job_info=job_info - ) - ) + controller_utils.Controllers.JOBS_CONTROLLER.value. + in_progress_hint.format(job_info=job_info)) if show_services: - click.echo( - f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Services{colorama.Style.RESET_ALL}" - ) + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') num_services = None if managed_jobs_query_interrupted: # The pool is terminated, so we cannot run the service query. - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: - with rich_utils.safe_status("[cyan]Checking services[/]"): - interrupted, result = _try_get_future_result(services_future) + with rich_utils.safe_status('[cyan]Checking services[/]'): + interrupted, result = _try_get_future_result( + services_future) if interrupted: num_services = -1 - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: num_services, msg = result click.echo(msg) if num_services is not None: - hints.append( - controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.in_progress_hint - ) + hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. + value.in_progress_hint) if show_managed_jobs or show_services: try: @@ -1995,28 +1799,24 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: if num_pending_autostop > 0 and not refresh: # Don't print this hint if there's no pending autostop or user has # already passed --refresh. - plural_and_verb = " has" + plural_and_verb = ' has' if num_pending_autostop > 1: - plural_and_verb = "s have" - hints.append( - f"* {num_pending_autostop} cluster{plural_and_verb} " - "auto{stop,down} scheduled. Refresh statuses with: " - f"{colorama.Style.BRIGHT}sky status --refresh" - f"{colorama.Style.RESET_ALL}" - ) + plural_and_verb = 's have' + hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} ' + 'auto{stop,down} scheduled. Refresh statuses with: ' + f'{colorama.Style.BRIGHT}sky status --refresh' + f'{colorama.Style.RESET_ALL}') if hints: - click.echo("\n" + "\n".join(hints)) + click.echo('\n' + '\n'.join(hints)) @cli.command() -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @usage_lib.entrypoint def cost_report(all: bool): # pylint: disable=redefined-builtin # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
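A note on the pattern above: `status` fans the managed-jobs and serve queries out through a small multiprocessing pool, and a ctrl-c while waiting is treated as a soft failure rather than aborting the whole command. Below is a minimal, self-contained sketch of that pattern, assuming only the standard library; `slow_query`, `try_get_future_result`, and `run_queries` are hypothetical names used for illustration (the real code dispatches `_get_managed_jobs` and `_get_services`).

import multiprocessing
import time
from typing import Any, Tuple


def slow_query(name: str) -> str:
    # Stand-in for a network round trip to a controller cluster.
    time.sleep(2)
    return f'{name}: ok'


def try_get_future_result(future) -> Tuple[bool, Any]:
    # A ctrl-c while waiting marks the result as interrupted instead of
    # killing the whole command, mirroring _try_get_future_result above.
    result = None
    interrupted = False
    try:
        result = future.get()
    except KeyboardInterrupt:
        interrupted = True
    return interrupted, result


def run_queries() -> None:
    with multiprocessing.Pool(2) as pool:
        futures = {
            name: pool.apply_async(slow_query, (name,))
            for name in ('jobs', 'serve')
        }
        for name, future in futures.items():
            interrupted, result = try_get_future_result(future)
            print(f'{name}: KeyboardInterrupt' if interrupted else result)


if __name__ == '__main__':
    run_queries()

Returning an (interrupted, result) pair lets the caller print a placeholder and still render the rest of the output, which is why the command can show clusters even when the jobs query is interrupted.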
@@ -2042,7 +1842,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin normal_cluster_records = [] controllers = dict() for cluster_record in cluster_records: - cluster_name = cluster_record["name"] + cluster_name = cluster_record['name'] controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controller_name = controller.value.name @@ -2054,139 +1854,111 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin normal_cluster_records.append(cluster_record) total_cost = status_utils.get_total_cost_of_displayed_records( - normal_cluster_records, all - ) + normal_cluster_records, all) status_utils.show_cost_report_table(normal_cluster_records, all) for controller_name, cluster_record in controllers.items(): status_utils.show_cost_report_table( - [cluster_record], all, controller_name=controller_name.capitalize() - ) - total_cost += cluster_record["total_cost"] + [cluster_record], all, controller_name=controller_name.capitalize()) + total_cost += cluster_record['total_cost'] - click.echo( - f"\n{colorama.Style.BRIGHT}" - f"Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}" - ) + click.echo(f'\n{colorama.Style.BRIGHT}' + f'Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}') if not all: click.secho( - f"Showing up to {status_utils.NUM_COST_REPORT_LINES} " - "most recent clusters. " - "To see all clusters in history, " - "pass the --all flag.", - fg="yellow", - ) + f'Showing up to {status_utils.NUM_COST_REPORT_LINES} ' + 'most recent clusters. ' + 'To see all clusters in history, ' + 'pass the --all flag.', + fg='yellow') click.secho( - "This feature is experimental. " - "Costs for clusters with auto{stop,down} " - "scheduled may not be accurate.", - fg="yellow", - ) + 'This feature is experimental. ' + 'Costs for clusters with auto{stop,down} ' + 'scheduled may not be accurate.', + fg='yellow') @cli.command() -@click.option( - "--all-users", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all users' information in full.", -) -@click.option( - "--skip-finished", - "-s", - default=False, - is_flag=True, - required=False, - help="Show only pending/running jobs' information.", -) -@click.argument( - "clusters", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_cluster_name), -) +@click.option('--all-users', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all users\' information in full.') +@click.option('--skip-finished', + '-s', + default=False, + is_flag=True, + required=False, + help='Show only pending/running jobs\' information.') +@click.argument('clusters', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_cluster_name)) @usage_lib.entrypoint def queue(clusters: List[str], skip_finished: bool, all_users: bool): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
"""Show the job queue for cluster(s).""" - click.secho("Fetching and parsing job queue...", fg="yellow") + click.secho('Fetching and parsing job queue...', fg='yellow') if clusters: clusters = _get_glob_clusters(clusters) else: cluster_infos = global_user_state.get_clusters() - clusters = [c["name"] for c in cluster_infos] + clusters = [c['name'] for c in cluster_infos] unsupported_clusters = [] for cluster in clusters: try: job_table = core.queue(cluster, skip_finished, all_users) - except ( - exceptions.CommandError, - ValueError, - exceptions.NotSupportedError, - exceptions.ClusterNotUpError, - exceptions.CloudUserIdentityError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + except (exceptions.CommandError, ValueError, + exceptions.NotSupportedError, exceptions.ClusterNotUpError, + exceptions.CloudUserIdentityError, + exceptions.ClusterOwnerIdentityMismatchError) as e: if isinstance(e, exceptions.NotSupportedError): unsupported_clusters.append(cluster) - click.echo( - f"{colorama.Fore.YELLOW}Failed to get the job queue for " - f"cluster {cluster!r}.{colorama.Style.RESET_ALL}\n" - f" {common_utils.format_exception(e)}" - ) + click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for ' + f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n' + f' {common_utils.format_exception(e)}') continue job_table = job_lib.format_job_queue(job_table) - click.echo(f"\nJob queue of cluster {cluster}\n{job_table}") + click.echo(f'\nJob queue of cluster {cluster}\n{job_table}') if unsupported_clusters: click.secho( - f"Note: Job queues are not supported on clusters: " + f'Note: Job queues are not supported on clusters: ' f'{", ".join(unsupported_clusters)}', - fg="yellow", - ) + fg='yellow') @cli.command() @click.option( - "--sync-down", - "-s", + '--sync-down', + '-s', is_flag=True, default=False, - help="Sync down the logs of a job to the local machine. For a distributed" - " job, a separate log file from each worker will be downloaded.", -) + help='Sync down the logs of a job to the local machine. For a distributed' + ' job, a separate log file from each worker will be downloaded.') @click.option( - "--status", + '--status', is_flag=True, default=False, - help=( - "If specified, do not show logs but exit with a status code for the " - "job's status: 0 for succeeded, or 1 for all other statuses." - ), -) + help=('If specified, do not show logs but exit with a status code for the ' + 'job\'s status: 0 for succeeded, or 1 for all other statuses.')) @click.option( - "--follow/--no-follow", + '--follow/--no-follow', is_flag=True, default=True, - help=( - "Follow the logs of a job. " - "If --no-follow is specified, print the log so far and exit. " - "[default: --follow]" - ), -) -@click.argument( - "cluster", - required=True, - type=str, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.argument("job_ids", type=str, nargs=-1) + help=('Follow the logs of a job. ' + 'If --no-follow is specified, print the log so far and exit. ' + '[default: --follow]')) +@click.argument('cluster', + required=True, + type=str, + **_get_shell_complete_args(_complete_cluster_name)) +@click.argument('job_ids', type=str, nargs=-1) # TODO(zhwu): support logs by job name @usage_lib.entrypoint def logs( @@ -2214,15 +1986,13 @@ def logs( """ if sync_down and status: raise click.UsageError( - "Both --sync_down and --status are specified " - "(ambiguous). To fix: specify at most one of them." - ) + 'Both --sync_down and --status are specified ' + '(ambiguous). 
To fix: specify at most one of them.')
 
     if len(job_ids) > 1 and not sync_down:
         raise click.UsageError(
             f'Cannot stream logs of multiple jobs (IDs: {", ".join(job_ids)}).'
-            "\nPass -s/--sync-down to download the logs instead."
-        )
+            '\nPass -s/--sync-down to download the logs instead.')
 
     job_ids = None if not job_ids else job_ids
 
@@ -2238,9 +2008,8 @@
         # in core.tail_logs.
         job_id = job_ids[0]
         if not job_id.isdigit():
-            raise click.UsageError(
-                f"Invalid job ID {job_id}. " "Job ID must be integers."
-            )
+            raise click.UsageError(f'Invalid job ID {job_id}. '
+                                   'Job ID must be an integer.')
         job_ids_to_query = [int(job_id)]
     else:
         # job_ids is either None or empty list, so it is safe to cast it here.
@@ -2251,50 +2020,42 @@
         # If job_ids is None and no job has been submitted to the cluster,
         # it will return {None: None}.
         if job_id is None:
-            click.secho(f"No job found on cluster {cluster!r}.", fg="red")
+            click.secho(f'No job found on cluster {cluster!r}.', fg='red')
             sys.exit(1)
         job_status = list(job_statuses.values())[0]
-        job_status_str = job_status.value if job_status is not None else "None"
-        click.echo(f"Job {job_id}: {job_status_str}")
+        job_status_str = job_status.value if job_status is not None else 'None'
+        click.echo(f'Job {job_id}: {job_status_str}')
         if job_status == job_lib.JobStatus.SUCCEEDED:
             return
     else:
         if job_status is None:
-            id_str = "" if job_id is None else f"{job_id} "
-            click.secho(f"Job {id_str}not found", fg="red")
+            id_str = '' if job_id is None else f'{job_id} '
+            click.secho(f'Job {id_str}not found', fg='red')
             sys.exit(1)
 
     core.tail_logs(cluster, job_id, follow)
 
 
 @cli.command()
-@click.argument(
-    "cluster",
-    required=True,
-    type=str,
-    **_get_shell_complete_args(_complete_cluster_name),
-)
-@click.option(
-    "--all",
-    "-a",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Cancel all jobs on the specified cluster.",
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
-@click.argument("jobs", required=False, type=int, nargs=-1)
+@click.argument('cluster',
+                required=True,
+                type=str,
+                **_get_shell_complete_args(_complete_cluster_name))
+@click.option('--all',
+              '-a',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Cancel all jobs on the specified cluster.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
+@click.argument('jobs', required=False, type=int, nargs=-1)
 @usage_lib.entrypoint
-def cancel(
-    cluster: str, all: bool, jobs: List[int], yes: bool
-):  # pylint: disable=redefined-builtin, redefined-outer-name
+def cancel(cluster: str, all: bool, jobs: List[int], yes: bool):  # pylint: disable=redefined-builtin, redefined-outer-name
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancel job(s).
 
@@ -2318,30 +2079,26 @@
     job_identity_str = None
     job_ids_to_cancel = None
     if not jobs and not all:
-        click.echo(
-            f"{colorama.Fore.YELLOW}No job IDs or --all provided; "
-            "cancelling the latest running job."
-            f"{colorama.Style.RESET_ALL}"
-        )
-        job_identity_str = "the latest running job"
+        click.echo(f'{colorama.Fore.YELLOW}No job IDs or --all provided; '
+                   'cancelling the latest running job.'
+                   f'{colorama.Style.RESET_ALL}')
+        job_identity_str = 'the latest running job'
     else:
         # Cancelling specific jobs or --all. 
-        job_ids = " ".join(map(str, jobs))
-        plural = "s" if len(job_ids) > 1 else ""
-        job_identity_str = f"job{plural} {job_ids}"
+        job_ids = ' '.join(map(str, jobs))
+        plural = 's' if len(jobs) > 1 else ''
+        job_identity_str = f'job{plural} {job_ids}'
         job_ids_to_cancel = jobs
     if all:
-        job_identity_str = "all jobs"
+        job_identity_str = 'all jobs'
         job_ids_to_cancel = None
-    job_identity_str += f" on cluster {cluster!r}"
+    job_identity_str += f' on cluster {cluster!r}'
 
     if not yes:
-        click.confirm(
-            f"Cancelling {job_identity_str}. Proceed?",
-            default=True,
-            abort=True,
-            show_default=True,
-        )
+        click.confirm(f'Cancelling {job_identity_str}. Proceed?',
+                      default=True,
+                      abort=True,
+                      show_default=True)
 
     try:
         core.cancel(cluster, all=all, job_ids=job_ids_to_cancel)
@@ -2358,23 +2115,21 @@
 
 
 @cli.command(cls=_DocumentedCodeCommand)
-@click.argument(
-    "clusters",
-    nargs=-1,
-    required=False,
-    **_get_shell_complete_args(_complete_cluster_name),
-)
-@click.option(
-    "--all", "-a", default=None, is_flag=True, help="Stop all existing clusters."
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+@click.argument('clusters',
+                nargs=-1,
+                required=False,
+                **_get_shell_complete_args(_complete_cluster_name))
+@click.option('--all',
+              '-a',
+              default=None,
+              is_flag=True,
+              help='Stop all existing clusters.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 @usage_lib.entrypoint
 def stop(
     clusters: List[str],
@@ -2410,58 +2165,49 @@
 
         sky stop -a
     """
-    _down_or_stop_clusters(clusters, apply_to_all=all, down=False, no_confirm=yes)
+    _down_or_stop_clusters(clusters,
+                           apply_to_all=all,
+                           down=False,
+                           no_confirm=yes)
 
 
 @cli.command(cls=_DocumentedCodeCommand)
-@click.argument(
-    "clusters",
-    nargs=-1,
-    required=False,
-    **_get_shell_complete_args(_complete_cluster_name),
-)
+@click.argument('clusters',
+                nargs=-1,
+                required=False,
+                **_get_shell_complete_args(_complete_cluster_name))
+@click.option('--all',
+              '-a',
+              default=None,
+              is_flag=True,
+              help='Apply this command to all existing clusters.')
+@click.option('--idle-minutes',
+              '-i',
+              type=int,
+              default=None,
+              required=False,
+              help=('Set the idle minutes before autostopping the cluster. '
+                    'See the doc above for detailed semantics.'))
 @click.option(
-    "--all",
-    "-a",
-    default=None,
-    is_flag=True,
-    help="Apply this command to all existing clusters.",
-)
-@click.option(
-    "--idle-minutes",
-    "-i",
-    type=int,
-    default=None,
-    required=False,
-    help=(
-        "Set the idle minutes before autostopping the cluster. "
-        "See the doc above for detailed semantics."
-    ),
-)
-@click.option(
-    "--cancel",
+    '--cancel',
     default=False,
     is_flag=True,
     required=False,
-    help="Cancel any currently active auto{stop,down} setting for the "
-    "cluster. No-op if there is no active setting.",
-)
+    help='Cancel any currently active auto{stop,down} setting for the '
+    'cluster. 
No-op if there is no active setting.') @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help="Use autodown (tear down the cluster; non-restartable), instead " - "of autostop (restartable).", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + help='Use autodown (tear down the cluster; non-restartable), instead ' + 'of autostop (restartable).') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint def autostop( clusters: List[str], @@ -2514,108 +2260,89 @@ def autostop( """ if cancel and idle_minutes is not None: raise click.UsageError( - "Only one of --idle-minutes and --cancel should be specified. " - f"cancel: {cancel}, idle_minutes: {idle_minutes}" - ) + 'Only one of --idle-minutes and --cancel should be specified. ' + f'cancel: {cancel}, idle_minutes: {idle_minutes}') if cancel: idle_minutes = -1 elif idle_minutes is None: idle_minutes = 5 - _down_or_stop_clusters( - clusters, - apply_to_all=all, - down=down, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes, - ) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=down, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes) @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Start all existing clusters.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Start all existing clusters.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @click.option( - "--idle-minutes-to-autostop", - "-i", + '--idle-minutes-to-autostop', + '-i', default=None, type=int, required=False, - help=( - "Automatically stop the cluster after this many minutes " - "of idleness, i.e., no running or pending jobs in the cluster's job " - "queue. Idleness gets reset whenever setting-up/running/pending jobs " - "are found in the job queue. " - "Setting this flag is equivalent to " - "running ``sky launch -d ...`` and then ``sky autostop -i ``" - ". If not set, the cluster will not be autostopped." - ), -) + help=('Automatically stop the cluster after this many minutes ' + 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' + 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' + 'are found in the job queue. ' + 'Setting this flag is equivalent to ' + 'running ``sky launch -d ...`` and then ``sky autostop -i ``' + '. If not set, the cluster will not be autostopped.')) @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help=( - "Autodown the cluster: tear down the cluster after specified minutes of " - "idle time after all jobs finish (successfully or abnormally). Requires " - "--idle-minutes-to-autostop to be set." - ), + help= + ('Autodown the cluster: tear down the cluster after specified minutes of ' + 'idle time after all jobs finish (successfully or abnormally). 
Requires ' + '--idle-minutes-to-autostop to be set.'), ) @click.option( - "--retry-until-up", - "-r", + '--retry-until-up', + '-r', default=False, is_flag=True, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. # pylint: disable=bad-docstring-quotes - help=( - "Retry provisioning infinitely until the cluster is up, " - "if we fail to start the cluster due to unavailability errors." - ), + help=('Retry provisioning infinitely until the cluster is up, ' + 'if we fail to start the cluster due to unavailability errors.'), ) @click.option( - "--force", - "-f", + '--force', + '-f', default=False, is_flag=True, required=False, - help=( - "Force start the cluster even if it is already UP. Useful for " - "upgrading the SkyPilot runtime on the cluster." - ), -) + help=('Force start the cluster even if it is already UP. Useful for ' + 'upgrading the SkyPilot runtime on the cluster.')) @usage_lib.entrypoint # pylint: disable=redefined-builtin def start( - clusters: List[str], - all: bool, - yes: bool, - idle_minutes_to_autostop: Optional[int], - down: bool, # pylint: disable=redefined-outer-name - retry_until_up: bool, - force: bool, -): + clusters: List[str], + all: bool, + yes: bool, + idle_minutes_to_autostop: Optional[int], + down: bool, # pylint: disable=redefined-outer-name + retry_until_up: bool, + force: bool): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Restart cluster(s). @@ -2646,48 +2373,43 @@ def start( """ if down and idle_minutes_to_autostop is None: raise click.UsageError( - "--idle-minutes-to-autostop must be set if --down is set." - ) + '--idle-minutes-to-autostop must be set if --down is set.') to_start = [] if not clusters and not all: # UX: frequently users may have only 1 cluster. In this case, be smart # and default to that unique choice. - all_cluster_names = global_user_state.get_cluster_names_start_with("") + all_cluster_names = global_user_state.get_cluster_names_start_with('') if len(all_cluster_names) <= 1: clusters = all_cluster_names else: raise click.UsageError( - "`sky start` requires either a cluster name or glob " - "(see `sky status`), or the -a/--all flag." - ) + '`sky start` requires either a cluster name or glob ' + '(see `sky status`), or the -a/--all flag.') if all: if len(clusters) > 0: - click.echo( - "Both --all and cluster(s) specified for sky start. " - "Letting --all take effect." - ) + click.echo('Both --all and cluster(s) specified for sky start. ' + 'Letting --all take effect.') # Get all clusters that are not controllers. clusters = [ - cluster["name"] + cluster['name'] for cluster in global_user_state.get_clusters() - if controller_utils.Controllers.from_name(cluster["name"]) is None + if controller_utils.Controllers.from_name(cluster['name']) is None ] if not clusters: - click.echo( - "Cluster(s) not found (tip: see `sky status`). Do you " - "mean to use `sky launch` to provision a new cluster?" - ) + click.echo('Cluster(s) not found (tip: see `sky status`). 
Do you ' + 'mean to use `sky launch` to provision a new cluster?') return else: # Get GLOB cluster names clusters = _get_glob_clusters(clusters) for name in clusters: - cluster_status, _ = backend_utils.refresh_cluster_status_handle(name) + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + name) # A cluster may have one of the following states: # # STOPPED - ok to restart @@ -2711,7 +2433,7 @@ def start( # INIT state cluster due to head_ip not being cached). # # This can be replicated by adding `exit 1` to Task.setup. - if not force and cluster_status == status_lib.ClusterStatus.UP: + if (not force and cluster_status == status_lib.ClusterStatus.UP): # An UP cluster; skipping 'sky start' because: # 1. For a really up cluster, this has no effects (ray up -y # --no-restart) anyway. @@ -2722,13 +2444,12 @@ def start( # zombied (remains as stopped in the cloud's UI). # # This is dangerous and unwanted behavior! - click.echo(f"Cluster {name} already has status UP.") + click.echo(f'Cluster {name} already has status UP.') continue assert force or cluster_status in ( status_lib.ClusterStatus.INIT, - status_lib.ClusterStatus.STOPPED, - ), cluster_status + status_lib.ClusterStatus.STOPPED), cluster_status to_start.append(name) if not to_start: return @@ -2742,83 +2463,74 @@ def start( normal_clusters.append(name) if controllers and normal_clusters: # Keep this behavior the same as _down_or_stop_clusters(). - raise click.UsageError( - "Starting controllers with other cluster(s) " - "is currently not supported.\n" - "Please start the former independently." - ) + raise click.UsageError('Starting controllers with other cluster(s) ' + 'is currently not supported.\n' + 'Please start the former independently.') if controllers: bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD if len(controllers) != 1: raise click.UsageError( - "Starting multiple controllers is currently not supported.\n" - "Please start them independently." - ) + 'Starting multiple controllers is currently not supported.\n' + 'Please start them independently.') if idle_minutes_to_autostop is not None: raise click.UsageError( - "Autostop options are currently not allowed when starting the " - "controllers. Use the default autostop settings by directly " - f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}' - ) + 'Autostop options are currently not allowed when starting the ' + 'controllers. Use the default autostop settings by directly ' + f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}') if not yes: - cluster_str = "clusters" if len(to_start) > 1 else "cluster" - cluster_list = ", ".join(to_start) + cluster_str = 'clusters' if len(to_start) > 1 else 'cluster' + cluster_list = ', '.join(to_start) click.confirm( - f"Restarting {len(to_start)} {cluster_str}: " f"{cluster_list}. Proceed?", + f'Restarting {len(to_start)} {cluster_str}: ' + f'{cluster_list}. 
Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) for name in to_start: try: - core.start( - name, idle_minutes_to_autostop, retry_until_up, down=down, force=force - ) - except ( - exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + core.start(name, + idle_minutes_to_autostop, + retry_until_up, + down=down, + force=force) + except (exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError) as e: click.echo(str(e)) else: - click.secho(f"Cluster {name} started.", fg="green") + click.secho(f'Cluster {name} started.', fg='green') @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.option( - "--all", "-a", default=None, is_flag=True, help="Tear down all existing clusters." -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Tear down all existing clusters.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @click.option( - "--yes", - "-y", + '--purge', + '-p', is_flag=True, default=False, required=False, - help="Skip confirmation prompt.", -) -@click.option( - "--purge", - "-p", - is_flag=True, - default=False, - required=False, - help=( - "(Advanced) Forcefully remove the cluster(s) from " - "SkyPilot's cluster table, even if the actual cluster termination " - "failed on the cloud. WARNING: This flag should only be set sparingly" - " in certain manual troubleshooting scenarios; with it set, it is the" - " user's responsibility to ensure there are no leaked instances and " - "related resources." - ), -) + help=('(Advanced) Forcefully remove the cluster(s) from ' + 'SkyPilot\'s cluster table, even if the actual cluster termination ' + 'failed on the cloud. WARNING: This flag should only be set sparingly' + ' in certain manual troubleshooting scenarios; with it set, it is the' + ' user\'s responsibility to ensure there are no leaked instances and ' + 'related resources.')) @usage_lib.entrypoint def down( clusters: List[str], @@ -2854,9 +2566,11 @@ def down( sky down -a """ - _down_or_stop_clusters( - clusters, apply_to_all=all, down=True, no_confirm=yes, purge=purge - ) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=True, + no_confirm=yes, + purge=purge) def _hint_or_raise_for_down_jobs_controller(controller_name: str): @@ -2874,43 +2588,43 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str): controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name - with rich_utils.safe_status("[bold cyan]Checking for in-progress managed jobs[/]"): + with rich_utils.safe_status( + '[bold cyan]Checking for in-progress managed jobs[/]'): try: - managed_jobs_ = managed_jobs.queue(refresh=False, skip_finished=True) + managed_jobs_ = managed_jobs.queue(refresh=False, + skip_finished=True) except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - controller.value.decline_down_when_failed_to_fetch_status_hint - ) + controller.value. 
+                        decline_down_when_failed_to_fetch_status_hint)
             if e.cluster_status is None:
-                click.echo("Managed jobs controller has already been torn down.")
+                click.echo(
+                    'Managed jobs controller has already been torn down.')
                 sys.exit(0)
             # At this point, the managed jobs failed to be fetched due to
             # the controller being STOPPED or just being launched, i.e.,
             # there are no in-progress managed jobs.
             managed_jobs_ = []

-    msg = (
-        f"{colorama.Fore.YELLOW}WARNING: Tearing down the managed "
-        "jobs controller. Please be aware of the following:"
-        f"{colorama.Style.RESET_ALL}"
-        "\n * All logs and status information of the managed "
-        "jobs (output of `sky jobs queue`) will be lost."
-    )
+    msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
+           'jobs controller. Please be aware of the following:'
+           f'{colorama.Style.RESET_ALL}'
+           '\n * All logs and status information of the managed '
+           'jobs (output of `sky jobs queue`) will be lost.')
     click.echo(msg)
     if managed_jobs_:
         job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False)
         msg = controller.value.decline_down_for_dirty_controller_hint
         # Add prefix to each line to align with the bullet point.
-        msg += "\n".join([" " + line for line in job_table.split("\n") if line != ""])
+        msg += '\n'.join(
+            [' ' + line for line in job_table.split('\n') if line != ''])
         with ux_utils.print_exception_no_traceback():
             raise exceptions.NotSupportedError(msg)
     else:
-        click.echo(
-            " * No in-progress managed jobs found. It should be safe to "
-            "terminate (see caveats above)."
-        )
+        click.echo(' * No in-progress managed jobs found. It should be safe to '
+                   'terminate (see caveats above).')


 def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
@@ -2927,17 +2641,17 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
     """
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name
-    with rich_utils.safe_status("[bold cyan]Checking for live services[/]"):
+    with rich_utils.safe_status('[bold cyan]Checking for live services[/]'):
         try:
             services = serve_lib.status()
         except exceptions.ClusterNotUpError as e:
             if controller.value.connection_error_hint in str(e):
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.NotSupportedError(
-                        controller.value.decline_down_when_failed_to_fetch_status_hint
-                    )
+                        controller.value.
+                        decline_down_when_failed_to_fetch_status_hint)
             if e.cluster_status is None:
-                click.echo("Serve controller has already been torn down.")
+                click.echo('Serve controller has already been torn down.')
                 sys.exit(0)
             # At this point, the services failed to be fetched due to the
             # controller being STOPPED or just being launched, i.e., there is
@@ -2945,34 +2659,31 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
             services = []

     if services:
-        service_names = [service["name"] for service in services]
+        service_names = [service['name'] for service in services]
         with ux_utils.print_exception_no_traceback():
-            msg = controller.value.decline_down_for_dirty_controller_hint.format(
-                service_names=", ".join(service_names)
-            )
+            msg = (
+                controller.value.decline_down_for_dirty_controller_hint.format(
+                    service_names=', '.join(service_names)))
             raise exceptions.NotSupportedError(msg)
     # Do nothing for STOPPED state, as it is safe to terminate the cluster.
- click.echo(f"Terminate sky serve controller: {controller_name}.") + click.echo(f'Terminate sky serve controller: {controller_name}.') _CONTROLLER_TO_HINT_OR_RAISE = { - controller_utils.Controllers.JOBS_CONTROLLER: ( - _hint_or_raise_for_down_jobs_controller - ), - controller_utils.Controllers.SKY_SERVE_CONTROLLER: ( - _hint_or_raise_for_down_sky_serve_controller - ), + controller_utils.Controllers.JOBS_CONTROLLER: + (_hint_or_raise_for_down_jobs_controller), + controller_utils.Controllers.SKY_SERVE_CONTROLLER: + (_hint_or_raise_for_down_sky_serve_controller), } def _down_or_stop_clusters( - names: List[str], - apply_to_all: Optional[bool], - down: bool, # pylint: disable=redefined-outer-name - no_confirm: bool, - purge: bool = False, - idle_minutes_to_autostop: Optional[int] = None, -) -> None: + names: List[str], + apply_to_all: Optional[bool], + down: bool, # pylint: disable=redefined-outer-name + no_confirm: bool, + purge: bool = False, + idle_minutes_to_autostop: Optional[int] = None) -> None: """Tears down or (auto-)stops a cluster (or all clusters). Controllers (jobs controller and sky serve controller) can only be @@ -2980,43 +2691,40 @@ def _down_or_stop_clusters( via glob). """ if down: - command = "down" + command = 'down' elif idle_minutes_to_autostop is not None: - command = "autostop" + command = 'autostop' else: - command = "stop" + command = 'stop' if not names and apply_to_all is None: # UX: frequently users may have only 1 cluster. In this case, 'sky # stop/down' without args should be smart and default to that unique # choice. - all_cluster_names = global_user_state.get_cluster_names_start_with("") + all_cluster_names = global_user_state.get_cluster_names_start_with('') if len(all_cluster_names) <= 1: names = all_cluster_names else: raise click.UsageError( - f"`sky {command}` requires either a cluster name or glob " - "(see `sky status`), or the -a/--all flag." - ) + f'`sky {command}` requires either a cluster name or glob ' + '(see `sky status`), or the -a/--all flag.') - operation = "Terminating" if down else "Stopping" + operation = 'Terminating' if down else 'Stopping' if idle_minutes_to_autostop is not None: is_cancel = idle_minutes_to_autostop < 0 - verb = "Cancelling" if is_cancel else "Scheduling" - option_str = "down" if down else "stop" + verb = 'Cancelling' if is_cancel else 'Scheduling' + option_str = 'down' if down else 'stop' if is_cancel: - option_str = "{stop,down}" - operation = f"{verb} auto{option_str} on" + option_str = '{stop,down}' + operation = f'{verb} auto{option_str} on' if len(names) > 0: controllers = [ - name - for name in names + name for name in names if controller_utils.Controllers.from_name(name) is not None ] - controllers_str = ", ".join(map(repr, controllers)) + controllers_str = ', '.join(map(repr, controllers)) names = [ - name - for name in _get_glob_clusters(names) + name for name in _get_glob_clusters(names) if controller_utils.Controllers.from_name(name) is None ] @@ -3024,27 +2732,25 @@ def _down_or_stop_clusters( # normal clusters. if controllers: if len(names) != 0: - names_str = ", ".join(map(repr, names)) + names_str = ', '.join(map(repr, names)) raise click.UsageError( - f"{operation} controller(s) " - f"{controllers_str} with other cluster(s) " - f"{names_str} is currently not supported.\n" - f"Please omit the controller(s) {controllers}." 
- ) + f'{operation} controller(s) ' + f'{controllers_str} with other cluster(s) ' + f'{names_str} is currently not supported.\n' + f'Please omit the controller(s) {controllers}.') if len(controllers) > 1: raise click.UsageError( - f"{operation} multiple controllers " - f"{controllers_str} is currently not supported.\n" - f"Please specify only one controller." - ) + f'{operation} multiple controllers ' + f'{controllers_str} is currently not supported.\n' + f'Please specify only one controller.') controller_name = controllers[0] if not down: raise click.UsageError( - f"{operation} controller(s) " - f"{controllers_str} is currently not supported." - ) + f'{operation} controller(s) ' + f'{controllers_str} is currently not supported.') else: - controller = controller_utils.Controllers.from_name(controller_name) + controller = controller_utils.Controllers.from_name( + controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] try: @@ -3056,30 +2762,21 @@ def _down_or_stop_clusters( # managed job or service. We should make this check atomic # with the termination. hint_or_raise(controller_name) - except ( - exceptions.ClusterOwnerIdentityMismatchError, - RuntimeError, - ) as e: + except (exceptions.ClusterOwnerIdentityMismatchError, + RuntimeError) as e: if purge: click.echo(common_utils.format_exception(e)) else: raise - confirm_str = "delete" - input_prefix = ( - ( - "Since --purge is set, errors will be ignored " - "and controller will be removed from " - "local state.\n" - ) - if purge - else "" - ) + confirm_str = 'delete' + input_prefix = ('Since --purge is set, errors will be ignored ' + 'and controller will be removed from ' + 'local state.\n') if purge else '' user_input = click.prompt( - f"{input_prefix}" - f"To proceed, please type {colorama.Style.BRIGHT}" - f"{confirm_str!r}{colorama.Style.RESET_ALL}", - type=str, - ) + f'{input_prefix}' + f'To proceed, please type {colorama.Style.BRIGHT}' + f'{confirm_str!r}{colorama.Style.RESET_ALL}', + type=str) if user_input != confirm_str: raise click.Abort() no_confirm = True @@ -3089,15 +2786,14 @@ def _down_or_stop_clusters( all_clusters = global_user_state.get_clusters() if len(names) > 0: click.echo( - f"Both --all and cluster(s) specified for `sky {command}`. " - "Letting --all take effect." - ) + f'Both --all and cluster(s) specified for `sky {command}`. ' + 'Letting --all take effect.') # We should not remove controllers when --all is specified. # Otherwise, it would be very easy to accidentally delete a controller. names = [ - record["name"] + record['name'] for record in all_clusters - if controller_utils.Controllers.from_name(record["name"]) is None + if controller_utils.Controllers.from_name(record['name']) is None ] clusters = [] @@ -3112,54 +2808,51 @@ def _down_or_stop_clusters( usage_lib.record_cluster_name_for_current_operation(clusters) if not clusters: - click.echo("Cluster(s) not found (tip: see `sky status`).") + click.echo('Cluster(s) not found (tip: see `sky status`).') return if not no_confirm and len(clusters) > 0: - cluster_str = "clusters" if len(clusters) > 1 else "cluster" - cluster_list = ", ".join(clusters) + cluster_str = 'clusters' if len(clusters) > 1 else 'cluster' + cluster_list = ', '.join(clusters) click.confirm( - f"{operation} {len(clusters)} {cluster_str}: " f"{cluster_list}. Proceed?", + f'{operation} {len(clusters)} {cluster_str}: ' + f'{cluster_list}. 
Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) - plural = "s" if len(clusters) > 1 else "" - progress = rich_progress.Progress( - transient=True, redirect_stdout=False, redirect_stderr=False - ) + plural = 's' if len(clusters) > 1 else '' + progress = rich_progress.Progress(transient=True, + redirect_stdout=False, + redirect_stderr=False) task = progress.add_task( - f"[bold cyan]{operation} {len(clusters)} cluster{plural}[/]", - total=len(clusters), - ) + f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]', + total=len(clusters)) def _down_or_stop(name: str): success_progress = False if idle_minutes_to_autostop is not None: try: core.autostop(name, idle_minutes_to_autostop, down) - except (exceptions.NotSupportedError, exceptions.ClusterNotUpError) as e: + except (exceptions.NotSupportedError, + exceptions.ClusterNotUpError) as e: message = str(e) else: # no exception raised success_progress = True - message = ( - f"{colorama.Fore.GREEN}{operation} " - f"cluster {name!r}...done{colorama.Style.RESET_ALL}" - ) + message = (f'{colorama.Fore.GREEN}{operation} ' + f'cluster {name!r}...done{colorama.Style.RESET_ALL}') if idle_minutes_to_autostop >= 0: - option_str = "down" if down else "stop" - passive_str = "downed" if down else "stopped" - plural = "s" if idle_minutes_to_autostop != 1 else "" + option_str = 'down' if down else 'stop' + passive_str = 'downed' if down else 'stopped' + plural = 's' if idle_minutes_to_autostop != 1 else '' message += ( - f"\n The cluster will be auto{passive_str} after " - f"{idle_minutes_to_autostop} minute{plural} of " - "idleness." - f"\n To cancel the auto{option_str}, run: " - f"{colorama.Style.BRIGHT}" - f"sky autostop {name} --cancel" - f"{colorama.Style.RESET_ALL}" - ) + f'\n The cluster will be auto{passive_str} after ' + f'{idle_minutes_to_autostop} minute{plural} of ' + 'idleness.' + f'\n To cancel the auto{option_str}, run: ' + f'{colorama.Style.BRIGHT}' + f'sky autostop {name} --cancel' + f'{colorama.Style.RESET_ALL}') else: try: if down: @@ -3168,26 +2861,20 @@ def _down_or_stop(name: str): core.stop(name, purge=purge) except RuntimeError as e: message = ( - f"{colorama.Fore.RED}{operation} cluster {name}...failed. " - f"{colorama.Style.RESET_ALL}" - f"\nReason: {common_utils.format_exception(e)}." - ) - except ( - exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + f'{colorama.Fore.RED}{operation} cluster {name}...failed. ' + f'{colorama.Style.RESET_ALL}' + f'\nReason: {common_utils.format_exception(e)}.') + except (exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError) as e: message = str(e) else: # no exception raised message = ( - f"{colorama.Fore.GREEN}{operation} cluster {name}...done." - f"{colorama.Style.RESET_ALL}" - ) + f'{colorama.Fore.GREEN}{operation} cluster {name}...done.' 
+ f'{colorama.Style.RESET_ALL}') if not down: - message += ( - "\n To restart the cluster, run: " - f"{colorama.Style.BRIGHT}sky start {name}" - f"{colorama.Style.RESET_ALL}" - ) + message += ('\n To restart the cluster, run: ' + f'{colorama.Style.BRIGHT}sky start {name}' + f'{colorama.Style.RESET_ALL}') success_progress = True progress.stop() @@ -3204,14 +2891,12 @@ def _down_or_stop(name: str): @cli.command(cls=_DocumentedCodeCommand) -@click.argument("clouds", required=False, type=str, nargs=-1) -@click.option( - "--verbose", - "-v", - is_flag=True, - default=False, - help="Show the activated account for each cloud.", -) +@click.argument('clouds', required=False, type=str, nargs=-1) +@click.option('--verbose', + '-v', + is_flag=True, + default=False, + help='Show the activated account for each cloud.') @usage_lib.entrypoint def check(clouds: Tuple[str], verbose: bool): """Check which clouds are available to use. @@ -3237,40 +2922,41 @@ def check(clouds: Tuple[str], verbose: bool): """ clouds_arg = clouds if len(clouds) > 0 else None sky_check.check(verbose=verbose, clouds=clouds_arg) - - -@cli.command() -@click.argument("accelerator_str", required=False) -@click.option( - "--all", - "-a", - is_flag=True, - default=False, - help="Show details of all GPU/TPU/accelerator offerings.", -) -@click.option("--cloud", default=None, type=str, help="Cloud provider to query.") + + +@cli.command() +@click.argument('accelerator_str', required=False) +@click.option('--all', + '-a', + is_flag=True, + default=False, + help='Show details of all GPU/TPU/accelerator offerings.') +@click.option('--cloud', + default=None, + type=str, + help='Cloud provider to query.') @click.option( - "--region", + '--region', required=False, type=str, - help=("The region to use. If not specified, shows accelerators from all regions."), + help= + ('The region to use. If not specified, shows accelerators from all regions.' + ), ) @click.option( - "--all-regions", + '--all-regions', is_flag=True, default=False, - help="Show pricing and instance details for a specified accelerator across " - "all regions and clouds.", -) + help='Show pricing and instance details for a specified accelerator across ' + 'all regions and clouds.') @service_catalog.fallback_to_default_catalog @usage_lib.entrypoint def show_gpus( - accelerator_str: Optional[str], - all: bool, # pylint: disable=redefined-builtin - cloud: Optional[str], - region: Optional[str], - all_regions: Optional[bool], -): + accelerator_str: Optional[str], + all: bool, # pylint: disable=redefined-builtin + cloud: Optional[str], + region: Optional[str], + all_regions: Optional[bool]): """Show supported GPU/TPU/accelerators and their prices. The names and counts shown can be set in the ``accelerators`` field in task @@ -3316,110 +3002,102 @@ def show_gpus( # validation for the --region flag if region is not None and cloud is None: raise click.UsageError( - "The --region flag is only valid when the --cloud flag is set." - ) + 'The --region flag is only valid when the --cloud flag is set.') # validation for the --all-regions flag if all_regions and accelerator_str is None: raise click.UsageError( - "The --all-regions flag is only valid when an accelerator " "is specified." - ) + 'The --all-regions flag is only valid when an accelerator ' + 'is specified.') if all_regions and region is not None: raise click.UsageError( - "--all-regions and --region flags cannot be used simultaneously." 
- ) + '--all-regions and --region flags cannot be used simultaneously.') # This will validate 'cloud' and raise if not found. cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud) service_catalog.validate_region_zone(region, None, clouds=cloud) show_all = all if show_all and accelerator_str is not None: - raise click.UsageError("--all is only allowed without a GPU name.") + raise click.UsageError('--all is only allowed without a GPU name.') # Kubernetes specific bools cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None kubernetes_is_enabled = sky_clouds.cloud_in_iterable( - sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds() - ) + sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) if cloud_is_kubernetes and region is not None: raise click.UsageError( - "The --region flag cannot be set with --cloud kubernetes." - ) + 'The --region flag cannot be set with --cloud kubernetes.') def _list_to_str(lst): - return ", ".join([str(e) for e in lst]) + return ', '.join([str(e) for e in lst]) def _get_kubernetes_realtime_gpu_table( - name_filter: Optional[str] = None, quantity_filter: Optional[int] = None - ): + name_filter: Optional[str] = None, + quantity_filter: Optional[int] = None): if quantity_filter: - qty_header = "QTY_FILTER" - free_header = "FILTERED_FREE_GPUS" + qty_header = 'QTY_FILTER' + free_header = 'FILTERED_FREE_GPUS' else: - qty_header = "QTY_PER_NODE" - free_header = "TOTAL_FREE_GPUS" + qty_header = 'QTY_PER_NODE' + free_header = 'TOTAL_FREE_GPUS' realtime_gpu_table = log_utils.create_table( - ["GPU", qty_header, "TOTAL_GPUS", free_header] - ) + ['GPU', qty_header, 'TOTAL_GPUS', free_header]) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, - clouds="kubernetes", + clouds='kubernetes', name_filter=name_filter, region_filter=region, quantity_filter=quantity_filter, - case_sensitive=False, - ) - assert set(counts.keys()) == set(capacity.keys()) == set(available.keys()), ( - f"Keys of counts ({list(counts.keys())}), " - f"capacity ({list(capacity.keys())}), " - f"and available ({list(available.keys())}) " - "must be same." - ) + case_sensitive=False) + assert (set(counts.keys()) == set(capacity.keys()) == set( + available.keys())), (f'Keys of counts ({list(counts.keys())}), ' + f'capacity ({list(capacity.keys())}), ' + f'and available ({list(available.keys())}) ' + 'must be same.') if len(counts) == 0: - err_msg = "No GPUs found in Kubernetes cluster. " - debug_msg = "To further debug, run: sky check " + err_msg = 'No GPUs found in Kubernetes cluster. ' + debug_msg = 'To further debug, run: sky check ' if name_filter is not None: - gpu_info_msg = f" {name_filter!r}" + gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: - gpu_info_msg += " with requested quantity" f" {quantity_filter}" - err_msg = ( - f"Resources{gpu_info_msg} not found " "in Kubernetes cluster. " - ) - debug_msg = ( - "To show available accelerators on kubernetes," - " run: sky show-gpus --cloud kubernetes " - ) - full_err_msg = err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg + gpu_info_msg += (' with requested quantity' + f' {quantity_filter}') + err_msg = (f'Resources{gpu_info_msg} not found ' + 'in Kubernetes cluster. 
') + debug_msg = ('To show available accelerators on kubernetes,' + ' run: sky show-gpus --cloud kubernetes ') + full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + + debug_msg) raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): - realtime_gpu_table.add_row( - [gpu, _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]] - ) + realtime_gpu_table.add_row([ + gpu, + _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] + ]) return realtime_gpu_table def _get_kubernetes_node_info_table(): node_table = log_utils.create_table( - ["NODE_NAME", "GPU_NAME", "TOTAL_GPUS", "FREE_GPUS"] - ) + ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS']) node_info_dict = kubernetes_utils.get_kubernetes_node_info() for node_name, node_info in node_info_dict.items(): - node_table.add_row( - [ - node_name, - node_info.gpu_type, - node_info.total["nvidia.com/gpu"], - node_info.free["nvidia.com/gpu"], - ] - ) + node_table.add_row([ + node_name, node_info.gpu_type, + node_info.total['nvidia.com/gpu'], + node_info.free['nvidia.com/gpu'] + ]) return node_table def _output(): - gpu_table = log_utils.create_table(["COMMON_GPU", "AVAILABLE_QUANTITIES"]) - tpu_table = log_utils.create_table(["GOOGLE_TPU", "AVAILABLE_QUANTITIES"]) - other_table = log_utils.create_table(["OTHER_GPU", "AVAILABLE_QUANTITIES"]) + gpu_table = log_utils.create_table( + ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) + tpu_table = log_utils.create_table( + ['GOOGLE_TPU', 'AVAILABLE_QUANTITIES']) + other_table = log_utils.create_table( + ['OTHER_GPU', 'AVAILABLE_QUANTITIES']) name, quantity = None, None @@ -3429,10 +3107,10 @@ def _output(): clouds_to_list = cloud if cloud is None: clouds_to_list = [ - c for c in service_catalog.ALL_CLOUDS if c != "kubernetes" + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes' ] - k8s_messages = "" + k8s_messages = '' if accelerator_str is None: # Collect k8s related messages in k8s_messages and print them at end print_section_titles = False @@ -3446,32 +3124,27 @@ def _output(): except ValueError as e: if not cloud_is_kubernetes: # Make it a note if cloud is not kubernetes - k8s_messages += "Note: " + k8s_messages += 'Note: ' k8s_messages += str(e) else: print_section_titles = True - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n" - ) + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') yield from k8s_realtime_table.get_string() k8s_node_table = _get_kubernetes_node_info_table() - yield "\n\n" - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Kubernetes per node GPU availability" - f"{colorama.Style.RESET_ALL}\n" - ) + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes per node GPU availability' + f'{colorama.Style.RESET_ALL}\n') yield from k8s_node_table.get_string() if kubernetes_autoscaling: - k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + k8s_messages += ( + '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: - yield ( - "Kubernetes is not enabled. To fix, run: " - "sky check kubernetes " - ) + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') yield k8s_messages return @@ -3479,7 +3152,7 @@ def _output(): # long and the user may not scroll to the end. 
         if show_all and k8s_messages:
             yield k8s_messages
-            yield "\n\n"
+            yield '\n\n'

         result = service_catalog.list_accelerator_counts(
             gpus_only=True,
@@ -3489,11 +3162,9 @@ def _output():

         if print_section_titles:
             # If section titles were printed above, print again here
-            yield "\n\n"
-            yield (
-                f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}"
-                f"Cloud GPUs{colorama.Style.RESET_ALL}\n"
-            )
+            yield '\n\n'
+            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                   f'Cloud GPUs{colorama.Style.RESET_ALL}\n')

         # "Common" GPUs
         for gpu in service_catalog.get_common_gpus():
@@ -3506,101 +3177,91 @@ def _output():
             if tpu in result:
                 tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
         if len(tpu_table.get_string()) > 0:
-            yield "\n\n"
+            yield '\n\n'
             yield from tpu_table.get_string()

         # Handle Other GPUs
         if show_all or cloud is not None:
-            yield "\n\n"
+            yield '\n\n'
             for gpu, qty in sorted(result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
-            yield "\n\n"
+            yield '\n\n'

         # Handle hints and messages
         if not show_all:
             if cloud is None:
-                yield (
-                    "\n\nHint: use -a/--all to see all accelerators "
-                    "(including non-common ones) and pricing."
-                )
+                yield ('\n\nHint: use -a/--all to see all accelerators '
+                       '(including non-common ones) and pricing.')

                 # Handle k8s messages if present
                 if k8s_messages:
-                    yield "\n"
+                    yield '\n'
                     yield k8s_messages
                 return
             else:
                 # Return if not showing all or a specific cloud was queried
-                yield ("Hint: use -a/--all to see all accelerators " "and pricing.")
+                yield ('Hint: use -a/--all to see all accelerators '
+                       'and pricing.')
                 return
     else:
         # Parse accelerator string
-        accelerator_split = accelerator_str.split(":")
+        accelerator_split = accelerator_str.split(':')
         if len(accelerator_split) > 2:
             raise click.UsageError(
-                f"Invalid accelerator string {accelerator_str}. "
-                "Expected format: <accelerator_name>[:<quantity>]."
-            )
+                f'Invalid accelerator string {accelerator_str}. '
+                'Expected format: <accelerator_name>[:<quantity>].')
         if len(accelerator_split) == 2:
             name = accelerator_split[0]
             # Check if quantity is valid
             try:
                 quantity = int(accelerator_split[1])
                 if quantity <= 0:
-                    raise ValueError("Quantity cannot be non-positive integer.")
+                    raise ValueError(
+                        'Quantity must be a positive integer.')
             except ValueError as invalid_quantity:
                 raise click.UsageError(
-                    f"Invalid accelerator quantity {accelerator_split[1]}. "
-                    "Expected a positive integer."
-                ) from invalid_quantity
+                    f'Invalid accelerator quantity {accelerator_split[1]}. '
+                    'Expected a positive integer.') from invalid_quantity
         else:
             name, quantity = accelerator_str, None

     print_section_titles = False
-    if (
-        kubernetes_is_enabled
-        and (cloud is None or cloud_is_kubernetes)
-        and not show_all
-    ):
+    if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and
+            not show_all):
         # Print section title if not showing all and instead a specific
         # accelerator is requested
         print_section_titles = True
-        yield (
-            f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}"
-            f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n"
-        )
+        yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+               f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
         # TODO(romilb): Show filtered per node GPU availability here as well
         try:
             k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                name_filter=name, quantity_filter=quantity
-            )
+                name_filter=name, quantity_filter=quantity)
             yield from k8s_realtime_table.get_string()
         except ValueError as e:
             # In the case of a specific accelerator, show the error message
             # immediately (e.g., "Resources H100 not found ...")
             yield str(e)
         if kubernetes_autoscaling:
-            k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE
+            k8s_messages += ('\n' +
                             kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
        yield k8s_messages

    if cloud_is_kubernetes:
        # Do not show clouds if --cloud kubernetes is specified
        if not kubernetes_is_enabled:
-            yield (
-                "Kubernetes is not enabled. To fix, run: " "sky check kubernetes "
-            )
+            yield ('Kubernetes is not enabled. To fix, run: '
+                   'sky check kubernetes ')
         return

     # For clouds other than Kubernetes, get the accelerator details
     # Case-insensitive
-    result = service_catalog.list_accelerators(
-        gpus_only=True,
-        name_filter=name,
-        quantity_filter=quantity,
-        region_filter=region,
-        clouds=clouds_to_list,
-        case_sensitive=False,
-        all_regions=all_regions,
-    )
+    result = service_catalog.list_accelerators(gpus_only=True,
+                                               name_filter=name,
+                                               quantity_filter=quantity,
+                                               region_filter=region,
+                                               clouds=clouds_to_list,
+                                               case_sensitive=False,
+                                               all_regions=all_regions)
     # Import here to save module load speed.
     # pylint: disable=import-outside-toplevel,line-too-long
     from sky.clouds.service_catalog import common
@@ -3613,79 +3274,73 @@ def _output():
     for i, (gpu, items) in enumerate(result.items()):
         df = pd.DataFrame([t._asdict() for t in items])
         # Determine the minimum prices for each cloud.
-        min_price_df = df.groupby("cloud").agg(
-            min_price=("price", "min"), min_spot_price=("spot_price", "min")
-        )
-        df = df.merge(min_price_df, on="cloud")
+        min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
+                                               min_spot_price=('spot_price',
+                                                               'min'))
+        df = df.merge(min_price_df, on='cloud')
         # Sort within each cloud by price.
-        df = df.groupby("cloud", group_keys=False).apply(
-            lambda x: x.sort_values(by=["price", "spot_price"])
-        )
+        df = df.groupby('cloud', group_keys=False).apply(
+            lambda x: x.sort_values(by=['price', 'spot_price']))
         # Sort across groups (clouds).
- df = df.sort_values(by=["min_price", "min_spot_price"]) - df = df.drop(columns=["min_price", "min_spot_price"]) + df = df.sort_values(by=['min_price', 'min_spot_price']) + df = df.drop(columns=['min_price', 'min_spot_price']) sorted_dataclasses = [ - common.InstanceTypeInfo(*row) for row in df.to_records(index=False) + common.InstanceTypeInfo(*row) + for row in df.to_records(index=False) ] new_result[gpu] = sorted_dataclasses result = new_result if print_section_titles and not show_all: - yield "\n\n" - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Cloud GPUs{colorama.Style.RESET_ALL}\n" - ) + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') if len(result) == 0: - quantity_str = f" with requested quantity {quantity}" if quantity else "" - cloud_str = f" on {cloud_obj}." if cloud else " in cloud catalogs." - yield f"Resources '{name}'{quantity_str} not found{cloud_str} " - yield "To show available accelerators, run: sky show-gpus --all" + quantity_str = (f' with requested quantity {quantity}' + if quantity else '') + cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' + yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' + yield 'To show available accelerators, run: sky show-gpus --all' return for i, (gpu, items) in enumerate(result.items()): accelerator_table_headers = [ - "GPU", - "QTY", - "CLOUD", - "INSTANCE_TYPE", - "DEVICE_MEM", - "vCPUs", - "HOST_MEM", - "HOURLY_PRICE", - "HOURLY_SPOT_PRICE", + 'GPU', + 'QTY', + 'CLOUD', + 'INSTANCE_TYPE', + 'DEVICE_MEM', + 'vCPUs', + 'HOST_MEM', + 'HOURLY_PRICE', + 'HOURLY_SPOT_PRICE', ] if not show_all: - accelerator_table_headers.append("REGION") - accelerator_table = log_utils.create_table(accelerator_table_headers) + accelerator_table_headers.append('REGION') + accelerator_table = log_utils.create_table( + accelerator_table_headers) for item in items: - instance_type_str = ( - item.instance_type - if not pd.isna(item.instance_type) - else "(attachable)" - ) + instance_type_str = item.instance_type if not pd.isna( + item.instance_type) else '(attachable)' cpu_count = item.cpu_count - if not pd.isna(cpu_count) and isinstance(cpu_count, (float, int)): + if not pd.isna(cpu_count) and isinstance( + cpu_count, (float, int)): if int(cpu_count) == cpu_count: cpu_str = str(int(cpu_count)) else: - cpu_str = f"{cpu_count:.1f}" + cpu_str = f'{cpu_count:.1f}' else: - cpu_str = "-" - device_memory_str = ( - f"{item.device_memory:.0f}GB" - if not pd.isna(item.device_memory) - else "-" - ) - host_memory_str = ( - f"{item.memory:.0f}GB" if not pd.isna(item.memory) else "-" - ) - price_str = f"$ {item.price:.3f}" if not pd.isna(item.price) else "-" - spot_price_str = ( - f"$ {item.spot_price:.3f}" if not pd.isna(item.spot_price) else "-" - ) - region_str = item.region if not pd.isna(item.region) else "-" + cpu_str = '-' + device_memory_str = (f'{item.device_memory:.0f}GB' if + not pd.isna(item.device_memory) else '-') + host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( + item.memory) else '-' + price_str = f'$ {item.price:.3f}' if not pd.isna( + item.price) else '-' + spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( + item.spot_price) else '-' + region_str = item.region if not pd.isna(item.region) else '-' accelerator_table_vals = [ item.accelerator_name, item.accelerator_count, @@ -3702,7 +3357,7 @@ def _output(): accelerator_table.add_row(accelerator_table_vals) if i != 0: - yield "\n\n" + yield '\n\n' yield from 
accelerator_table.get_string() if show_all: @@ -3719,15 +3374,13 @@ def storage(): pass -@storage.command("ls", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) +@storage.command('ls', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def storage_ls(all: bool): @@ -3737,34 +3390,26 @@ def storage_ls(all: bool): click.echo(storage_table) -@storage.command("delete", cls=_DocumentedCodeCommand) -@click.argument( - "names", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_storage_name), -) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Delete all storage objects.", -) -@click.option( - "--yes", - "-y", - default=False, - is_flag=True, - required=False, - help="Skip confirmation prompt.", -) +@storage.command('delete', cls=_DocumentedCodeCommand) +@click.argument('names', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_storage_name)) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Delete all storage objects.') +@click.option('--yes', + '-y', + default=False, + is_flag=True, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint -def storage_delete( - names: List[str], all: bool, yes: bool -): # pylint: disable=redefined-builtin +def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin """Delete storage objects. Examples: @@ -3781,25 +3426,25 @@ def storage_delete( sky storage delete -a """ if sum([len(names) > 0, all]) != 1: - raise click.UsageError("Either --all or a name must be specified.") + raise click.UsageError('Either --all or a name must be specified.') if all: storages = sky.storage_ls() if not storages: - click.echo("No storage(s) to delete.") + click.echo('No storage(s) to delete.') return - names = [s["name"] for s in storages] + names = [s['name'] for s in storages] else: names = _get_glob_storages(names) if names: if not yes: - storage_names = ", ".join(names) - storage_str = "storages" if len(names) > 1 else "storage" + storage_names = ', '.join(names) + storage_str = 'storages' if len(names) > 1 else 'storage' click.confirm( - f"Deleting {len(names)} {storage_str}: " f"{storage_names}. Proceed?", + f'Deleting {len(names)} {storage_str}: ' + f'{storage_names}. Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) subprocess_utils.run_in_parallel(sky.storage_delete, names) @@ -3816,62 +3461,49 @@ def jobs(): pass -@jobs.command("launch", cls=_DocumentedCodeCommand) -@click.argument( - "entrypoint", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@jobs.command('launch', cls=_DocumentedCodeCommand) +@click.argument('entrypoint', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) # TODO(zhwu): Add --dryrun option to test the launch command. 
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) +@click.option('--cluster', + '-c', + default=None, + type=str, + hidden=True, + help=('Alias for --name, the name of the spot job.')) +@click.option('--job-recovery', + default=None, + type=str, + help='Recovery strategy to use for managed jobs.') @click.option( - "--cluster", - "-c", - default=None, - type=str, - hidden=True, - help=("Alias for --name, the name of the spot job."), -) -@click.option( - "--job-recovery", - default=None, - type=str, - help="Recovery strategy to use for managed jobs.", -) -@click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." - ), -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) @click.option( - "--retry-until-up/--no-retry-until-up", - "-r/-no-r", + '--retry-until-up/--no-retry-until-up', + '-r/-no-r', default=None, is_flag=True, required=False, help=( - "(Default: True; this flag is deprecated and will be removed in a " - "future release.) Whether to retry provisioning infinitely until the " - "cluster is up, if unavailability errors are encountered. This " # pylint: disable=bad-docstring-quotes - "applies to launching all managed jobs (both the initial and " - "any recovery attempts), not the jobs controller." - ), -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + '(Default: True; this flag is deprecated and will be removed in a ' + 'future release.) Whether to retry provisioning infinitely until the ' + 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes + 'applies to launching all managed jobs (both the initial and ' + 'any recovery attempts), not the jobs controller.')) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3915,10 +3547,8 @@ def jobs_launch( """ if cluster is not None: if name is not None and name != cluster: - raise click.UsageError( - "Cannot specify both --name and --cluster. " - "Use one of the flags as they are alias." - ) + raise click.UsageError('Cannot specify both --name and --cluster. ' + 'Use one of the flags as they are alias.') name = cluster env = _merge_env_vars(env_file, env) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -3944,15 +3574,14 @@ def jobs_launch( # Deprecation. We set the default behavior to be retry until up, and the # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0. if retry_until_up is not None: - flag_str = "--retry-until-up" + flag_str = '--retry-until-up' if not retry_until_up: - flag_str = "--no-retry-until-up" + flag_str = '--no-retry-until-up' click.secho( - f"Flag {flag_str} is deprecated and will be removed in a " - "future release (managed jobs will always be retried). " - "Please file an issue if this does not work for you.", - fg="yellow", - ) + f'Flag {flag_str} is deprecated and will be removed in a ' + 'future release (managed jobs will always be retried). 
' + 'Please file an issue if this does not work for you.', + fg='yellow') else: retry_until_up = True @@ -3970,46 +3599,44 @@ def jobs_launch( dag_utils.maybe_infer_and_fill_dag_and_task_names(dag) dag_utils.fill_default_config_in_dag_for_job_launch(dag) - click.secho( - f"Managed job {dag.name!r} will be launched on (estimated):", fg="yellow" - ) + click.secho(f'Managed job {dag.name!r} will be launched on (estimated):', + fg='yellow') dag = sky.optimize(dag) if not yes: - prompt = f"Launching a managed job {dag.name!r}. Proceed?" + prompt = f'Launching a managed job {dag.name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) common_utils.check_cluster_name_is_valid(name) - managed_jobs.launch(dag, name, detach_run=detach_run, retry_until_up=retry_until_up) + managed_jobs.launch(dag, + name, + detach_run=detach_run, + retry_until_up=retry_until_up) -@jobs.command("queue", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) -@click.option( - "--refresh", - "-r", - default=False, - is_flag=True, - required=False, - help="Query the latest statuses, restarting the jobs controller if stopped.", -) +@jobs.command('queue', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @click.option( - "--skip-finished", - "-s", + '--refresh', + '-r', default=False, is_flag=True, required=False, - help="Show only pending/running jobs' information.", + help='Query the latest statuses, restarting the jobs controller if stopped.' ) +@click.option('--skip-finished', + '-s', + default=False, + is_flag=True, + required=False, + help='Show only pending/running jobs\' information.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_queue(all: bool, refresh: bool, skip_finished: bool): @@ -4066,46 +3693,40 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool): watch -n60 sky jobs queue """ - click.secho("Fetching managed job statuses...", fg="yellow") - with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): - _, msg = _get_managed_jobs( - refresh=refresh, - skip_finished=skip_finished, - show_all=all, - is_called_by_user=True, - ) + click.secho('Fetching managed job statuses...', fg='yellow') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): + _, msg = _get_managed_jobs(refresh=refresh, + skip_finished=skip_finished, + show_all=all, + is_called_by_user=True) if not skip_finished: - in_progress_only_hint = "" + in_progress_only_hint = '' else: - in_progress_only_hint = " (showing in-progress jobs only)" - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Managed jobs{colorama.Style.RESET_ALL}" - f"{in_progress_only_hint}\n{msg}" - ) - - -@jobs.command("cancel", cls=_DocumentedCodeCommand) -@click.option( - "--name", "-n", required=False, type=str, help="Managed job name to cancel." 
-) -@click.argument("job_ids", default=None, type=int, required=False, nargs=-1) -@click.option( - "--all", - "-a", - is_flag=True, - default=False, - required=False, - help="Cancel all managed jobs.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + in_progress_only_hint = ' (showing in-progress jobs only)' + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Managed jobs{colorama.Style.RESET_ALL}' + f'{in_progress_only_hint}\n{msg}') + + +@jobs.command('cancel', cls=_DocumentedCodeCommand) +@click.option('--name', + '-n', + required=False, + type=str, + help='Managed job name to cancel.') +@click.argument('job_ids', default=None, type=int, required=False, nargs=-1) +@click.option('--all', + '-a', + is_flag=True, + default=False, + required=False, + help='Cancel all managed jobs.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): @@ -4126,83 +3747,73 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): """ backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, - stopped_message="All managed jobs should have finished.", - exit_if_not_accessible=True, - ) + stopped_message='All managed jobs should have finished.', + exit_if_not_accessible=True) - job_id_str = ",".join(map(str, job_ids)) + job_id_str = ','.join(map(str, job_ids)) if sum([len(job_ids) > 0, name is not None, all]) != 1: - argument_str = f"--job-ids {job_id_str}" if len(job_ids) > 0 else "" - argument_str += f" --name {name}" if name is not None else "" - argument_str += " --all" if all else "" + argument_str = f'--job-ids {job_id_str}' if len(job_ids) > 0 else '' + argument_str += f' --name {name}' if name is not None else '' + argument_str += ' --all' if all else '' raise click.UsageError( - "Can only specify one of JOB_IDS or --name or --all. " - f"Provided {argument_str!r}." - ) + 'Can only specify one of JOB_IDS or --name or --all. ' + f'Provided {argument_str!r}.') if not yes: - job_identity_str = ( - f"managed jobs with IDs {job_id_str}" if job_ids else repr(name) - ) + job_identity_str = (f'managed jobs with IDs {job_id_str}' + if job_ids else repr(name)) if all: - job_identity_str = "all managed jobs" - click.confirm( - f"Cancelling {job_identity_str}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + job_identity_str = 'all managed jobs' + click.confirm(f'Cancelling {job_identity_str}. Proceed?', + default=True, + abort=True, + show_default=True) managed_jobs.cancel(job_ids=job_ids, name=name, all=all) -@jobs.command("logs", cls=_DocumentedCodeCommand) -@click.option("--name", "-n", required=False, type=str, help="Managed job name.") +@jobs.command('logs', cls=_DocumentedCodeCommand) +@click.option('--name', + '-n', + required=False, + type=str, + help='Managed job name.') @click.option( - "--follow/--no-follow", + '--follow/--no-follow', is_flag=True, default=True, - help=( - "Follow the logs of the job. [default: --follow] " - "If --no-follow is specified, print the log so far and exit." - ), -) + help=('Follow the logs of the job. 
[default: --follow] ' + 'If --no-follow is specified, print the log so far and exit.')) @click.option( - "--controller", + '--controller', is_flag=True, default=False, - help=( - "Show the controller logs of this job; useful for debugging " - "launching/recoveries, etc." - ), -) -@click.argument("job_id", required=False, type=int) + help=('Show the controller logs of this job; useful for debugging ' + 'launching/recoveries, etc.')) +@click.argument('job_id', required=False, type=int) @usage_lib.entrypoint -def jobs_logs( - name: Optional[str], job_id: Optional[int], follow: bool, controller: bool -): +def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool, + controller: bool): """Tail the log of a managed job.""" try: - managed_jobs.tail_logs( - name=name, job_id=job_id, follow=follow, controller=controller - ) + managed_jobs.tail_logs(name=name, + job_id=job_id, + follow=follow, + controller=controller) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise -@jobs.command("dashboard", cls=_DocumentedCodeCommand) +@jobs.command('dashboard', cls=_DocumentedCodeCommand) @click.option( - "--port", - "-p", + '--port', + '-p', default=None, type=int, required=False, - help=( - "Local port to use for the dashboard. If None, a free port is " - "automatically chosen." - ), -) + help=('Local port to use for the dashboard. If None, a free port is ' + 'automatically chosen.')) @usage_lib.entrypoint def jobs_dashboard(port: Optional[int]): """Opens a dashboard for managed jobs (needs controller to be UP).""" @@ -4211,17 +3822,14 @@ def jobs_dashboard(port: Optional[int]): # see if the controller is UP first, which is slow; (2) not have to run SSH # port forwarding first (we'd just launch a local dashboard which would make # REST API calls to the controller dashboard server). - click.secho("Checking if jobs controller is up...", fg="yellow") - hint = ( - "Dashboard is not available if jobs controller is not up. Run a " - "managed job first." - ) + click.secho('Checking if jobs controller is up...', fg='yellow') + hint = ('Dashboard is not available if jobs controller is not up. Run a ' + 'managed job first.') backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, stopped_message=hint, non_existent_message=hint, - exit_if_not_accessible=True, - ) + exit_if_not_accessible=True) # SSH forward a free local port to remote's dashboard port. remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT @@ -4230,20 +3838,18 @@ def jobs_dashboard(port: Optional[int]): else: free_port = port ssh_command = ( - f"ssh -qNL {free_port}:localhost:{remote_port} " - f"{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}" - ) - click.echo("Forwarding port: ", nl=False) - click.secho(f"{ssh_command}", dim=True) + f'ssh -qNL {free_port}:localhost:{remote_port} ' + f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}') + click.echo('Forwarding port: ', nl=False) + click.secho(f'{ssh_command}', dim=True) - with subprocess.Popen( - ssh_command, shell=True, start_new_session=True - ) as ssh_process: + with subprocess.Popen(ssh_command, shell=True, + start_new_session=True) as ssh_process: time.sleep(3) # Added delay for ssh_command to initialize. 
- webbrowser.open(f"http://localhost:{free_port}") + webbrowser.open(f'http://localhost:{free_port}') click.secho( - f"Dashboard is now available at: http://127.0.0.1:{free_port}", fg="green" - ) + f'Dashboard is now available at: http://127.0.0.1:{free_port}', + fg='green') try: ssh_process.wait() except KeyboardInterrupt: @@ -4255,7 +3861,7 @@ def jobs_dashboard(port: Optional[int]): # This happens if jobs controller is auto-stopped. pass finally: - click.echo("Exiting.") + click.echo('Exiting.') # TODO(zhwu): Backward compatibility for the old `sky spot launch` command. @@ -4267,9 +3873,10 @@ def spot(): pass -_add_command_alias( - jobs, jobs_launch, new_group=spot, override_command_argument={"use_spot": True} -) +_add_command_alias(jobs, + jobs_launch, + new_group=spot, + override_command_argument={'use_spot': True}) _add_command_alias(jobs, jobs_queue, new_group=spot) _add_command_alias(jobs, jobs_logs, new_group=spot) _add_command_alias(jobs, jobs_cancel, new_group=spot) @@ -4304,9 +3911,9 @@ def _generate_task_with_service( not_supported_cmd: str, ) -> sky.Task: """Generate a task with service section from a service YAML file.""" - is_yaml, _ = _check_yaml("".join(service_yaml_args)) + is_yaml, _ = _check_yaml(''.join(service_yaml_args)) if not is_yaml: - raise click.UsageError("SERVICE_YAML must be a valid YAML file.") + raise click.UsageError('SERVICE_YAML must be a valid YAML file.') env = _merge_env_vars(env_file, env) # We keep nargs=-1 in service_yaml argument to reuse this function. task = _make_task_or_dag_from_entrypoint_with_overrides( @@ -4328,36 +3935,31 @@ def _generate_task_with_service( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - entrypoint_name="Service", + entrypoint_name='Service', ) if isinstance(task, sky.Dag): raise click.UsageError( - _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd) - ) + _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd)) if task.service is None: with ux_utils.print_exception_no_traceback(): - raise ValueError( - "Service section not found in the YAML file. " - "To fix, add a valid `service` field." - ) + raise ValueError('Service section not found in the YAML file. ' + 'To fix, add a valid `service` field.') service_port: Optional[int] = None for requested_resources in list(task.resources): - if requested_resources.ports is None or len(requested_resources.ports) != 1: + if requested_resources.ports is None or len( + requested_resources.ports) != 1: with ux_utils.print_exception_no_traceback(): raise ValueError( - "Must only specify one port in resources. Each replica " - "will use the port specified as application ingress port." - ) + 'Must only specify one port in resources. Each replica ' + 'will use the port specified as application ingress port.') service_port_str = requested_resources.ports[0] if not service_port_str.isdigit(): # For the case when the user specified a port range like 10000-10010 with ux_utils.print_exception_no_traceback(): - raise ValueError( - f"Port {service_port_str!r} is not a valid " - "port number. Please specify a single port " - f"instead. Got: {service_port_str!r}" - ) + raise ValueError(f'Port {service_port_str!r} is not a valid ' + 'port number. Please specify a single port ' + f'instead. Got: {service_port_str!r}') # We request all the replicas using the same port for now, but it # should be fine to allow different replicas to use different ports # in the future. 
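The hunk below continues `_generate_task_with_service`, whose port check enforces one rule: every entry under `resources` must declare exactly one numeric port, and all entries must agree on it (port ranges such as `10000-10010` are rejected). A minimal standalone sketch of that rule, assuming a hypothetical `check_single_port` helper that is not part of this patch:

    from typing import List, Optional

    def check_single_port(
            ports_per_resource: List[Optional[List[str]]]) -> Optional[int]:
        """Each resource must expose exactly one numeric port; all must match."""
        service_port: Optional[int] = None
        for ports in ports_per_resource:
            if ports is None or len(ports) != 1:
                raise ValueError('Must only specify one port in resources.')
            if not ports[0].isdigit():
                # Port ranges such as '10000-10010' are rejected here.
                raise ValueError(f'Port {ports[0]!r} is not a valid port number.')
            port = int(ports[0])
            if service_port is None:
                service_port = port
            elif service_port != port:
                raise ValueError(f'Got multiple ports: {service_port} and {port}.')
        return service_port

    # check_single_port([['8080'], ['8080']]) == 8080
    # check_single_port([['8080-8090']]) raises ValueError

The real code additionally wraps these errors with ux_utils for traceback-free printing; the sketch only mirrors the accept/reject logic.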
@@ -4366,39 +3968,31 @@ def _generate_task_with_service( service_port = resource_port if service_port != resource_port: with ux_utils.print_exception_no_traceback(): - raise ValueError( - f"Got multiple ports: {service_port} and " - f"{resource_port} in different resources. " - "Please specify single port instead." - ) + raise ValueError(f'Got multiple ports: {service_port} and ' + f'{resource_port} in different resources. ' + 'Please specify single port instead.') return task -@serve.command("up", cls=_DocumentedCodeCommand) -@click.argument( - "service_yaml", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) -@click.option( - "--service-name", - "-n", - default=None, - type=str, - help="A service name. Unique for each service. If not provided, " - "a unique name is autogenerated.", -) +@serve.command('up', cls=_DocumentedCodeCommand) +@click.argument('service_yaml', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--service-name', + '-n', + default=None, + type=str, + help='A service name. Unique for each service. If not provided, ' + 'a unique name is autogenerated.') @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def serve_up( @@ -4472,18 +4066,19 @@ def serve_up( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd="sky serve up", + not_supported_cmd='sky serve up', ) - click.secho("Service Spec:", fg="cyan") + click.secho('Service Spec:', fg='cyan') click.echo(task.service) - click.secho("Each replica will use the following resources (estimated):", fg="cyan") + click.secho('Each replica will use the following resources (estimated):', + fg='cyan') with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - prompt = f"Launching a new service {service_name!r}. Proceed?" + prompt = f'Launching a new service {service_name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) @@ -4493,35 +4088,28 @@ def serve_up( # TODO(MaoZiming): Update Doc. # TODO(MaoZiming): Expose mix replica traffic option to user. # Currently, we do not mix traffic from old and new replicas. -@serve.command("update", cls=_DocumentedCodeCommand) -@click.argument("service_name", required=True, type=str) -@click.argument( - "service_yaml", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@serve.command('update', cls=_DocumentedCodeCommand) +@click.argument('service_name', required=True, type=str) +@click.argument('service_yaml', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option( - "--mode", - default=serve_lib.DEFAULT_UPDATE_MODE.value, - type=click.Choice([m.value for m in serve_lib.UpdateMode], case_sensitive=False), - required=False, - help=( - 'Update mode. If "rolling", SkyServe will update the ' - 'service with rolling update. If "blue_green", SkyServe ' - "will update the service with blue-green update. 
" - ), -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.option('--mode', + default=serve_lib.DEFAULT_UPDATE_MODE.value, + type=click.Choice([m.value for m in serve_lib.UpdateMode], + case_sensitive=False), + required=False, + help=('Update mode. If "rolling", SkyServe will update the ' + 'service with rolling update. If "blue_green", SkyServe ' + 'will update the service with blue-green update. ')) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def serve_update( @@ -4594,44 +4182,39 @@ def serve_update( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd="sky serve update", + not_supported_cmd='sky serve update', ) - click.secho("Service Spec:", fg="cyan") + click.secho('Service Spec:', fg='cyan') click.echo(task.service) - click.secho("New replica will use the following resources (estimated):", fg="cyan") + click.secho('New replica will use the following resources (estimated):', + fg='cyan') with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - click.confirm( - f"Updating service {service_name!r}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + click.confirm(f'Updating service {service_name!r}. Proceed?', + default=True, + abort=True, + show_default=True) serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode)) -@serve.command("status", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) -@click.option( - "--endpoint", - default=False, - is_flag=True, - required=False, - help="Show service endpoint.", -) -@click.argument("service_names", required=False, type=str, nargs=-1) +@serve.command('status', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') +@click.option('--endpoint', + default=False, + is_flag=True, + required=False, + help='Show service endpoint.') +@click.argument('service_names', required=False, type=str, nargs=-1) @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, endpoint: bool, service_names: List[str]): @@ -4727,39 +4310,36 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): sky serve status my-service """ # This won't pollute the output of --endpoint. - with rich_utils.safe_status("[cyan]Checking services[/]"): - _, msg = _get_services( - service_names, show_all=all, show_endpoint=endpoint, is_called_by_user=True - ) + with rich_utils.safe_status('[cyan]Checking services[/]'): + _, msg = _get_services(service_names, + show_all=all, + show_endpoint=endpoint, + is_called_by_user=True) if not endpoint: - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Services{colorama.Style.RESET_ALL}" - ) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') click.echo(msg) -@serve.command("down", cls=_DocumentedCodeCommand) -@click.argument("service_names", required=False, type=str, nargs=-1) -@click.option( - "--all", "-a", default=False, is_flag=True, help="Tear down all services." 
-)
-@click.option(
-    "--purge",
-    "-p",
-    default=False,
-    is_flag=True,
-    help="Tear down services in failed status.",
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+@serve.command('down', cls=_DocumentedCodeCommand)
+@click.argument('service_names', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              default=False,
+              is_flag=True,
+              help='Tear down all services.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              help='Tear down services in failed status.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 # pylint: disable=redefined-builtin
 def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
     """Teardown service(s).
@@ -4790,62 +4370,50 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
      sky serve down failed-service --purge
    """
    if sum([len(service_names) > 0, all]) != 1:
-        argument_str = (
-            f'SERVICE_NAMES={",".join(service_names)}' if len(service_names) > 0 else ""
-        )
-        argument_str += " --all" if all else ""
+        argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len(
+            service_names) > 0 else ''
+        argument_str += ' --all' if all else ''
         raise click.UsageError(
-            "Can only specify one of SERVICE_NAMES or --all. "
-            f"Provided {argument_str!r}."
-        )
+            'Can only specify one of SERVICE_NAMES or --all. '
+            f'Provided {argument_str!r}.')
 
     backend_utils.is_controller_accessible(
         controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        stopped_message="All services should have been terminated.",
-        exit_if_not_accessible=True,
-    )
+        stopped_message='All services should have been terminated.',
+        exit_if_not_accessible=True)
 
     if not yes:
-        quoted_service_names = [f"{name!r}" for name in service_names]
+        quoted_service_names = [f'{name!r}' for name in service_names]
         service_identity_str = f'service(s) {", ".join(quoted_service_names)}'
         if all:
-            service_identity_str = "all services"
-        click.confirm(
-            f"Terminating {service_identity_str}. Proceed?",
-            default=True,
-            abort=True,
-            show_default=True,
-        )
+            service_identity_str = 'all services'
+        click.confirm(f'Terminating {service_identity_str}. Proceed?',
+                      default=True,
+                      abort=True,
+                      show_default=True)
 
     serve_lib.down(service_names=service_names, all=all, purge=purge)
 
 
-@serve.command("logs", cls=_DocumentedCodeCommand)
+@serve.command('logs', cls=_DocumentedCodeCommand)
 @click.option(
-    "--follow/--no-follow",
+    '--follow/--no-follow',
     is_flag=True,
     default=True,
-    help=(
-        "Follow the logs of the job. [default: --follow] "
-        "If --no-follow is specified, print the log so far and exit."
-    ),
-)
-@click.option(
-    "--controller",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Show the controller logs of this service.",
-)
-@click.option(
-    "--load-balancer",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Show the load balancer logs of this service.",
-)
-@click.argument("service_name", required=True, type=str)
-@click.argument("replica_id", required=False, type=int)
+    help=('Follow the logs of the job. [default: --follow] '
+          'If --no-follow is specified, print the log so far and exit.'))
+@click.option('--controller',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Show the controller logs of this service.')
+@click.option('--load-balancer',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Show the load balancer logs of this service.')
+@click.argument('service_name', required=True, type=str)
+@click.argument('replica_id', required=False, type=int)
 @usage_lib.entrypoint
 # TODO(tian): Add default argument for this CLI if none of the flags are
 # specified.
@@ -4872,16 +4440,13 @@ def serve_logs(
      sky serve logs [SERVICE_NAME] 1
    """
    have_replica_id = replica_id is not None
-    num_flags = controller + load_balancer + have_replica_id
+    num_flags = (controller + load_balancer + have_replica_id)
    if num_flags > 1:
-        raise click.UsageError(
-            "At most one of --controller, --load-balancer, "
-            "[REPLICA_ID] can be specified."
-        )
+        raise click.UsageError('At most one of --controller, --load-balancer, '
+                               '[REPLICA_ID] can be specified.')
    if num_flags == 0:
-        raise click.UsageError(
-            "One of --controller, --load-balancer, " "[REPLICA_ID] must be specified."
-        )
+        raise click.UsageError('One of --controller, --load-balancer, '
+                               '[REPLICA_ID] must be specified.')
    if controller:
        target_component = serve_lib.ServiceComponent.CONTROLLER
    elif load_balancer:
@@ -4891,9 +4456,10 @@ def serve_logs(
        assert replica_id is not None
        target_component = serve_lib.ServiceComponent.REPLICA
    try:
-        serve_lib.tail_logs(
-            service_name, target=target_component, replica_id=replica_id, follow=follow
-        )
+        serve_lib.tail_logs(service_name,
+                            target=target_component,
+                            replica_id=replica_id,
+                            follow=follow)
    except exceptions.ClusterNotUpError:
        with ux_utils.print_exception_no_traceback():
            raise
@@ -4921,84 +4487,71 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
    """
    config = common_utils.read_yaml(os.path.expanduser(yaml_path))
    if not isinstance(config, dict):
-        raise ValueError(
-            f"Invalid YAML file: {yaml_path}. "
-            "The YAML file should be parsed into a dictionary."
-        )
-    if config.get("resources") is None:
+        raise ValueError(f'Invalid YAML file: {yaml_path}. '
+                         'The YAML file should be parsed into a dictionary.')
+    if config.get('resources') is None:
        return None
 
-    resources = config["resources"]
+    resources = config['resources']
    if not isinstance(resources, dict):
-        raise ValueError(
-            f"Invalid resources configuration in {yaml_path}. "
-            "Resources must be a dictionary."
-        )
-    if resources.get("candidates") is None:
+        raise ValueError(f'Invalid resources configuration in {yaml_path}. '
+                         'Resources must be a dictionary.')
+    if resources.get('candidates') is None:
        return None
 
-    candidates = resources["candidates"]
+    candidates = resources['candidates']
    if not isinstance(candidates, list):
-        raise ValueError("Resource candidates must be a list of dictionaries.")
+        raise ValueError('Resource candidates must be a list of dictionaries.')
    for candidate in candidates:
        if not isinstance(candidate, dict):
-            raise ValueError("Each resource candidate must be a dictionary.")
+            raise ValueError('Each resource candidate must be a dictionary.')
    return candidates
 
 
-@bench.command("launch", cls=_DocumentedCodeCommand)
-@click.argument(
-    "entrypoint",
-    required=True,
-    type=str,
-    nargs=-1,
-    **_get_shell_complete_args(_complete_file_name),
-)
-@click.option("--benchmark", "-b", required=True, type=str, help="Benchmark name.")
+@bench.command('launch', cls=_DocumentedCodeCommand)
+@click.argument('entrypoint',
+                required=True,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_file_name))
+@click.option('--benchmark',
+              '-b',
+              required=True,
+              type=str,
+              help='Benchmark name.')
 @_add_click_options(_TASK_OPTIONS_WITH_NAME)
+@click.option('--gpus',
+              required=False,
+              type=str,
+              help=('Comma-separated list of GPUs to run benchmark on. '
+                    'Example values: "T4:4,V100:8" (without blank spaces).'))
 @click.option(
-    "--gpus",
-    required=False,
-    type=str,
-    help=(
-        "Comma-separated list of GPUs to run benchmark on. "
-        'Example values: "T4:4,V100:8" (without blank spaces).'
-    ),
-)
-@click.option(
-    "--ports",
+    '--ports',
    required=False,
    type=str,
    multiple=True,
-    help=(
-        "Ports to open on the cluster. "
-        'If specified, overrides the "ports" config in the YAML. '
-    ),
+    help=('Ports to open on the cluster. '
+          'If specified, overrides the "ports" config in the YAML. '),
 )
 @click.option(
-    "--idle-minutes-to-autostop",
-    "-i",
+    '--idle-minutes-to-autostop',
+    '-i',
    default=None,
    type=int,
    required=False,
-    help=(
-        "Automatically stop the cluster after this many minutes "
-        "of idleness after setup/file_mounts. This is equivalent to "
-        "running `sky launch -d ...` and then `sky autostop -i <minutes>`. "
-        "If not set, the cluster will not be autostopped."
-    ),
-)
+    help=('Automatically stop the cluster after this many minutes '
+          'of idleness after setup/file_mounts. This is equivalent to '
+          'running `sky launch -d ...` and then `sky autostop -i <minutes>`. '
+          'If not set, the cluster will not be autostopped.'))
 # Disabling quote check here, as there seems to be a bug in pylint,
 # which incorrectly recognizes the help string as a docstring.
 # pylint: disable=bad-docstring-quotes
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 @usage_lib.entrypoint
 def benchmark_launch(
    entrypoint: str,
@@ -5032,71 +4585,66 @@ def benchmark_launch(
    env = _merge_env_vars(env_file, env)
    record = benchmark_state.get_benchmark_from_name(benchmark)
    if record is not None:
-        raise click.BadParameter(
-            f"Benchmark {benchmark} already exists. "
-            "To delete the previous benchmark result, "
-            f"run `sky bench delete {benchmark}`."
-        )
+        raise click.BadParameter(f'Benchmark {benchmark} already exists. '
+                                 'To delete the previous benchmark result, '
+                                 f'run `sky bench delete {benchmark}`.')
 
-    entrypoint = " ".join(entrypoint)
+    entrypoint = ' '.join(entrypoint)
    if not entrypoint:
-        raise click.BadParameter("Please specify a task yaml to benchmark.")
+        raise click.BadParameter('Please specify a task yaml to benchmark.')
 
    is_yaml, config = _check_yaml(entrypoint)
    if not is_yaml:
        raise click.BadParameter(
-            "Sky Benchmark does not support command line tasks. "
-            "Please provide a YAML file."
-        )
+            'Sky Benchmark does not support command line tasks. '
+            'Please provide a YAML file.')
    assert config is not None, (is_yaml, config)
 
-    click.secho("Benchmarking a task from YAML spec: ", fg="yellow", nl=False)
+    click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False)
    click.secho(entrypoint, bold=True)
 
    candidates = _get_candidate_configs(entrypoint)
    # Check if the candidate configs are specified in both CLI and YAML.
    if candidates is not None:
-        message = (
-            "is specified in both CLI and resources.candidates "
-            "in the YAML. Please specify only one of them."
-        )
+        message = ('is specified in both CLI and resources.candidates '
+                   'in the YAML. Please specify only one of them.')
        if cloud is not None:
-            if any("cloud" in candidate for candidate in candidates):
-                raise click.BadParameter(f"cloud {message}")
+            if any('cloud' in candidate for candidate in candidates):
+                raise click.BadParameter(f'cloud {message}')
        if region is not None:
-            if any("region" in candidate for candidate in candidates):
-                raise click.BadParameter(f"region {message}")
+            if any('region' in candidate for candidate in candidates):
+                raise click.BadParameter(f'region {message}')
        if zone is not None:
-            if any("zone" in candidate for candidate in candidates):
-                raise click.BadParameter(f"zone {message}")
+            if any('zone' in candidate for candidate in candidates):
+                raise click.BadParameter(f'zone {message}')
        if gpus is not None:
-            if any("accelerators" in candidate for candidate in candidates):
-                raise click.BadParameter(f"gpus (accelerators) {message}")
+            if any('accelerators' in candidate for candidate in candidates):
+                raise click.BadParameter(f'gpus (accelerators) {message}')
        if use_spot is not None:
-            if any("use_spot" in candidate for candidate in candidates):
-                raise click.BadParameter(f"use_spot {message}")
+            if any('use_spot' in candidate for candidate in candidates):
+                raise click.BadParameter(f'use_spot {message}')
        if image_id is not None:
-            if any("image_id" in candidate for candidate in candidates):
-                raise click.BadParameter(f"image_id {message}")
+            if any('image_id' in candidate for candidate in candidates):
+                raise click.BadParameter(f'image_id {message}')
        if disk_size is not None:
-            if any("disk_size" in candidate for candidate in candidates):
-                raise click.BadParameter(f"disk_size {message}")
+            if any('disk_size' in candidate for candidate in candidates):
+                raise click.BadParameter(f'disk_size {message}')
        if disk_tier is not None:
-            if any("disk_tier" in candidate for candidate in candidates):
-                raise click.BadParameter(f"disk_tier {message}")
+            if any('disk_tier' in candidate for candidate in candidates):
+                raise click.BadParameter(f'disk_tier {message}')
        if ports:
-            if any("ports" in candidate for candidate in candidates):
-                raise click.BadParameter(f"ports {message}")
 
+            if any('ports' in candidate for candidate in candidates):
+                raise click.BadParameter(f'ports {message}')
 
    # The user can specify the benchmark candidates in either of the two ways:
    # 1. By specifying resources.candidates in the YAML.
    # 2. By specifying gpu types as a command line argument (--gpus).
    override_gpu = None
    if gpus is not None:
-        gpu_list = gpus.split(",")
+        gpu_list = gpus.split(',')
        gpu_list = [gpu.strip() for gpu in gpu_list]
-        if " " in gpus:
-            raise click.BadParameter("Remove blanks in --gpus.")
+        if ' ' in gpus:
+            raise click.BadParameter('Remove blanks in --gpus.')
        if len(gpu_list) == 1:
            override_gpu = gpu_list[0]
@@ -5104,73 +4652,66 @@ def benchmark_launch(
            # If len(gpu_list) > 1, gpus is interpreted
            # as a list of benchmark candidates.
            if candidates is None:
-                candidates = [{"accelerators": gpu} for gpu in gpu_list]
+                candidates = [{'accelerators': gpu} for gpu in gpu_list]
                override_gpu = None
            else:
-                raise ValueError(
-                    "Provide benchmark candidates in either "
-                    "--gpus or resources.candidates in the YAML."
-                )
+                raise ValueError('Provide benchmark candidates in either '
+                                 '--gpus or resources.candidates in the YAML.')
    if candidates is None:
        candidates = [{}]
 
-    if "resources" not in config:
-        config["resources"] = {}
-    resources_config = config["resources"]
+    if 'resources' not in config:
+        config['resources'] = {}
+    resources_config = config['resources']
 
    # Override the yaml config with the command line arguments.
    if name is not None:
-        config["name"] = name
+        config['name'] = name
    if workdir is not None:
-        config["workdir"] = workdir
+        config['workdir'] = workdir
    if num_nodes is not None:
-        config["num_nodes"] = num_nodes
-    override_params = _parse_override_params(
-        cloud=cloud,
-        region=region,
-        zone=zone,
-        gpus=override_gpu,
-        cpus=cpus,
-        memory=memory,
-        use_spot=use_spot,
-        image_id=image_id,
-        disk_size=disk_size,
-        disk_tier=disk_tier,
-        ports=ports,
-    )
+        config['num_nodes'] = num_nodes
+    override_params = _parse_override_params(cloud=cloud,
+                                             region=region,
+                                             zone=zone,
+                                             gpus=override_gpu,
+                                             cpus=cpus,
+                                             memory=memory,
+                                             use_spot=use_spot,
+                                             image_id=image_id,
+                                             disk_size=disk_size,
+                                             disk_tier=disk_tier,
+                                             ports=ports)
    _pop_and_ignore_fields_in_override_params(
-        override_params, field_to_ignore=["cpus", "memory"]
-    )
+        override_params, field_to_ignore=['cpus', 'memory'])
    resources_config.update(override_params)
-    if "cloud" in resources_config:
-        cloud = resources_config.pop("cloud")
+    if 'cloud' in resources_config:
+        cloud = resources_config.pop('cloud')
        if cloud is not None:
-            resources_config["cloud"] = str(cloud)
-    if "region" in resources_config:
-        if resources_config["region"] is None:
-            resources_config.pop("region")
-    if "zone" in resources_config:
-        if resources_config["zone"] is None:
-            resources_config.pop("zone")
-    if "accelerators" in resources_config:
-        if resources_config["accelerators"] is None:
-            resources_config.pop("accelerators")
-    if "image_id" in resources_config:
-        if resources_config["image_id"] is None:
-            resources_config.pop("image_id")
+            resources_config['cloud'] = str(cloud)
+    if 'region' in resources_config:
+        if resources_config['region'] is None:
+            resources_config.pop('region')
+    if 'zone' in resources_config:
+        if resources_config['zone'] is None:
+            resources_config.pop('zone')
+    if 'accelerators' in resources_config:
+        if resources_config['accelerators'] is None:
+            resources_config.pop('accelerators')
+    if 'image_id' in resources_config:
+        if resources_config['image_id'] is None:
+            resources_config.pop('image_id')
 
    # Fully generate the benchmark candidate configs.
    clusters, candidate_configs = benchmark_utils.generate_benchmark_configs(
-        benchmark, config, candidates
-    )
+        benchmark, config, candidates)
    # Show the benchmarking VM instances selected by the optimizer.
    # This also detects the case where the user requested infeasible resources.
-    benchmark_utils.print_benchmark_clusters(
-        benchmark, clusters, config, candidate_configs
-    )
+    benchmark_utils.print_benchmark_clusters(benchmark, clusters, config,
+                                             candidate_configs)
 
    if not yes:
-        plural = "s" if len(candidates) > 1 else ""
-        prompt = f"Launching {len(candidates)} cluster{plural}. Proceed?"
+        plural = 's' if len(candidates) > 1 else ''
+        prompt = f'Launching {len(candidates)} cluster{plural}. Proceed?'
        click.confirm(prompt, default=True, abort=True, show_default=True)
 
    # Configs that are only accepted by the CLI.
@@ -5179,96 +4720,96 @@ def benchmark_launch(
    # the serverless execution.
    if idle_minutes_to_autostop is None:
        idle_minutes_to_autostop = 5
-    commandline_args["idle-minutes-to-autostop"] = idle_minutes_to_autostop
+    commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop
    if len(env) > 0:
-        commandline_args["env"] = [f"{k}={v}" for k, v in env]
+        commandline_args['env'] = [f'{k}={v}' for k, v in env]
 
    # Launch the benchmarking clusters in detach mode in parallel.
    benchmark_created = benchmark_utils.launch_benchmark_clusters(
-        benchmark, clusters, candidate_configs, commandline_args
-    )
+        benchmark, clusters, candidate_configs, commandline_args)
 
    # If at least one cluster is created, print the following messages.
    if benchmark_created:
        logger.info(
-            f"\n{colorama.Fore.CYAN}Benchmark name: "
-            f"{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}"
-            "\nTo see the benchmark results: "
-            f"{backend_utils.BOLD}sky bench show "
-            f"{benchmark}{backend_utils.RESET_BOLD}"
-            "\nTo teardown the clusters: "
-            f"{backend_utils.BOLD}sky bench down "
-            f"{benchmark}{backend_utils.RESET_BOLD}"
-        )
-        subprocess_utils.run("sky bench ls")
+            f'\n{colorama.Fore.CYAN}Benchmark name: '
+            f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
+            '\nTo see the benchmark results: '
+            f'{backend_utils.BOLD}sky bench show '
+            f'{benchmark}{backend_utils.RESET_BOLD}'
+            '\nTo teardown the clusters: '
+            f'{backend_utils.BOLD}sky bench down '
+            f'{benchmark}{backend_utils.RESET_BOLD}')
+        subprocess_utils.run('sky bench ls')
    else:
-        logger.error("No benchmarking clusters are created.")
-        subprocess_utils.run("sky status")
+        logger.error('No benchmarking clusters are created.')
+        subprocess_utils.run('sky status')
 
 
-@bench.command("ls", cls=_DocumentedCodeCommand)
+@bench.command('ls', cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def benchmark_ls() -> None:
    """List the benchmark history."""
    benchmarks = benchmark_state.get_benchmarks()
    columns = [
-        "BENCHMARK",
-        "TASK",
-        "LAUNCHED",
+        'BENCHMARK',
+        'TASK',
+        'LAUNCHED',
    ]
 
    max_num_candidates = 1
    for benchmark in benchmarks:
-        benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"])
+        benchmark_results = benchmark_state.get_benchmark_results(
+            benchmark['name'])
        num_candidates = len(benchmark_results)
        if num_candidates > max_num_candidates:
            max_num_candidates = num_candidates
 
    if max_num_candidates == 1:
-        columns += ["CANDIDATE"]
+        columns += ['CANDIDATE']
    else:
-        columns += [f"CANDIDATE {i}" for i in range(1, max_num_candidates + 1)]
+        columns += [f'CANDIDATE {i}' for i in range(1, max_num_candidates + 1)]
    benchmark_table = log_utils.create_table(columns)
 
    for benchmark in benchmarks:
-        if benchmark["task"] is not None:
-            task = benchmark["task"]
+        if benchmark['task'] is not None:
+            task = benchmark['task']
        else:
-            task = "-"
+            task = '-'
        row = [
            # BENCHMARK
-            benchmark["name"],
+            benchmark['name'],
            # TASK
            task,
            # LAUNCHED
-            datetime.datetime.fromtimestamp(benchmark["launched_at"]),
+            datetime.datetime.fromtimestamp(benchmark['launched_at']),
        ]
 
-        benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"])
+        benchmark_results = benchmark_state.get_benchmark_results(
+            benchmark['name'])
        # RESOURCES
        for b in benchmark_results:
-            num_nodes = b["num_nodes"]
-            resources = b["resources"]
-            postfix_spot = "[Spot]" if resources.use_spot else ""
+            num_nodes = b['num_nodes']
+            resources = b['resources']
+            postfix_spot = '[Spot]' if resources.use_spot else ''
            instance_type = resources.instance_type + postfix_spot
            if resources.accelerators is None:
-                accelerators = ""
+                accelerators = ''
            else:
                accelerator, count = list(resources.accelerators.items())[0]
-                accelerators = f" ({accelerator}:{count})"
+                accelerators = f' ({accelerator}:{count})'
            # For brevity, skip the cloud names.
-            resources_str = f"{num_nodes}x {instance_type}{accelerators}"
+            resources_str = f'{num_nodes}x {instance_type}{accelerators}'
            row.append(resources_str)
-        row += [""] * (max_num_candidates - len(benchmark_results))
+        row += [''] * (max_num_candidates - len(benchmark_results))
        benchmark_table.add_row(row)
    if benchmarks:
        click.echo(benchmark_table)
    else:
-        click.echo("No benchmark history found.")
+        click.echo('No benchmark history found.')
 
 
-@bench.command("show", cls=_DocumentedCodeCommand)
-@click.argument("benchmark", required=True, type=str)
+@bench.command('show', cls=_DocumentedCodeCommand)
+@click.argument('benchmark', required=True, type=str)
 # TODO(woosuk): Add --all option to show all the collected information
 # (e.g., setup time, warmup steps, total steps, etc.).
 @usage_lib.entrypoint
@@ -5276,81 +4817,79 @@ def benchmark_show(benchmark: str) -> None:
    """Show a benchmark report."""
    record = benchmark_state.get_benchmark_from_name(benchmark)
    if record is None:
-        raise click.BadParameter(f"Benchmark {benchmark} does not exist.")
+        raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
 
    benchmark_utils.update_benchmark_state(benchmark)
 
    click.echo(
-        textwrap.dedent(
-            """\
+        textwrap.dedent("""\
        Legend:
        - #STEPS: Number of steps taken.
        - SEC/STEP, $/STEP: Average time (cost) per step.
        - EST(hr), EST($): Estimated total time (cost) to complete the benchmark.
-        """
-        )
-    )
+        """))
 
    columns = [
-        "CLUSTER",
-        "RESOURCES",
-        "STATUS",
-        "DURATION",
-        "SPENT($)",
-        "#STEPS",
-        "SEC/STEP",
-        "$/STEP",
-        "EST(hr)",
-        "EST($)",
+        'CLUSTER',
+        'RESOURCES',
+        'STATUS',
+        'DURATION',
+        'SPENT($)',
+        '#STEPS',
+        'SEC/STEP',
+        '$/STEP',
+        'EST(hr)',
+        'EST($)',
    ]
    cluster_table = log_utils.create_table(columns)
 
    rows = []
    benchmark_results = benchmark_state.get_benchmark_results(benchmark)
    for result in benchmark_results:
-        num_nodes = result["num_nodes"]
-        resources = result["resources"]
+        num_nodes = result['num_nodes']
+        resources = result['resources']
        row = [
            # CLUSTER
-            result["cluster"],
+            result['cluster'],
            # RESOURCES
-            f"{num_nodes}x {resources}",
+            f'{num_nodes}x {resources}',
            # STATUS
-            result["status"].value,
+            result['status'].value,
        ]
 
-        record = result["record"]
-        if record is None or record.start_time is None or record.last_time is None:
-            row += ["-"] * (len(columns) - len(row))
+        record = result['record']
+        if (record is None or record.start_time is None or
+                record.last_time is None):
+            row += ['-'] * (len(columns) - len(row))
            rows.append(row)
            continue
 
-        duration_str = log_utils.readable_time_duration(
-            record.start_time, record.last_time, absolute=True
-        )
+        duration_str = log_utils.readable_time_duration(record.start_time,
+                                                        record.last_time,
+                                                        absolute=True)
        duration = record.last_time - record.start_time
        spent = num_nodes * resources.get_cost(duration)
-        spent_str = f"{spent:.4f}"
+        spent_str = f'{spent:.4f}'
 
        num_steps = record.num_steps_so_far
        if num_steps is None:
-            num_steps = "-"
+            num_steps = '-'
 
        seconds_per_step = record.seconds_per_step
        if seconds_per_step is None:
-            seconds_per_step_str = "-"
-            cost_per_step_str = "-"
+            seconds_per_step_str = '-'
+            cost_per_step_str = '-'
        else:
-            seconds_per_step_str = f"{seconds_per_step:.4f}"
+            seconds_per_step_str = f'{seconds_per_step:.4f}'
            cost_per_step = num_nodes * resources.get_cost(seconds_per_step)
-            cost_per_step_str = f"{cost_per_step:.6f}"
+            cost_per_step_str = f'{cost_per_step:.6f}'
 
        total_time = record.estimated_total_seconds
        if total_time is None:
-            total_time_str = "-"
-            total_cost_str = "-"
+            total_time_str = '-'
+            total_cost_str = '-'
        else:
-            total_time_str = f"{total_time / 3600:.2f}"
+            total_time_str = f'{total_time / 3600:.2f}'
            total_cost = num_nodes * resources.get_cost(total_time)
-            total_cost_str = f"{total_cost:.2f}"
+            total_cost_str = f'{total_cost:.2f}'
 
        row += [
            # DURATION
@@ -5374,51 +4913,45 @@ def benchmark_show(benchmark: str) -> None:
    click.echo(cluster_table)
 
    finished = [
-        row for row in rows if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value
+        row for row in rows
+        if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value
    ]
-    if any(row[5] == "-" for row in finished):
+    if any(row[5] == '-' for row in finished):
        # No #STEPS. SkyCallback was unused.
        click.secho(
-            "SkyCallback logs are not found in this benchmark. "
-            "Consider using SkyCallback to get more detailed information "
-            "in real time.",
-            fg="yellow",
-        )
-    elif any(row[6] != "-" and row[-1] == "-" for row in rows):
+            'SkyCallback logs are not found in this benchmark. '
+            'Consider using SkyCallback to get more detailed information '
+            'in real time.',
+            fg='yellow')
+    elif any(row[6] != '-' and row[-1] == '-' for row in rows):
        # No EST($). total_steps is not specified and cannot be inferred.
        click.secho(
-            "Cannot estimate total time and cost because "
-            "the total number of steps cannot be inferred by SkyCallback. "
-            "To get the estimation, specify the total number of steps in "
-            "either `sky_callback.init` or `Sky*Callback`.",
-            fg="yellow",
-        )
+            'Cannot estimate total time and cost because '
+            'the total number of steps cannot be inferred by SkyCallback. '
+            'To get the estimation, specify the total number of steps in '
+            'either `sky_callback.init` or `Sky*Callback`.',
+            fg='yellow')
 
 
-@bench.command("down", cls=_DocumentedCodeCommand)
-@click.argument("benchmark", required=True, type=str)
+@bench.command('down', cls=_DocumentedCodeCommand)
+@click.argument('benchmark', required=True, type=str)
 @click.option(
-    "--exclude",
-    "-e",
-    "clusters_to_exclude",
+    '--exclude',
+    '-e',
+    'clusters_to_exclude',
    required=False,
    type=str,
    multiple=True,
-    help=(
-        "Cluster name(s) to exclude from termination. "
-        "Typically, you might want to see the benchmark results in "
-        '`sky bench show` and exclude a "winner" cluster from termination '
-        "to finish the running task."
-    ),
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+    help=('Cluster name(s) to exclude from termination. '
+          'Typically, you might want to see the benchmark results in '
+          '`sky bench show` and exclude a "winner" cluster from termination '
+          'to finish the running task.'))
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 @usage_lib.entrypoint
 def benchmark_down(
    benchmark: str,
@@ -5428,7 +4961,7 @@ def benchmark_down(
    """Tear down all clusters belonging to a benchmark."""
    record = benchmark_state.get_benchmark_from_name(benchmark)
    if record is None:
-        raise click.BadParameter(f"Benchmark {benchmark} does not exist.")
+        raise click.BadParameter(f'Benchmark {benchmark} does not exist.')
 
    clusters = benchmark_state.get_benchmark_clusters(benchmark)
    to_stop: List[str] = []
@@ -5439,71 +4972,66 @@ def benchmark_down(
            continue
        to_stop.append(cluster)
 
-    _down_or_stop_clusters(to_stop, apply_to_all=False, down=True, no_confirm=yes)
-
-
-@bench.command("delete", cls=_DocumentedCodeCommand)
-@click.argument("benchmarks", required=False, type=str, nargs=-1)
-@click.option(
-    "--all",
-    "-a",
-    default=None,
-    is_flag=True,
-    help="Delete all benchmark reports from the history.",
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+    _down_or_stop_clusters(to_stop,
+                           apply_to_all=False,
+                           down=True,
+                           no_confirm=yes)
+
+
+@bench.command('delete', cls=_DocumentedCodeCommand)
+@click.argument('benchmarks', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              default=None,
+              is_flag=True,
+              help='Delete all benchmark reports from the history.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], yes: bool) -> None:
+def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
+                     yes: bool) -> None:
    """Delete benchmark reports from the history."""
    if not benchmarks and all is None:
        raise click.BadParameter(
-            "Either specify benchmarks or use --all to delete all benchmarks."
-        )
+            'Either specify benchmarks or use --all to delete all benchmarks.')
    to_delete = []
    if len(benchmarks) > 0:
        for benchmark in benchmarks:
            record = benchmark_state.get_benchmark_from_name(benchmark)
            if record is None:
-                print(f"Benchmark {benchmark} not found.")
+                print(f'Benchmark {benchmark} not found.')
            else:
                to_delete.append(record)
    if all:
        to_delete = benchmark_state.get_benchmarks()
        if len(benchmarks) > 0:
-            print(
-                "Both --all and benchmark(s) specified "
-                "for sky bench delete. Letting --all take effect."
-            )
+            print('Both --all and benchmark(s) specified '
+                  'for sky bench delete. Letting --all take effect.')
 
-    to_delete = [r["name"] for r in to_delete]
+    to_delete = [r['name'] for r in to_delete]
    if not to_delete:
        return
 
-    benchmark_list = ", ".join(to_delete)
-    plural = "s" if len(to_delete) > 1 else ""
+    benchmark_list = ', '.join(to_delete)
+    plural = 's' if len(to_delete) > 1 else ''
    if not yes:
        click.confirm(
-            f"Deleting the benchmark{plural}: {benchmark_list}. Proceed?",
+            f'Deleting the benchmark{plural}: {benchmark_list}. Proceed?',
            default=True,
            abort=True,
-            show_default=True,
-        )
+            show_default=True)
 
-    progress = rich_progress.Progress(
-        transient=True, redirect_stdout=False, redirect_stderr=False
-    )
+    progress = rich_progress.Progress(transient=True,
+                                      redirect_stdout=False,
+                                      redirect_stderr=False)
    task = progress.add_task(
-        f"[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ",
-        total=len(to_delete),
-    )
+        f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
+        total=len(to_delete))
 
    def _delete_benchmark(benchmark: str) -> None:
        clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5514,27 +5042,25 @@ def _delete_benchmark(benchmark: str) -> None:
 
        num_clusters = len([r for r in records if r is not None])
        if num_clusters > 0:
-            plural = "s" if num_clusters > 1 else ""
-            message = (
-                f"{colorama.Fore.YELLOW}Benchmark {benchmark} "
-                f"has {num_clusters} un-terminated cluster{plural}. "
-                f"Terminate the cluster{plural} with "
-                f"{backend_utils.BOLD} sky bench down {benchmark} "
-                f"{backend_utils.RESET_BOLD} "
-                "before deleting the benchmark report."
-            )
+            plural = 's' if num_clusters > 1 else ''
+            message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
+                       f'has {num_clusters} un-terminated cluster{plural}. '
+                       f'Terminate the cluster{plural} with '
+                       f'{backend_utils.BOLD} sky bench down {benchmark} '
+                       f'{backend_utils.RESET_BOLD} '
+                       'before deleting the benchmark report.')
            success = False
        else:
-            bucket_name = benchmark_state.get_benchmark_from_name(benchmark)["bucket"]
+            bucket_name = benchmark_state.get_benchmark_from_name(
+                benchmark)['bucket']
            handle = global_user_state.get_handle_from_storage_name(bucket_name)
            assert handle is not None, bucket_name
            bucket_type = list(handle.sky_stores.keys())[0]
-            benchmark_utils.remove_benchmark_logs(benchmark, bucket_name, bucket_type)
+            benchmark_utils.remove_benchmark_logs(benchmark, bucket_name,
+                                                  bucket_type)
            benchmark_state.delete_benchmark(benchmark)
-            message = (
-                f"{colorama.Fore.GREEN}Benchmark report for "
-                f"{benchmark} deleted.{colorama.Style.RESET_ALL}"
-            )
+            message = (f'{colorama.Fore.GREEN}Benchmark report for '
+                       f'{benchmark} deleted.{colorama.Style.RESET_ALL}')
            success = True
 
        progress.stop()
@@ -5555,13 +5081,12 @@ def local():
    pass
 
 
-@click.option(
-    "--gpus/--no-gpus",
-    default=True,
-    is_flag=True,
-    help="Launch cluster without GPU support even " "if GPUs are detected on the host.",
-)
-@local.command("up", cls=_DocumentedCodeCommand)
+@click.option('--gpus/--no-gpus',
+              default=True,
+              is_flag=True,
+              help='Launch cluster without GPU support even '
+              'if GPUs are detected on the host.')
+@local.command('up', cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def local_up(gpus: bool):
    """Creates a local cluster."""
@@ -5572,39 +5097,36 @@ def local_up(gpus: bool):
    gpus = gpus and local_gpus_available
 
    # Check if ~/.kube/config exists:
-    if os.path.exists(os.path.expanduser("~/.kube/config")):
+    if os.path.exists(os.path.expanduser('~/.kube/config')):
        curr_context = kubernetes_utils.get_current_kube_config_context_name()
-        skypilot_context = "kind-skypilot"
+        skypilot_context = 'kind-skypilot'
        if curr_context is not None and curr_context != skypilot_context:
            click.echo(
-                f"Current context in kube config: {curr_context}"
-                "\nWill automatically switch to kind-skypilot after the local "
-                "cluster is created."
-            )
-    message_str = "Creating local cluster{}..."
-    message_str = message_str.format(
-        (" with GPU support (this may take up " "to 15 minutes)") if gpus else ""
-    )
+                f'Current context in kube config: {curr_context}'
+                '\nWill automatically switch to kind-skypilot after the local '
+                'cluster is created.')
+    message_str = 'Creating local cluster{}...'
+    message_str = message_str.format((' with GPU support (this may take up '
+                                      'to 15 minutes)') if gpus else '')
    path_to_package = os.path.dirname(os.path.dirname(__file__))
-    up_script_path = os.path.join(
-        path_to_package, "sky/utils/kubernetes", "create_cluster.sh"
-    )
+    up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
+                                  'create_cluster.sh')
 
    # Get directory of script and run it from there
    cwd = os.path.dirname(os.path.abspath(up_script_path))
-    run_command = up_script_path + " --gpus" if gpus else up_script_path
+    run_command = up_script_path + ' --gpus' if gpus else up_script_path
    run_command = shlex.split(run_command)
 
    # Setup logging paths
    run_timestamp = backend_utils.get_run_timestamp()
-    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_up.log")
-    tail_cmd = "tail -n100 -f " + log_path
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_up.log')
+    tail_cmd = 'tail -n100 -f ' + log_path
 
    click.echo(message_str)
    style = colorama.Style
-    click.echo(
-        "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}"
-    )
+    click.echo('To view detailed progress: '
+               f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
 
    returncode, _, stderr = log_lib.run_with_log(
        cmd=run_command,
@@ -5612,155 +5134,138 @@ def local_up(gpus: bool):
        require_outputs=True,
        stream_logs=False,
        line_processor=log_utils.SkyLocalUpLineProcessor(),
-        cwd=cwd,
-    )
+        cwd=cwd)
 
    # Kind always writes to stderr even if it succeeds.
    # If the failure happens after the cluster is created, we need
    # to strip all stderr of "No kind clusters found.", which is
    # printed when querying with kind get clusters.
-    stderr = stderr.replace("No kind clusters found.\n", "")
+    stderr = stderr.replace('No kind clusters found.\n', '')
 
    if returncode == 0:
        cluster_created = True
    elif returncode == 100:
-        click.echo(
-            f"{colorama.Fore.GREEN}Local cluster already "
-            f"exists.{style.RESET_ALL}\n"
-            "If you want to delete it instead, run: sky local down"
-        )
+        click.echo(f'{colorama.Fore.GREEN}Local cluster already '
+                   f'exists.{style.RESET_ALL}\n'
+                   'If you want to delete it instead, run: sky local down')
    else:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
-                "Failed to create local cluster. "
-                f"Full log: {log_path}"
-                f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}"
-            )
+                'Failed to create local cluster. '
+                f'Full log: {log_path}'
+                f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
    # Run sky check
-    with rich_utils.safe_status("[bold cyan]Running sky check..."):
-        sky_check.check(clouds=["kubernetes"], quiet=True)
+    with rich_utils.safe_status('[bold cyan]Running sky check...'):
+        sky_check.check(clouds=['kubernetes'], quiet=True)
 
    if cluster_created:
        # Prepare completion message which shows CPU and GPU count
        # Get number of CPUs
        p = subprocess_utils.run(
-            "kubectl get nodes -o jsonpath='{.items[0].status.capacity.cpu}'",
-            capture_output=True,
-        )
-        num_cpus = int(p.stdout.decode("utf-8"))
+            'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
+            capture_output=True)
+        num_cpus = int(p.stdout.decode('utf-8'))
 
        # GPU count/type parsing
-        gpu_message = ""
-        gpu_hint = ""
+        gpu_message = ''
+        gpu_hint = ''
        if gpus:
            # Get GPU model by querying the node labels
-            label_name_escaped = "skypilot.co/accelerator".replace(".", "\\.")
-            gpu_type_cmd = f"kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels['{label_name_escaped}']}}\""  # pylint: disable=line-too-long
+            label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
+            gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
-                gpu_count_output = subprocess.check_output(
-                    gpu_type_cmd, shell=True, text=True
-                )
-                gpu_type_str = gpu_count_output.strip() + " "
+                gpu_count_output = subprocess.check_output(gpu_type_cmd,
+                                                           shell=True,
+                                                           text=True)
+                gpu_type_str = gpu_count_output.strip() + ' '
            except subprocess.CalledProcessError as e:
-                output = str(e.output.decode("utf-8"))
-                logger.warning(f"Failed to get GPU type: {output}")
-                gpu_type_str = ""
+                output = str(e.output.decode('utf-8'))
+                logger.warning(f'Failed to get GPU type: {output}')
+                gpu_type_str = ''
 
            # Get number of GPUs (sum of nvidia.com/gpu resources)
-            gpu_count_command = "kubectl get nodes -o=jsonpath='{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}' | awk '{sum += $1} END {print sum}'"  # pylint: disable=line-too-long
+            gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\''  # pylint: disable=line-too-long
            try:
                # Run the command and capture the output
-                gpu_count_output = subprocess.check_output(
-                    gpu_count_command, shell=True, text=True
-                )
-                gpu_count = gpu_count_output.strip()  # Remove any extra whitespace
-                gpu_message = f" and {gpu_count} {gpu_type_str}GPUs"
+                gpu_count_output = subprocess.check_output(gpu_count_command,
+                                                           shell=True,
+                                                           text=True)
+                gpu_count = gpu_count_output.strip(
+                )  # Remove any extra whitespace
+                gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
            except subprocess.CalledProcessError as e:
-                output = str(e.output.decode("utf-8"))
-                logger.warning(f"Failed to get GPU count: {output}")
-                gpu_message = f" with {gpu_type_str}GPU support"
+                output = str(e.output.decode('utf-8'))
+                logger.warning(f'Failed to get GPU count: {output}')
+                gpu_message = f' with {gpu_type_str}GPU support'
 
        gpu_hint = (
-            (
-                "\nHint: To see the list of GPUs in the cluster, "
-                "run 'sky show-gpus --cloud kubernetes'"
-            )
-            if gpus
-            else ""
-        )
+            '\nHint: To see the list of GPUs in the cluster, '
+            'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
 
        if num_cpus < 2:
-            click.echo(
-                "Warning: Local cluster has less than 2 CPUs. "
-                "This may cause issues with running tasks."
-            )
+            click.echo('Warning: Local cluster has less than 2 CPUs. '
+                       'This may cause issues with running tasks.')
        click.echo(
-            f"\n{colorama.Fore.GREEN}Local Kubernetes cluster created "
-            "successfully with "
-            f"{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can "
-            "now run tasks locally."
-            "\nHint: To change the number of CPUs, change your docker "
-            "runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info."  # pylint: disable=line-too-long
-            f"{gpu_hint}"
-        )
+            f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
+            'successfully with '
+            f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
+            'now run tasks locally.'
+            '\nHint: To change the number of CPUs, change your docker '
+            'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
+            f'{gpu_hint}')
 
 
-@local.command("down", cls=_DocumentedCodeCommand)
+@local.command('down', cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def local_down():
    """Deletes a local cluster."""
    cluster_removed = False
 
    path_to_package = os.path.dirname(os.path.dirname(__file__))
-    down_script_path = os.path.join(
-        path_to_package, "sky/utils/kubernetes", "delete_cluster.sh"
-    )
+    down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
+                                    'delete_cluster.sh')
 
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    run_command = shlex.split(down_script_path)
 
    # Setup logging paths
    run_timestamp = backend_utils.get_run_timestamp()
-    log_path = os.path.join(
-        constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_down.log"
-    )
-    tail_cmd = "tail -n100 -f " + log_path
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+    tail_cmd = 'tail -n100 -f ' + log_path
 
-    with rich_utils.safe_status("[bold cyan]Removing local cluster..."):
+    with rich_utils.safe_status('[bold cyan]Removing local cluster...'):
        style = colorama.Style
-        click.echo(
-            "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}"
-        )
-        returncode, stdout, stderr = log_lib.run_with_log(
-            cmd=run_command,
-            log_path=log_path,
-            require_outputs=True,
-            stream_logs=False,
-            cwd=cwd,
-        )
-        stderr = stderr.replace("No kind clusters found.\n", "")
+        click.echo('To view detailed progress: '
+                   f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')
 
        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
-            click.echo("\nLocal cluster does not exist.")
+            click.echo('\nLocal cluster does not exist.')
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(
-                    "Failed to create local cluster. "
-                    f"Stdout: {stdout}"
-                    f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}"
-                )
+                    'Failed to create local cluster. '
+                    f'Stdout: {stdout}'
+                    f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
    if cluster_removed:
        # Run sky check
-        with rich_utils.safe_status("[bold cyan]Running sky check..."):
-            sky_check.check(clouds=["kubernetes"], quiet=True)
-        click.echo(f"{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}")
+        with rich_utils.safe_status('[bold cyan]Running sky check...'):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        click.echo(
+            f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
 
 
 def main():
    return cli()
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
    main()