From 6282697ea65c405d02c3aa0e8373d0f764dc5769 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Wed, 13 Nov 2024 16:40:21 -0800 Subject: [PATCH] address PR comments and update docstrings --- sky/backends/backend.py | 22 ++++++++++++++++++++ sky/backends/cloud_vm_ray_backend.py | 31 +++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/sky/backends/backend.py b/sky/backends/backend.py index 6148063fcb5..af26c89221c 100644 --- a/sky/backends/backend.py +++ b/sky/backends/backend.py @@ -54,6 +54,28 @@ def provision( retry_until_up: bool = False, skip_if_no_cluster_updates: bool = False, ) -> Optional[_ResourceHandleType]: + """Provisions resources for the given task. + + Args: + task: The task to provision resources for. + to_provision: Resource config to provision. Should only be None if + cluster_name refers to an existing cluster, whose resources will + be used. + dryrun: If True, don't actually provision anything. + stream_logs: If True, stream provisioning logs to console. + cluster_name: Name of the cluster to provision. If None, a name will + be auto-generated. If the name refers to an existing cluster, + the existing cluster will be reused and re-provisioned. + retry_until_up: If True, retry provisioning until resources are + successfully launched. + skip_if_no_cluster_updates: If True, calculate the cluster config + and compare to the existing cluster_name's config. Skip + provisioning if no updates are needed for the existing cluster. + + Returns: + A ResourceHandle object for the provisioned resources, or None if + dryrun is True. + """ if cluster_name is None: cluster_name = sky.backends.backend_utils.generate_cluster_name() usage_lib.record_cluster_name_for_current_operation(cluster_name) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 729eb03940b..22050efd3c3 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1318,7 +1318,19 @@ def _retry_zones( prev_cluster_ever_up: bool, skip_if_config_hash_matches: Optional[str], ) -> Dict[str, Any]: - """The provision retry loop.""" + """The provision retry loop. + + Returns a config_dict with the following fields: + - All fields from backend_utils.write_cluster_config(). See its + docstring. + - 'provisioning_skipped': True if provisioning was short-circuited + by skip_if_config_hash_matches, False otherwise. + - 'handle': The provisioned cluster handle. + - 'provision_record': (Only if using the new skypilot provisioner) The + record returned by provisioner.bulk_provision(). + - 'resources_vars': (Only if using the new skypilot provisioner) The + resources variables given by make_deploy_resources_variables(). + """ # Get log_path name log_path = os.path.join(self.log_dir, 'provision.log') log_abs_path = os.path.abspath(log_path) @@ -1429,8 +1441,8 @@ def _retry_zones( f'invalid cloud config: {common_utils.format_exception(e)}') if skip_if_config_hash_matches == config_dict['config_hash']: - logger.info('Skipping provisioning of cluster with matching ' - 'config hash.') + logger.debug('Skipping provisioning of cluster with matching ' + 'config hash.') config_dict['provisioning_skipped'] = True return config_dict config_dict['provisioning_skipped'] = False @@ -1951,7 +1963,11 @@ def provision_with_retries( stream_logs: bool, skip_if_config_hash_matches: Optional[str], ) -> Dict[str, Any]: - """Provision with retries for all launchable resources.""" + """Provision with retries for all launchable resources. + + Returns the config_dict from _retry_zones() - see its docstring for + details. + """ cluster_name = to_provision_config.cluster_name to_provision = to_provision_config.resources num_nodes = to_provision_config.num_nodes @@ -2710,7 +2726,12 @@ def _provision( retry_until_up: bool = False, skip_if_no_cluster_updates: bool = False, ) -> Optional[CloudVmRayResourceHandle]: - """Provisions using 'ray up'. + """Provisions the cluster, or re-provisions an existing cluster. + + Use the SKYPILOT provisioner if it's supported by the cloud, otherwise + use 'ray up'. + + See also docstring for Backend.provision(). Raises: exceptions.ClusterOwnerIdentityMismatchError: if the cluster