diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index a3651bdba9a..95c2296b0b9 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -916,6 +916,10 @@ def write_cluster_config( tmp_yaml_path, cluster_config_overrides=to_provision.cluster_config_overrides) kubernetes_utils.combine_metadata_fields(tmp_yaml_path) + valid, message = kubernetes_utils.check_pod_config(tmp_yaml_path) + if not valid: + raise exceptions.InvalidCloudConfigs( + f'There are invalid config in pod_config, deatil: {message}') if dryrun: # If dryrun, return the unfinished tmp yaml path. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 2c062ad987f..11101c2ac95 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -866,15 +866,6 @@ def check_credentials(context: Optional[str], _, exec_msg = is_kubeconfig_exec_auth(context) - # Check whether pod_config is valid - pod_config = skypilot_config.get_nested(('kubernetes', 'pod_config'), - default_value={}, - override_configs={}) - if pod_config: - _, pod_msg = _check_pod_config(context, pod_config) - if pod_msg: - return False, pod_msg - # We now check if GPUs are available and labels are set correctly on the # cluster, and if not we return hints that may help debug any issues. # This early check avoids later surprises for user when they try to run @@ -900,9 +891,8 @@ def check_credentials(context: Optional[str], else: return True, None -def _check_pod_config( - context: Optional[str] = None, pod_config: Optional[Any] = None) \ - -> Tuple[bool, Optional[str]]: + +def check_pod_config(cluster_yaml_path: str) -> Tuple[bool, Optional[str]]: """Check if the pod_config is a valid pod config Using create_namespaced_pod api with dry_run to check the pod_config @@ -912,13 +902,19 @@ def _check_pod_config( bool: True if pod_config is valid. str: Error message about why the pod_config is invalid, None otherwise. """ + with open(cluster_yaml_path, 'r', encoding='utf-8') as f: + yaml_content = f.read() + yaml_obj = yaml.safe_load(yaml_content) + pod_config = \ + yaml_obj['available_node_types']['ray_head_default']['node_config'] try: - namespace = get_kube_config_context_namespace(context) - kubernetes.core_api(context).create_namespaced_pod( + # This ok to use None context here as we only test the pod is valid + # won't do any change in the cluster + namespace = get_kube_config_context_namespace(None) + kubernetes.core_api().create_namespaced_pod( namespace, body=pod_config, dry_run='All', - field_validation='Strict', _request_timeout=kubernetes.API_TIMEOUT) except kubernetes.api_exception() as e: error_msg = '' @@ -928,7 +924,7 @@ def _check_pod_config( error_msg = exception_body.get('message') else: error_msg = str(e) - return False, f'Invalid pod_config: {error_msg}' + return False, error_msg except Exception as e: # pylint: disable=broad-except return False, ('An error occurred: ' f'{common_utils.format_exception(e, use_bracket=True)}')