Skip to content

Commit

Permalink
update: check pod_config when launch
Browse files Browse the repository at this point in the history
check merged pod_config during launch using k8s api
  • Loading branch information
chesterli29 committed Dec 13, 2024
1 parent 12f1208 commit 64bb66a
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 16 deletions.
4 changes: 4 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,10 @@ def write_cluster_config(
tmp_yaml_path,
cluster_config_overrides=to_provision.cluster_config_overrides)
kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
valid, message = kubernetes_utils.check_pod_config(tmp_yaml_path)
if not valid:
raise exceptions.InvalidCloudConfigs(
f'There are invalid config in pod_config, deatil: {message}')

if dryrun:
# If dryrun, return the unfinished tmp yaml path.
Expand Down
28 changes: 12 additions & 16 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,15 +866,6 @@ def check_credentials(context: Optional[str],

_, exec_msg = is_kubeconfig_exec_auth(context)

# Check whether pod_config is valid
pod_config = skypilot_config.get_nested(('kubernetes', 'pod_config'),
default_value={},
override_configs={})
if pod_config:
_, pod_msg = _check_pod_config(context, pod_config)
if pod_msg:
return False, pod_msg

# We now check if GPUs are available and labels are set correctly on the
# cluster, and if not we return hints that may help debug any issues.
# This early check avoids later surprises for user when they try to run
Expand All @@ -900,9 +891,8 @@ def check_credentials(context: Optional[str],
else:
return True, None

def _check_pod_config(
context: Optional[str] = None, pod_config: Optional[Any] = None) \
-> Tuple[bool, Optional[str]]:

def check_pod_config(cluster_yaml_path: str) -> Tuple[bool, Optional[str]]:
"""Check if the pod_config is a valid pod config
Uses the create_namespaced_pod API with dry_run to validate the pod_config.
Expand All @@ -912,13 +902,19 @@ def _check_pod_config(
bool: True if pod_config is valid.
str: Error message about why the pod_config is invalid, None otherwise.
"""
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
yaml_content = f.read()
yaml_obj = yaml.safe_load(yaml_content)
pod_config = \
yaml_obj['available_node_types']['ray_head_default']['node_config']
try:
namespace = get_kube_config_context_namespace(context)
kubernetes.core_api(context).create_namespaced_pod(
# It is OK to use the None context here: we only test whether the pod is
# valid, and the dry run won't make any changes to the cluster.
namespace = get_kube_config_context_namespace(None)
kubernetes.core_api().create_namespaced_pod(
namespace,
body=pod_config,
dry_run='All',
field_validation='Strict',
_request_timeout=kubernetes.API_TIMEOUT)
except kubernetes.api_exception() as e:
error_msg = ''
Expand All @@ -928,7 +924,7 @@ def _check_pod_config(
error_msg = exception_body.get('message')
else:
error_msg = str(e)
return False, f'Invalid pod_config: {error_msg}'
return False, error_msg
except Exception as e: # pylint: disable=broad-except
return False, ('An error occurred: '
f'{common_utils.format_exception(e, use_bracket=True)}')
Expand Down

0 comments on commit 64bb66a

Please sign in to comment.