From 3e6f9f3e7dc7f8fd613a49175d95eca9f131f431 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 6 Nov 2024 22:16:19 -0800 Subject: [PATCH 01/14] [Core] Avoid PENDING job to be set to FAILED and speed up job scheduling (#4264) * fix race condition for setting job status to FAILED during INIT * Fix * fix * format * Add smoke tests * revert pending submit * remove update entirely for the job schedule step * wait for job 32 to finish * fix smoke * move and rename * Add comment * minor --- examples/multi_echo.py | 6 +-- sky/skylet/job_lib.py | 86 +++++++++++++++++++----------- tests/test_smoke.py | 13 ++++- tests/test_yamls/pipeline_aws.yaml | 2 +- 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/examples/multi_echo.py b/examples/multi_echo.py index a9fe49fad2d..7f310a4bb6b 100644 --- a/examples/multi_echo.py +++ b/examples/multi_echo.py @@ -31,13 +31,13 @@ def run(cluster: Optional[str] = None, cloud: Optional[str] = None): # Submit multiple tasks in parallel to trigger queueing behaviors. def _exec(i): - task = sky.Task(run=f'echo {i}; sleep 5') - resources = sky.Resources(accelerators={'T4': 0.5}) + task = sky.Task(run=f'echo {i}; sleep 60') + resources = sky.Resources(accelerators={'T4': 0.05}) task.set_resources(resources) sky.exec(task, cluster_name=cluster, detach_run=True) with pool.ThreadPool(8) as p: - list(p.imap(_exec, range(32))) + list(p.imap(_exec, range(150))) if __name__ == '__main__': diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index 12d42d8c79c..fba015618f2 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -181,14 +181,19 @@ def _run_job(self, job_id: int, run_cmd: str): subprocess.Popen(run_cmd, shell=True, stdout=subprocess.DEVNULL) def schedule_step(self, force_update_jobs: bool = False) -> None: - jobs = self._get_jobs() - if len(jobs) > 0 or force_update_jobs: + if force_update_jobs: update_status() + pending_jobs = self._get_pending_jobs() # TODO(zhwu, mraheja): One optimization can be allowing more than one # job staying in the pending state after ray job submit, so that to be # faster to schedule a large amount of jobs. - for job_id, run_cmd, submit, created_time in jobs: + for job_id, run_cmd, submit, created_time in pending_jobs: with filelock.FileLock(_get_lock_path(job_id)): + # We don't have to refresh the job status before checking, as + # the job status will only be stale in rare cases where ray job + # crashes; or the job stays in INIT state for a long time. + # In those cases, the periodic JobSchedulerEvent event will + # update the job status every 300 seconds. 
status = get_status_no_lock(job_id) if (status not in _PRE_RESOURCE_STATUSES or created_time < psutil.boot_time()): @@ -202,7 +207,7 @@ def schedule_step(self, force_update_jobs: bool = False) -> None: self._run_job(job_id, run_cmd) return - def _get_jobs(self) -> List[Tuple[int, str, int, int]]: + def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]: """Returns the metadata for jobs in the pending jobs table The information contains job_id, run command, submit time, @@ -214,7 +219,7 @@ def _get_jobs(self) -> List[Tuple[int, str, int, int]]: class FIFOScheduler(JobScheduler): """First in first out job scheduler""" - def _get_jobs(self) -> List[Tuple[int, str, int, int]]: + def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]: return list( _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id')) @@ -534,25 +539,13 @@ def update_job_status(job_ids: List[int], This function should only be run on the remote instance with ray>=2.4.0. """ + echo = logger.info if not silent else logger.debug if len(job_ids) == 0: return [] - # TODO: if too slow, directly query against redis. ray_job_ids = [make_ray_job_id(job_id) for job_id in job_ids] - job_client = _create_ray_job_submission_client() - # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails, - # which contains the job status (str) and submission_id (str). - ray_job_query_time = time.time() - job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs() - - job_details = {} - ray_job_ids_set = set(ray_job_ids) - for job_detail in job_detail_lists: - if job_detail.submission_id in ray_job_ids_set: - job_details[job_detail.submission_id] = job_detail - statuses = [] for job_id, ray_job_id in zip(job_ids, ray_job_ids): # Per-job status lock is required because between the job status @@ -560,15 +553,48 @@ def update_job_status(job_ids: List[int], # can be modified by the generated ray program. with filelock.FileLock(_get_lock_path(job_id)): status = None - if ray_job_id in job_details: - ray_status = job_details[ray_job_id].status - status = _RAY_TO_JOB_STATUS_MAP[ray_status] + job_record = _get_jobs_by_ids([job_id])[0] + original_status = job_record['status'] + job_submitted_at = job_record['submitted_at'] + + ray_job_query_time = time.time() + if original_status == JobStatus.INIT: + if (job_submitted_at >= psutil.boot_time() and job_submitted_at + >= ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD): + # The job id is reserved, but the job is not submitted yet. + # We should keep it in INIT. + status = JobStatus.INIT + else: + # We always immediately submit job after the job id is + # allocated, i.e. INIT -> PENDING, if a job stays in INIT + # for too long, it is likely the job submission process + # was killed before the job is submitted. We should set it + # to FAILED then. Note, if ray job indicates the job is + # running, we will change status to PENDING below. + echo(f'INIT job {job_id} is stale, setting to FAILED') + status = JobStatus.FAILED + + try: + # Querying status within the lock is safer than querying + # outside, as it avoids the race condition when job table is + # updated after the ray job status query. + # Also, getting per-job status is faster than querying all jobs, + # when there are significant number of finished jobs. + # Reference: getting 124 finished jobs takes 0.038s, while + # querying a single job takes 0.006s, 10 jobs takes 0.066s. + # TODO: if too slow, directly query against redis. 
+ ray_job_status = job_client.get_job_status(ray_job_id) + status = _RAY_TO_JOB_STATUS_MAP[ray_job_status.value] + except RuntimeError: + # Job not found. + pass + pending_job = _get_pending_job(job_id) if pending_job is not None: if pending_job['created_time'] < psutil.boot_time(): - logger.info(f'Job {job_id} is stale, setting to FAILED: ' - f'created_time={pending_job["created_time"]}, ' - f'boot_time={psutil.boot_time()}') + echo(f'Job {job_id} is stale, setting to FAILED: ' + f'created_time={pending_job["created_time"]}, ' + f'boot_time={psutil.boot_time()}') # The job is stale as it is created before the instance # is booted, e.g. the instance is rebooted. status = JobStatus.FAILED @@ -583,22 +609,20 @@ def update_job_status(job_ids: List[int], # as stale. status = JobStatus.PENDING - original_status = get_status_no_lock(job_id) assert original_status is not None, (job_id, status) if status is None: status = original_status if (original_status is not None and not original_status.is_terminal()): - logger.info(f'Ray job status for job {job_id} is None, ' - 'setting it to FAILED.') + echo(f'Ray job status for job {job_id} is None, ' + 'setting it to FAILED.') # The job may be stale, when the instance is restarted # (the ray redis is volatile). We need to reset the # status of the task to FAILED if its original status # is RUNNING or PENDING. status = JobStatus.FAILED _set_status_no_lock(job_id, status) - if not silent: - logger.info(f'Updated job {job_id} status to {status}') + echo(f'Updated job {job_id} status to {status}') else: # Taking max of the status is necessary because: # 1. It avoids race condition, where the original status has @@ -611,10 +635,10 @@ def update_job_status(job_ids: List[int], # DB) would already have that value. So we take the max here to # keep it at later status. status = max(status, original_status) + assert status is not None, (job_id, status, original_status) if status != original_status: # Prevents redundant update. _set_status_no_lock(job_id, status) - if not silent: - logger.info(f'Updated job {job_id} status to {status}') + echo(f'Updated job {job_id} status to {status}') statuses.append(status) return statuses diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 3049d1e840c..2e52ef0235f 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1750,7 +1750,18 @@ def test_multi_echo(generic_cloud: str): 'multi_echo', [ f'python examples/multi_echo.py {name} {generic_cloud}', - 'sleep 120', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', + 'sleep 10', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', + 'sleep 30', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', + 'sleep 30', + # Make sure that our job scheduler is fast enough to have at least + # 10 RUNNING jobs in parallel. + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "RUNNING" | wc -l | awk \'{{if ($1 < 10) exit 1}}\'', + 'sleep 30', + f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep "FAILED" && exit 1 || true', + f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', ] + # Ensure jobs succeeded. 
[f'sky logs {name} {i + 1} --status' for i in range(32)] + diff --git a/tests/test_yamls/pipeline_aws.yaml b/tests/test_yamls/pipeline_aws.yaml index 9971f30bc66..962183bd992 100644 --- a/tests/test_yamls/pipeline_aws.yaml +++ b/tests/test_yamls/pipeline_aws.yaml @@ -5,7 +5,7 @@ name: a resources: cloud: aws - region: us-west-2 + region: us-east-2 cpus: 2+ run: | From 0ccab084fab5ae3a3b05e57bf6da23d747f041f1 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 9 Nov 2024 15:56:20 -0800 Subject: [PATCH 02/14] [Core] Avoid job scheduling race condition (#4310) * Avoid job schedule race condition * format * format * Avoid race for cancel --- sky/skylet/job_lib.py | 46 +++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index fba015618f2..6e5ad5667da 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -11,8 +11,7 @@ import sqlite3 import subprocess import time -import typing -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import colorama import filelock @@ -24,9 +23,6 @@ from sky.utils import db_utils from sky.utils import log_utils -if typing.TYPE_CHECKING: - from ray.dashboard.modules.job import pydantic_models as ray_pydantic - logger = sky_logging.init_logger(__name__) _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock' @@ -183,12 +179,20 @@ def _run_job(self, job_id: int, run_cmd: str): def schedule_step(self, force_update_jobs: bool = False) -> None: if force_update_jobs: update_status() - pending_jobs = self._get_pending_jobs() + pending_job_ids = self._get_pending_job_ids() # TODO(zhwu, mraheja): One optimization can be allowing more than one # job staying in the pending state after ray job submit, so that to be # faster to schedule a large amount of jobs. - for job_id, run_cmd, submit, created_time in pending_jobs: + for job_id in pending_job_ids: with filelock.FileLock(_get_lock_path(job_id)): + pending_job = _get_pending_job(job_id) + if pending_job is None: + # Pending job can be removed by another thread, due to the + # job being scheduled already. + continue + run_cmd = pending_job['run_cmd'] + submit = pending_job['submit'] + created_time = pending_job['created_time'] # We don't have to refresh the job status before checking, as # the job status will only be stale in rare cases where ray job # crashes; or the job stays in INIT state for a long time. @@ -207,8 +211,8 @@ def schedule_step(self, force_update_jobs: bool = False) -> None: self._run_job(job_id, run_cmd) return - def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]: - """Returns the metadata for jobs in the pending jobs table + def _get_pending_job_ids(self) -> List[int]: + """Returns the job ids in the pending jobs table The information contains job_id, run command, submit time, creation time. 
@@ -219,9 +223,10 @@ def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]: class FIFOScheduler(JobScheduler): """First in first out job scheduler""" - def _get_pending_jobs(self) -> List[Tuple[int, str, int, int]]: - return list( - _CURSOR.execute('SELECT * FROM pending_jobs ORDER BY job_id')) + def _get_pending_job_ids(self) -> List[int]: + rows = _CURSOR.execute( + 'SELECT job_id FROM pending_jobs ORDER BY job_id').fetchall() + return [row[0] for row in rows] scheduler = FIFOScheduler() @@ -518,11 +523,16 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]: def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]: - rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs ' - f'WHERE job_id={job_id!r}') + rows = _CURSOR.execute( + 'SELECT created_time, submit, run_cmd FROM pending_jobs ' + f'WHERE job_id={job_id!r}') for row in rows: - created_time, submit = row - return {'created_time': created_time, 'submit': submit} + created_time, submit, run_cmd = row + return { + 'created_time': created_time, + 'submit': submit, + 'run_cmd': run_cmd + } return None @@ -792,7 +802,9 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]], logger.warning(str(e)) continue - if job['status'] in [ + # Get the job status again to avoid race condition. + job_status = get_status_no_lock(job['job_id']) + if job_status in [ JobStatus.PENDING, JobStatus.SETTING_UP, JobStatus.RUNNING ]: _set_status_no_lock(job['job_id'], JobStatus.CANCELLED) From dfe43e2515782e5ae78d4978d877c1553889688e Mon Sep 17 00:00:00 2001 From: zpoint Date: Wed, 20 Nov 2024 16:28:29 +0800 Subject: [PATCH 03/14] merge #4284 --- tests/test_smoke.py | 206 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 169 insertions(+), 37 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 2e52ef0235f..cc45ac01683 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ # Change cloud for generic tests to aws # > pytest tests/test_smoke.py --generic-cloud aws +import enum import inspect import json import os @@ -60,6 +61,8 @@ from sky.data.data_utils import Rclone from sky.skylet import constants from sky.skylet import events +from sky.skylet.job_lib import JobStatus +from sky.status_lib import ClusterStatus from sky.utils import common_utils from sky.utils import resources_utils from sky.utils import subprocess_utils @@ -95,6 +98,84 @@ 'sleep 10; s=$(sky jobs queue);' 'echo "Waiting for job to stop RUNNING"; echo "$s"; done') +# Cluster functions +_ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) +_ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) + +_WAIT_UNTIL_CLUSTER_STATUS_IS = ( + # A while loop to wait until the cluster status + # becomes certain status, with timeout. 
+ 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster status \'{cluster_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky status {cluster_name} --refresh | ' + 'awk "/^{cluster_name}/ ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES + + ')$/) print \$i}}"); ' + 'if [[ "$current_status" =~ {cluster_status} ]]; ' + 'then echo "Target cluster status {cluster_status} reached."; break; fi; ' + 'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( + # A while loop to wait until the cluster is not found or timeout + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for cluster to be removed"; exit 1; ' + 'fi; ' + 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' + ' echo "Cluster {cluster_name} successfully removed."; break; ' + 'fi; ' + 'echo "Waiting for cluster {name} to be removed..."; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( + # A while loop to wait until the job status + # contains certain status, with timeout. + 'start_time=$SECONDS; ' + 'while true; do ' + 'if (( $SECONDS - $start_time > {timeout} )); then ' + ' echo "Timeout after {timeout} seconds waiting for job status \'{job_status}\'"; exit 1; ' + 'fi; ' + 'current_status=$(sky queue {cluster_name} | ' + 'awk "\\$1 == \\"{job_id}\\" ' + '{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES + + ')$/) print \$i}}"); ' + 'found=0; ' # Initialize found variable outside the loop + 'while read -r line; do ' # Read line by line + ' if [[ "$line" =~ {job_status} ]]; then ' # Check each line + ' echo "Target job status {job_status} reached."; ' + ' found=1; ' + ' break; ' # Break inner loop + ' fi; ' + 'done <<< "$current_status"; ' + 'if [ "$found" -eq 1 ]; then break; fi; ' # Break outer loop if match found + 'echo "Waiting for job status to contains {job_status}, current status: $current_status"; ' + 'sleep 10; ' + 'done') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "') + +_WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( + 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +# Managed job functions + +_WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( + 'sky queue {cluster_name}', + 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + +# After the timeout, the cluster will stop if autostop is set, and our check +# should be more than the timeout. To address this, we extend the timeout by +# _BUMP_UP_SECONDS before exiting. 
+_BUMP_UP_SECONDS = 35 + DEFAULT_CMD_TIMEOUT = 15 * 60 @@ -359,7 +440,6 @@ def test_minimal(generic_cloud: str): ) run_one_test(test) - # ---------- Test region ---------- @pytest.mark.aws def test_aws_region(): @@ -745,6 +825,12 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=60), + # Wait for EC2 instance to be in stopped state. + # TODO: event based wait. 'sleep 60', f'sky launch --clone-disk-from {name} -y -c {name}-clone --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', f'sky launch --clone-disk-from {name} -y -c {name}-clone-2 --cloud aws -d --region us-east-2 "cat ~/user_file.txt | grep hello"', @@ -791,8 +877,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - 'sleep 120', - f'sky status -r {name}; sky status {name} | grep "{name} not found"', + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. @@ -859,8 +945,10 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - 'sleep 60', - f'sky status -r {name} | grep "STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', @@ -881,7 +969,10 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - 'sleep 100', # Ensure this is large enough, else GCP leaks. + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', @@ -909,13 +1000,18 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - 'sleep 40', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', ) @@ -945,8 +1041,10 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- f'sleep {events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS}', - f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep FAILED', + _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=name, + job_status=JobStatus.FAILED.value, + timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', ) @@ -1764,7 +1862,13 @@ def test_multi_echo(generic_cloud: str): f'until sky logs {name} 32 --status; do echo "Waiting for job 32 to finish..."; sleep 1; done', ] + # Ensure jobs succeeded. - [f'sky logs {name} {i + 1} --status' for i in range(32)] + + [ + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=name, + job_id=i + 1, + job_status=JobStatus.SUCCEEDED.value, + timeout=120) for i in range(32) + ] + # Ensure monitor/autoscaler didn't crash on the 'assert not # unfulfilled' error. If process not found, grep->ssh returns 1. [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], @@ -2317,12 +2421,18 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - f'sleep 20', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - 'sleep 180', - f'sky status -r {name} | grep "INIT\|STOPPED"', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=200), ], f'sky down -y {name}', ) @@ -2345,9 +2455,12 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - 'sleep 260', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' - f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status= + f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + timeout=280) + + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], f'sky down -y {name}', timeout=30 * 60, # 30 mins @@ -2383,8 +2496,10 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). f'sky start -y {name}', @@ -2400,8 +2515,10 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 
'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout), # Test restarting the idleness timer via exec: f'sky start -y {name}', @@ -2411,8 +2528,10 @@ def test_autostop(generic_cloud: str): f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - f'sleep {autostop_timeout}', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', timeout=total_timeout_minutes * 60, @@ -2629,15 +2748,19 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - 'sleep 90', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - 'sleep 120', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep STOPPED', + _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + cluster_name=name, + cluster_status=ClusterStatus.STOPPED.value, + timeout=120), ], f'sky down -y {name}', ) @@ -2657,14 +2780,21 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "PENDING\|SUBMITTED\|STARTING\|RUNNING"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-2', + job_status= + f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + timeout=60), f'sky jobs cancel -y -n {name}-1', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name}-1 | head -n1 | grep CANCELLED', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=f'{name}-1', + job_status=f'{JobStatus.CANCELLED.value}', + timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"', @@ -2734,9 +2864,11 @@ def test_managed_jobs_failed_setup(generic_cloud: str): 'managed_jobs_failed_setup', [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - 'sleep 330', # Make sure the job failed quickly. 
- f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', + _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=name, + job_status=f'{JobStatus.FAILED_SETUP.value}', + timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. From 36b3616d57d940d57f9322d2bf049a011144192a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sun, 24 Nov 2024 11:11:37 -0800 Subject: [PATCH 04/14] [Storage] Call `sync_file_mounts` when either rsync or storage file_mounts are specified (#4317) do file mounts if storage is specified --- sky/execution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/execution.py b/sky/execution.py index 42a39a90380..e5cc8072d04 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -294,7 +294,8 @@ def _execute( do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and task.workdir is not None) do_file_mounts = (Stage.SYNC_FILE_MOUNTS in stages and not dryrun and - task.file_mounts is not None) + (task.file_mounts is not None or + task.storage_mounts is not None)) if do_workdir or do_file_mounts: logger.info(ux_utils.starting_message('Mounting files.')) From 0ece9a89862cdf49636975ead72386e4dedcf4f6 Mon Sep 17 00:00:00 2001 From: zpoint Date: Tue, 26 Nov 2024 17:40:57 +0800 Subject: [PATCH 05/14] merge #4386 --- tests/test_smoke.py | 388 ++++++++++++++++++++++++++++++-------------- 1 file changed, 268 insertions(+), 120 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index cc45ac01683..d75e5b21ca4 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -59,6 +59,7 @@ from sky.data import data_utils from sky.data import storage as storage_lib from sky.data.data_utils import Rclone +from sky.jobs.state import ManagedJobStatus from sky.skylet import constants from sky.skylet import events from sky.skylet.job_lib import JobStatus @@ -101,8 +102,20 @@ # Cluster functions _ALL_JOB_STATUSES = "|".join([status.value for status in JobStatus]) _ALL_CLUSTER_STATUSES = "|".join([status.value for status in ClusterStatus]) +_ALL_MANAGED_JOB_STATUSES = "|".join( + [status.value for status in ManagedJobStatus]) -_WAIT_UNTIL_CLUSTER_STATUS_IS = ( + +def _statuses_to_str(statuses: List[enum.Enum]): + """Convert a list of enums to a string with all the values separated by |.""" + assert len(statuses) > 0, 'statuses must not be empty' + if len(statuses) > 1: + return '(' + '|'.join([status.value for status in statuses]) + ')' + else: + return statuses[0].value + + +_WAIT_UNTIL_CLUSTER_STATUS_CONTAINS = ( # A while loop to wait until the cluster status # becomes certain status, with timeout. 
'start_time=$SECONDS; ' @@ -120,6 +133,29 @@ 'sleep 10; ' 'done') + +def _get_cmd_wait_until_cluster_status_contains( + cluster_name: str, cluster_status: List[ClusterStatus], timeout: int): + return _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.format( + cluster_name=cluster_name, + cluster_status=_statuses_to_str(cluster_status), + timeout=timeout) + + +def _get_cmd_wait_until_cluster_status_contains_wildcard( + cluster_name_wildcard: str, cluster_status: List[ClusterStatus], + timeout: int): + wait_cmd = _WAIT_UNTIL_CLUSTER_STATUS_CONTAINS.replace( + 'sky status {cluster_name}', + 'sky status "{cluster_name}"').replace('awk "/^{cluster_name}/', + 'awk "/^{cluster_name_awk}/') + return wait_cmd.format(cluster_name=cluster_name_wildcard, + cluster_name_awk=cluster_name_wildcard.replace( + '*', '.*'), + cluster_status=_statuses_to_str(cluster_status), + timeout=timeout) + + _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND = ( # A while loop to wait until the cluster is not found or timeout 'start_time=$SECONDS; ' @@ -130,10 +166,16 @@ 'if sky status -r {cluster_name}; sky status {cluster_name} | grep "{cluster_name} not found"; then ' ' echo "Cluster {cluster_name} successfully removed."; break; ' 'fi; ' - 'echo "Waiting for cluster {name} to be removed..."; ' + 'echo "Waiting for cluster {cluster_name} to be removed..."; ' 'sleep 10; ' 'done') + +def _get_cmd_wait_until_cluster_is_not_found(cluster_name: str, timeout: int): + return _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=cluster_name, + timeout=timeout) + + _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID = ( # A while loop to wait until the job status # contains certain status, with timeout. @@ -165,11 +207,51 @@ _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.replace( 'awk "\\$1 == \\"{job_id}\\"', 'awk "\\$2 == \\"{job_name}\\"') + +def _get_cmd_wait_until_job_status_contains_matching_job_id( + cluster_name: str, job_id: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + cluster_name=cluster_name, + job_id=job_id, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def _get_cmd_wait_until_job_status_contains_without_matching_job( + cluster_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + cluster_name=cluster_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + +def _get_cmd_wait_until_job_status_contains_matching_job_name( + cluster_name: str, job_name: str, job_status: List[JobStatus], + timeout: int): + return _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + cluster_name=cluster_name, + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + + # Managed job functions _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME = _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.replace( - 'sky queue {cluster_name}', - 'sky jobs queue').replace('awk "\\$2 == ', 'awk "\\$3 == ') + 'sky queue {cluster_name}', 'sky jobs queue').replace( + 'awk "\\$2 == \\"{job_name}\\"', + 'awk "\\$2 == \\"{job_name}\\" || \\$3 == \\"{job_name}\\"').replace( + _ALL_JOB_STATUSES, _ALL_MANAGED_JOB_STATUSES) + + +def _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name: str, job_status: List[JobStatus], timeout: int): + return _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + job_name=job_name, + job_status=_statuses_to_str(job_status), + timeout=timeout) + # After 
the timeout, the cluster will stop if autostop is set, and our check # should be more than the timeout. To address this, we extend the timeout by @@ -440,6 +522,7 @@ def test_minimal(generic_cloud: str): ) run_one_test(test) + # ---------- Test region ---------- @pytest.mark.aws def test_aws_region(): @@ -825,9 +908,9 @@ def test_clone_disk_aws(): f'sky launch -y -c {name} --cloud aws --region us-east-2 --retry-until-up "echo hello > ~/user_file.txt"', f'sky launch --clone-disk-from {name} -y -c {name}-clone && exit 1 || true', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=60), # Wait for EC2 instance to be in stopped state. # TODO: event based wait. @@ -877,8 +960,8 @@ def test_gcp_mig(): # Check MIG exists. f'gcloud compute instance-groups managed list --format="value(name)" | grep "^sky-mig-{name}"', f'sky autostop -i 0 --down -y {name}', - _WAIT_UNTIL_CLUSTER_IS_NOT_FOUND.format(cluster_name=name, - timeout=120), + _get_cmd_wait_until_cluster_is_not_found(cluster_name=name, + timeout=120), f'gcloud compute instance-templates list | grep "sky-it-{name}"', # Launch again with the same region. The original instance template # should be removed. @@ -945,9 +1028,9 @@ def test_custom_default_conda_env(generic_cloud: str): f'sky exec {name} tests/test_yamls/test_custom_default_conda_env.yaml', f'sky logs {name} 2 --status', f'sky autostop -y -i 0 {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=80), f'sky start -y {name}', f'sky logs {name} 2 --no-follow | grep -E "myenv\\s+\\*"', @@ -969,9 +1052,9 @@ def test_stale_job(generic_cloud: str): f'sky launch -y -c {name} --cloud {generic_cloud} "echo hi"', f'sky exec {name} -d "echo start; sleep 10000"', f'sky stop {name} -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=100), f'sky start {name} -y', f'sky logs {name} 1 --status', @@ -1000,17 +1083,17 @@ def test_aws_stale_job_manual_restart(): '--output text`; ' f'aws ec2 stop-instances --region {region} ' '--instance-ids $id', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky launch -c {name} -y "echo hi"', f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. - _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + _get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED.value, + job_status=[JobStatus.FAILED], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS), ], f'sky down -y {name}', @@ -1041,9 +1124,9 @@ def test_gcp_stale_job_manual_restart(): f'sky logs {name} 1 --status', f'sky logs {name} 3 --status', # Ensure the skylet updated the stale job status. 
- _WAIT_UNTIL_JOB_STATUS_CONTAINS_WITHOUT_MATCHING_JOB.format( + _get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=JobStatus.FAILED.value, + job_status=[JobStatus.FAILED.value], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}', @@ -1721,6 +1804,7 @@ def test_large_job_queue(generic_cloud: str): f'for i in `seq 1 75`; do sky exec {name} -n {name}-$i -d "echo $i; sleep 100000000"; done', f'sky cancel -y {name} 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16', 'sleep 90', + # Each job takes 0.5 CPU and the default VM has 8 CPUs, so there should be 8 / 0.5 = 16 jobs running. # The first 16 jobs are canceled, so there should be 75 - 32 = 43 jobs PENDING. f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep -v grep | grep PENDING | wc -l | grep 43', @@ -1863,10 +1947,10 @@ def test_multi_echo(generic_cloud: str): ] + # Ensure jobs succeeded. [ - _WAIT_UNTIL_JOB_STATUS_CONTAINS_MATCHING_JOB_ID.format( + _get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, job_id=i + 1, - job_status=JobStatus.SUCCEEDED.value, + job_status=[JobStatus.SUCCEEDED], timeout=120) for i in range(32) ] + # Ensure monitor/autoscaler didn't crash on the 'assert not @@ -2421,17 +2505,16 @@ def test_gcp_start_stop(): f'sky exec {name} "prlimit -n --pid=\$(pgrep -f \'raylet/raylet --raylet_socket_name\') | grep \'"\'1048576 1048576\'"\'"', # Ensure the raylet process has the correct file descriptor limit. f'sky logs {name} 3 --status', # Ensure the job succeeded. f'sky stop -y {name}', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=40), f'sky start -y {name} -i 1', f'sky exec {name} examples/gcp_start_stop.yaml', f'sky logs {name} 4 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=200), ], f'sky down -y {name}', @@ -2455,10 +2538,9 @@ def test_azure_start_stop(): f'sky start -y {name} -i 1', f'sky exec {name} examples/azure_start_stop.yaml', f'sky logs {name} 3 --status', # Ensure the job succeeded. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status= - f'({ClusterStatus.STOPPED.value}|{ClusterStatus.INIT.value})', + cluster_status=[ClusterStatus.STOPPED, ClusterStatus.INIT], timeout=280) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}', ], @@ -2496,9 +2578,9 @@ def test_autostop(generic_cloud: str): f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', # Ensure the cluster is STOPPED. - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Ensure the cluster is UP and the autostop setting is reset ('-'). @@ -2515,9 +2597,9 @@ def test_autostop(generic_cloud: str): f'sky autostop -y {name} -i 1', # Should restart the timer. 
'sleep 40', f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout), # Test restarting the idleness timer via exec: @@ -2527,10 +2609,9 @@ def test_autostop(generic_cloud: str): 'sleep 45', # Almost reached the threshold. f'sky exec {name} echo hi', # Should restart the timer. 'sleep 45', - f's=$(sky status {name} --refresh); echo "$s"; echo; echo; echo "$s" | grep {name} | grep UP', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=autostop_timeout + _BUMP_UP_SECONDS), ], f'sky down -y {name}', @@ -2748,18 +2829,18 @@ def test_stop_gcp_spot(): f'sky exec {name} -- ls myfile', f'sky logs {name} 2 --status', f'sky autostop {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=90), f'sky start {name} -y', f'sky exec {name} -- ls myfile', f'sky logs {name} 3 --status', # -i option at launch should go through: f'sky launch -c {name} -i0 -y', - _WAIT_UNTIL_CLUSTER_STATUS_IS.format( + _get_cmd_wait_until_cluster_status_contains( cluster_name=name, - cluster_status=ClusterStatus.STOPPED.value, + cluster_status=[ClusterStatus.STOPPED], timeout=120), ], f'sky down -y {name}', @@ -2780,20 +2861,24 @@ def test_managed_jobs(generic_cloud: str): [ f'sky jobs launch -n {name}-1 --cloud {generic_cloud} examples/managed_job.yaml -y -d', f'sky jobs launch -n {name}-2 --cloud {generic_cloud} examples/managed_job.yaml -y -d', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], timeout=60), - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-2', - job_status= - f'({JobStatus.PENDING.value}|{JobStatus.INIT.value}|{JobStatus.RUNNING.value})', + job_status=[ + ManagedJobStatus.PENDING, ManagedJobStatus.SUBMITTED, + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], timeout=60), f'sky jobs cancel -y -n {name}-1', - _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=f'{name}-1', - job_status=f'{JobStatus.CANCELLED.value}', + job_status=[ManagedJobStatus.CANCELLED], timeout=230), # Test the functionality for logging. f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"', @@ -2865,9 +2950,9 @@ def test_managed_jobs_failed_setup(generic_cloud: str): [ f'sky jobs launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', # Make sure the job failed quickly. 
- _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format( + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( job_name=name, - job_status=f'{JobStatus.FAILED_SETUP.value}', + job_status=[ManagedJobStatus.FAILED_SETUP], timeout=330 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', @@ -2891,7 +2976,10 @@ def test_managed_jobs_pipeline_failed_setup(generic_cloud: str): 'managed_jobs_pipeline_failed_setup', [ f'sky jobs launch -n {name} -y -d tests/test_yamls/failed_setup_pipeline.yaml', - 'sleep 600', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.FAILED_SETUP], + timeout=600), # Make sure the job failed quickly. f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "FAILED_SETUP"', # Task 0 should be SUCCEEDED. @@ -2925,8 +3013,10 @@ def test_managed_jobs_recovery_aws(aws_config_region): 'managed_jobs_recovery_aws', [ f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=600), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -2936,8 +3026,10 @@ def test_managed_jobs_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -2965,15 +3057,19 @@ def test_managed_jobs_recovery_gcp(): 'managed_jobs_recovery_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=300), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the cluster manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo "$RUN_ID"; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -2996,8 +3092,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): 'managed_jobs_pipeline_recovery_aws', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. @@ -3016,8 +3114,10 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3047,8 +3147,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): 'managed_jobs_pipeline_recovery_gcp', [ f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids', # Terminate the cluster manually. 
@@ -3059,8 +3161,10 @@ def test_managed_jobs_pipeline_recovery_gcp(): f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 200', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=200), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID: | grep "$RUN_ID"', f'RUN_IDS=$(sky jobs logs -n {name} --no-follow | grep -A 4 SKYPILOT_TASK_IDS | cut -d")" -f2); echo "$RUN_IDS" | tee /tmp/{name}-run-ids-new', f'diff /tmp/{name}-run-ids /tmp/{name}-run-ids-new', @@ -3086,8 +3190,12 @@ def test_managed_jobs_recovery_default_resources(generic_cloud: str): 'managed-spot-recovery-default-resources', [ f'sky jobs launch -n {name} --cloud {generic_cloud} --use-spot "sleep 30 && sudo shutdown now && sleep 1000" -y -d', - 'sleep 360', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|RECOVERING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.RECOVERING + ], + timeout=360), ], f'sky jobs cancel -y -n {name}', timeout=25 * 60, @@ -3107,8 +3215,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): 'managed_jobs_recovery_multi_node_aws', [ f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 450', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=450), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. (f'aws ec2 terminate-instances --region {region} --instance-ids $(' @@ -3119,8 +3229,10 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region): '--output text)'), _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 560', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3148,15 +3260,19 @@ def test_managed_jobs_recovery_multi_node_gcp(): 'managed_jobs_recovery_multi_node_gcp', [ f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d', - 'sleep 400', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=400), f'RUN_ID=$(sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2); echo "$RUN_ID" | tee /tmp/{name}-run-id', # Terminate the worker manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', - 'sleep 420', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.RUNNING], + timeout=560), f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky jobs logs -n {name} --no-follow | grep SKYPILOT_TASK_ID | cut -d: -f2 | grep "$RUN_ID"', ], f'sky jobs cancel -y -n {name}', @@ -3181,13 +3297,17 @@ def test_managed_jobs_cancellation_aws(aws_config_region): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ + ManagedJobStatus.STARTING, ManagedJobStatus.RUNNING + ], + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3195,12 +3315,16 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-2', + job_status=[ManagedJobStatus.RUNNING], + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-2', + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), (f's=$(aws ec2 describe-instances --region {region} ' f'--filters Name=tag:ray-cluster-name,Values={name_2_on_cloud}-* ' f'--query Reservations[].Instances[].State[].Name ' @@ -3208,8 +3332,11 @@ def test_managed_jobs_cancellation_aws(aws_config_region): ), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + # The job is running in the cluster, will shown as RUNNING. + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-3', + job_status=[ManagedJobStatus.RUNNING], + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. 
(f'aws ec2 terminate-instances --region {region} --instance-ids $(' f'aws ec2 describe-instances --region {region} ' @@ -3219,10 +3346,10 @@ def test_managed_jobs_cancellation_aws(aws_config_region): _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-3', + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (shutting-down) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$(aws ec2 describe-instances --region {region} ' @@ -3257,34 +3384,42 @@ def test_managed_jobs_cancellation_gcp(): [ # Test cancellation during spot cluster being launched. f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d', - 'sleep 60', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.STARTING], + timeout=60 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), # Test cancelling the spot cluster during spot job being setup. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d', - 'sleep 300', + # The job is set up in the cluster, will shown as RUNNING. + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-2', + job_status=[ManagedJobStatus.RUNNING], + timeout=300 + _BUMP_UP_SECONDS), f'sky jobs cancel -y -n {name}-2', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-2', + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), # Test cancellation during spot job is recovering. f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d', - 'sleep 300', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RUNNING"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-3', + job_status=[ManagedJobStatus.RUNNING], + timeout=300 + _BUMP_UP_SECONDS), # Terminate the cluster manually. 
terminate_cmd, _JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'), f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"', f'sky jobs cancel -y -n {name}-3', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLING\|CANCELLED"', - 'sleep 120', - f'{_GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "CANCELLED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=f'{name}-3', + job_status=[ManagedJobStatus.CANCELLED], + timeout=120 + _BUMP_UP_SECONDS), # The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because # there can be multiple VM with the same name due to the recovery. (f's=$({query_state_cmd}) && echo "$s" && echo; [[ -z "$s" ]] || echo "$s" | grep -v -E "PROVISIONING|STAGING|RUNNING|REPAIRING|TERMINATED|SUSPENDING|SUSPENDED|SUSPENDED"' @@ -3374,8 +3509,10 @@ def test_managed_jobs_storage(generic_cloud: str): *STORAGE_SETUP_COMMANDS, f'sky jobs launch -n {name}{use_spot} --cloud {generic_cloud}{region_flag} {file_path} -y', region_validation_cmd, # Check if the bucket is created in the correct region - 'sleep 60', # Wait the spot queue to be updated - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.SUCCEEDED], + timeout=60 + _BUMP_UP_SECONDS), f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd @@ -3399,10 +3536,17 @@ def test_managed_jobs_tpu(): 'test-spot-tpu', [ f'sky jobs launch -n {name} --use-spot examples/tpu/tpuvm_mnist.yaml -y -d', - 'sleep 5', - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep STARTING', - 'sleep 900', # TPU takes a while to launch - f'{_GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RUNNING\|SUCCEEDED"', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.STARTING], + timeout=60 + _BUMP_UP_SECONDS), + # TPU takes a while to launch + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ + ManagedJobStatus.RUNNING, ManagedJobStatus.SUCCEEDED + ], + timeout=900 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3420,8 +3564,10 @@ def test_managed_jobs_inline_env(generic_cloud: str): 'test-managed-jobs-inline-env', [ f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"', - 'sleep 20', - f'{_GET_JOB_QUEUE} | grep {name} | grep SUCCEEDED', + _get_cmd_wait_until_managed_job_status_contains_matching_job_name( + job_name=name, + job_status=[ManagedJobStatus.SUCCEEDED], + timeout=20 + _BUMP_UP_SECONDS), ], f'sky jobs cancel -y -n {name}', # Increase timeout since sky jobs queue -r can be blocked by other spot tests. @@ -3528,8 +3674,10 @@ def test_azure_start_stop_two_nodes(): f'sky start -y {name} -i 1', f'sky exec --num-nodes=2 {name} examples/azure_start_stop.yaml', f'sky logs {name} 2 --status', # Ensure the job succeeded. 
- 'sleep 200', - f's=$(sky status -r {name}) && echo "$s" && echo "$s" | grep "INIT\|STOPPED"' + _get_cmd_wait_until_cluster_status_contains( + cluster_name=name, + cluster_status=[ClusterStatus.INIT, ClusterStatus.STOPPED], + timeout=200 + _BUMP_UP_SECONDS) + f'|| {{ ssh {name} "cat ~/.sky/skylet.log"; exit 1; }}' ], f'sky down -y {name}', From 07f87dd718909bcc5718c7a8b2f0a5b361a63d3c Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Sun, 24 Nov 2024 23:57:48 -0800 Subject: [PATCH 06/14] Fix Spot instance on Azure (#4408) --- sky/provision/azure/instance.py | 3 ++- sky/templates/azure-ray.yml.j2 | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 60159232787..c54199bd25d 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -305,7 +305,8 @@ def _create_vm( network_profile=network_profile, identity=compute.VirtualMachineIdentity( type='UserAssigned', - user_assigned_identities={provider_config['msi']: {}})) + user_assigned_identities={provider_config['msi']: {}}), + priority=node_config['azure_arm_parameters']['priority']) vm_poller = compute_client.virtual_machines.begin_create_or_update( resource_group_name=provider_config['resource_group'], vm_name=vm_name, diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 7b9737748d3..1140704a708 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -75,9 +75,6 @@ available_node_types: {%- if use_spot %} # optionally set priority to use Spot instances priority: Spot - # set a maximum price for spot instances if desired - # billingProfile: - # maxPrice: -1 {%- endif %} cloudInitSetupCommands: |- {%- for cmd in cloud_init_setup_commands %} From a2ebb05a68c4f6999587487a10f0b22be211249f Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Mon, 25 Nov 2024 14:21:51 -0800 Subject: [PATCH 07/14] Fix OD instance on Azure (#4411) --- sky/provision/azure/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index c54199bd25d..700d31c597f 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -306,7 +306,7 @@ def _create_vm( identity=compute.VirtualMachineIdentity( type='UserAssigned', user_assigned_identities={provider_config['msi']: {}}), - priority=node_config['azure_arm_parameters']['priority']) + priority=node_config['azure_arm_parameters'].get('priority', None)) vm_poller = compute_client.virtual_machines.begin_create_or_update( resource_group_name=provider_config['resource_group'], vm_name=vm_name, From ab0e2cc001efa5433e9139309ec6f92dcccb1be1 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Tue, 3 Dec 2024 11:01:10 -0800 Subject: [PATCH 08/14] avoid catching ValueError during failover (#4432) * avoid catching ValueError during failover If the cloud api raises ValueError or a subclass of ValueError during instance termination, we will assume the cluster was downed. Fix this by introducing a new exception ClusterDoesNotExist that we can catch instead of the more general ValueError. 
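A minimal sketch of the compatibility idea behind this change (simplified; the real class and the updated handlers appear in the diffs below): because the new exception subclasses ValueError, legacy call sites that still catch ValueError keep working, while the failover path can now catch only the narrow "cluster is gone" case.

    # Illustrative sketch only; the actual class is added to sky/exceptions.py below.
    class ClusterDoesNotExist(ValueError):
        """Raised when operating on a cluster that does not exist."""


    def _terminate_cluster_example(cluster_name: str) -> None:
        try:
            # Stand-in for sky.down(cluster_name), which raises once the cluster is gone.
            raise ClusterDoesNotExist(f'Cluster {cluster_name!r} does not exist.')
        except ClusterDoesNotExist:
            return  # Already down; nothing left to terminate.


    # Call sites that still catch the broader ValueError keep working unchanged.
    assert issubclass(ClusterDoesNotExist, ValueError)
    _terminate_cluster_example('my-cluster')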
* add unit test * lint --- sky/backends/backend_utils.py | 9 ++-- sky/core.py | 43 +++++++++++-------- sky/exceptions.py | 7 ++++ sky/execution.py | 5 ++- sky/jobs/recovery_strategy.py | 3 +- tests/unit_tests/test_recovery_strategy.py | 48 ++++++++++++++++++++++ 6 files changed, 90 insertions(+), 25 deletions(-) create mode 100644 tests/unit_tests/test_recovery_strategy.py diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index da1e68991a4..a411ffa8344 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -1567,14 +1567,14 @@ def check_can_clone_disk_and_override_task( The task to use and the resource handle of the source cluster. Raises: - ValueError: If the source cluster does not exist. + exceptions.ClusterDoesNotExist: If the source cluster does not exist. exceptions.NotSupportedError: If the source cluster is not valid or the task is not compatible to clone disk from the source cluster. """ source_cluster_status, handle = refresh_cluster_status_handle(cluster_name) if source_cluster_status is None: with ux_utils.print_exception_no_traceback(): - raise ValueError( + raise exceptions.ClusterDoesNotExist( f'Cannot find cluster {cluster_name!r} to clone disk from.') if not isinstance(handle, backends.CloudVmRayResourceHandle): @@ -2060,7 +2060,7 @@ def check_cluster_available( """Check if the cluster is available. Raises: - ValueError: if the cluster does not exist. + exceptions.ClusterDoesNotExist: if the cluster does not exist. exceptions.ClusterNotUpError: if the cluster is not UP. exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend. @@ -2125,7 +2125,8 @@ def check_cluster_available( error_msg += message with ux_utils.print_exception_no_traceback(): - raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}') + raise exceptions.ClusterDoesNotExist( + f'{colorama.Fore.YELLOW}{error_msg}{reset}') assert cluster_status is not None, 'handle is not None but status is None' backend = get_backend_from_handle(handle) if check_cloud_vm_ray_backend and not isinstance( diff --git a/sky/core.py b/sky/core.py index 496b8b8ad5e..35bce9fa7eb 100644 --- a/sky/core.py +++ b/sky/core.py @@ -268,7 +268,8 @@ def _start( cluster_status, handle = backend_utils.refresh_cluster_status_handle( cluster_name) if handle is None: - raise ValueError(f'Cluster {cluster_name!r} does not exist.') + raise exceptions.ClusterDoesNotExist( + f'Cluster {cluster_name!r} does not exist.') if not force and cluster_status == status_lib.ClusterStatus.UP: sky_logging.print(f'Cluster {cluster_name!r} is already up.') return handle @@ -359,12 +360,13 @@ def start( Useful for upgrading SkyPilot runtime. Raises: - ValueError: argument values are invalid: (1) the specified cluster does - not exist; (2) if ``down`` is set to True but - ``idle_minutes_to_autostop`` is None; (3) if the specified cluster is - the managed jobs controller, and either ``idle_minutes_to_autostop`` - is not None or ``down`` is True (omit them to use the default - autostop settings). + ValueError: argument values are invalid: (1) if ``down`` is set to True + but ``idle_minutes_to_autostop`` is None; (2) if the specified + cluster is the managed jobs controller, and either + ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit + them to use the default autostop settings). + sky.exceptions.ClusterDoesNotExist: the specified cluster does not + exist. 
sky.exceptions.NotSupportedError: if the cluster to restart was launched using a non-default backend that does not support this operation. @@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None: related resources. Raises: - ValueError: the specified cluster does not exist. + sky.exceptions.ClusterDoesNotExist: the specified cluster does not + exist. RuntimeError: failed to stop the cluster. sky.exceptions.NotSupportedError: if the specified cluster is a spot cluster, or a TPU VM Pod cluster, or the managed jobs controller. @@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None: f'is not supported.') handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: - raise ValueError(f'Cluster {cluster_name!r} does not exist.') + raise exceptions.ClusterDoesNotExist( + f'Cluster {cluster_name!r} does not exist.') backend = backend_utils.get_backend_from_handle(handle) @@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None: resources. Raises: - ValueError: the specified cluster does not exist. + sky.exceptions.ClusterDoesNotExist: the specified cluster does not + exist. RuntimeError: failed to tear down the cluster. sky.exceptions.NotSupportedError: the specified cluster is the managed jobs controller. """ handle = global_user_state.get_handle_from_cluster_name(cluster_name) if handle is None: - raise ValueError(f'Cluster {cluster_name!r} does not exist.') + raise exceptions.ClusterDoesNotExist( + f'Cluster {cluster_name!r} does not exist.') usage_lib.record_cluster_name_for_current_operation(cluster_name) backend = backend_utils.get_backend_from_handle(handle) @@ -521,7 +527,7 @@ def autostop( rather than autostop (restartable). Raises: - ValueError: if the cluster does not exist. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend or the cluster is TPU VM Pod. @@ -615,7 +621,7 @@ def queue(cluster_name: str, } ] raises: - ValueError: if the cluster does not exist. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend. @@ -674,7 +680,8 @@ def cancel( worker node is preempted in the spot cluster. Raises: - ValueError: if arguments are invalid, or the cluster does not exist. + ValueError: if arguments are invalid. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the specified cluster is a controller that does not support this operation. @@ -749,8 +756,8 @@ def tail_logs(cluster_name: str, Please refer to the sky.cli.tail_logs for the document. Raises: - ValueError: arguments are invalid or the cluster is not supported or - the cluster does not exist. + ValueError: if arguments are invalid or the cluster is not supported. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend. @@ -792,7 +799,7 @@ def download_logs( Returns: Dict[str, str]: a mapping of job_id to local log path. Raises: - ValueError: if the cluster does not exist. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. 
sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend. @@ -837,7 +844,7 @@ def job_status(cluster_name: str, If job_ids is None and there is no job on the cluster, it will return {None: None}. Raises: - ValueError: if the cluster does not exist. + sky.exceptions.ClusterDoesNotExist: if the cluster does not exist. sky.exceptions.ClusterNotUpError: if the cluster is not UP. sky.exceptions.NotSupportedError: if the cluster is not based on CloudVmRayBackend. diff --git a/sky/exceptions.py b/sky/exceptions.py index f78c6605261..0eef19a5add 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -129,6 +129,13 @@ class ClusterSetUpError(Exception): pass +class ClusterDoesNotExist(ValueError): + """Raise when trying to operate on a cluster that does not exist.""" + # This extends ValueError for compatibility reasons - we used to throw + # ValueError instead of this. + pass + + class NotSupportedError(Exception): """Raised when a feature is not supported.""" pass diff --git a/sky/execution.py b/sky/execution.py index e5cc8072d04..6a464d7fae8 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -524,8 +524,9 @@ def exec( # pylint: disable=redefined-builtin submitted. Raises: - ValueError: if the specified cluster does not exist or is not in UP - status. + ValueError: if the specified cluster is not in UP status. + sky.exceptions.ClusterDoesNotExist: if the specified cluster does not + exist. sky.exceptions.NotSupportedError: if the specified cluster is a controller that does not support this operation. diff --git a/sky/jobs/recovery_strategy.py b/sky/jobs/recovery_strategy.py index e5c607696d8..369b4e62dc8 100644 --- a/sky/jobs/recovery_strategy.py +++ b/sky/jobs/recovery_strategy.py @@ -45,8 +45,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None: usage_lib.messages.usage.set_internal() sky.down(cluster_name) return - except ValueError: + except exceptions.ClusterDoesNotExist: # The cluster is already down. 
+ logger.debug(f'The cluster {cluster_name} is already down.') return except Exception as e: # pylint: disable=broad-except retry_cnt += 1 diff --git a/tests/unit_tests/test_recovery_strategy.py b/tests/unit_tests/test_recovery_strategy.py new file mode 100644 index 00000000000..da8e8142da0 --- /dev/null +++ b/tests/unit_tests/test_recovery_strategy.py @@ -0,0 +1,48 @@ +from unittest import mock + +from sky.exceptions import ClusterDoesNotExist +from sky.jobs import recovery_strategy + + +@mock.patch('sky.down') +@mock.patch('sky.usage.usage_lib.messages.usage.set_internal') +def test_terminate_cluster_retry_on_value_error(mock_set_internal, + mock_sky_down) -> None: + # Set up mock to fail twice with ValueError, then succeed + mock_sky_down.side_effect = [ + ValueError('Mock error 1'), + ValueError('Mock error 2'), + None, + ] + + # Call should succeed after retries + recovery_strategy.terminate_cluster('test-cluster') + + # Verify sky.down was called 3 times + assert mock_sky_down.call_count == 3 + mock_sky_down.assert_has_calls([ + mock.call('test-cluster'), + mock.call('test-cluster'), + mock.call('test-cluster'), + ]) + + # Verify usage.set_internal was called before each sky.down + assert mock_set_internal.call_count == 3 + + +@mock.patch('sky.down') +@mock.patch('sky.usage.usage_lib.messages.usage.set_internal') +def test_terminate_cluster_handles_nonexistent_cluster(mock_set_internal, + mock_sky_down) -> None: + # Set up mock to raise ClusterDoesNotExist + mock_sky_down.side_effect = ClusterDoesNotExist('test-cluster') + + # Call should succeed silently + recovery_strategy.terminate_cluster('test-cluster') + + # Verify sky.down was called once + assert mock_sky_down.call_count == 1 + mock_sky_down.assert_called_once_with('test-cluster') + + # Verify usage.set_internal was called once + assert mock_set_internal.call_count == 1 From 86a35647acb3defc601b4872faddab50593d403f Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 18:44:55 +0800 Subject: [PATCH 09/14] hot fix to support smoke tests --- tests/test_smoke.py | 53 ++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index d75e5b21ca4..16dde046281 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -312,20 +312,31 @@ def _terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str: def run_one_test(test: Test) -> Tuple[int, str, str]: # Fail fast if `sky` CLI somehow errors out. subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True) - log_file = tempfile.NamedTemporaryFile('a', - prefix=f'{test.name}-', - suffix='.log', - delete=False) - test.echo(f'Test started. Log: less {log_file.name}') + log_to_stdout = os.environ.get('LOG_TO_STDOUT', None) + if log_to_stdout: + write = test.echo + flush = lambda: None + subprocess_out = sys.stderr + test.echo(f'Test started. Log to stdout') + else: + log_file = tempfile.NamedTemporaryFile('a', + prefix=f'{test.name}-', + suffix='.log', + delete=False) + write = log_file.write + flush = log_file.flush + subprocess_out = log_file + test.echo(f'Test started. 
Log: less {log_file.name}') + env_dict = os.environ.copy() if test.env: env_dict.update(test.env) for command in test.commands: - log_file.write(f'+ {command}\n') - log_file.flush() + write(f'+ {command}\n') + flush() proc = subprocess.Popen( command, - stdout=log_file, + stdout=subprocess_out, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash', @@ -334,11 +345,11 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: try: proc.wait(timeout=test.timeout) except subprocess.TimeoutExpired as e: - log_file.flush() + flush() test.echo(f'Timeout after {test.timeout} seconds.') test.echo(str(e)) - log_file.write(f'Timeout after {test.timeout} seconds.\n') - log_file.flush() + write(f'Timeout after {test.timeout} seconds.\n') + flush() # Kill the current process. proc.terminate() proc.returncode = 1 # None if we don't set it. @@ -353,22 +364,29 @@ def run_one_test(test: Test) -> Tuple[int, str, str]: if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}') reason = f'\nReason: {command}' if proc.returncode else '' msg = (f'{outcome}.' - f'{reason}' - f'\nLog: less {log_file.name}\n') - test.echo(msg) - log_file.write(msg) + f'{reason}') + if log_to_stdout: + test.echo(msg) + else: + msg += f'\nLog: less {log_file.name}\n' + test.echo(msg) + write(msg) + if (proc.returncode == 0 or pytest.terminate_on_failure) and test.teardown is not None: subprocess_utils.run( test.teardown, - stdout=log_file, + stdout=subprocess_out, stderr=subprocess.STDOUT, timeout=10 * 60, # 10 mins shell=True, ) if proc.returncode: - raise Exception(f'test failed: less {log_file.name}') + if log_to_stdout: + raise Exception(f'test failed') + else: + raise Exception(f'test failed: less {log_file.name}') def get_aws_region_for_quota_failover() -> Optional[str]: @@ -3513,6 +3531,7 @@ def test_managed_jobs_storage(generic_cloud: str): job_name=name, job_status=[ManagedJobStatus.SUCCEEDED], timeout=60 + _BUMP_UP_SECONDS), + f'sleep 30', f'[ $(aws s3api list-buckets --query "Buckets[?contains(Name, \'{storage_name}\')].Name" --output text | wc -l) -eq 0 ]', # Check if file was written to the mounted output bucket output_check_cmd From 4e42ddd6bb2989f48f95a7751ef2de4bb98d8a39 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Wed, 4 Dec 2024 21:03:14 +0800 Subject: [PATCH 10/14] bump up version --- Dockerfile | 2 +- sky/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d8954cdc351..a31c99e2f95 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,4 +8,4 @@ RUN conda install -c conda-forge google-cloud-sdk && \ rm -rf /var/lib/apt/lists/* # Install sky -RUN pip install --no-cache-dir "skypilot[all]==0.7.0" +RUN pip install --no-cache-dir "skypilot[all]==0.7.1" diff --git a/sky/__init__.py b/sky/__init__.py index bb47770c716..17c61d86ec2 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -35,7 +35,7 @@ def _get_git_commit(): __commit__ = _get_git_commit() -__version__ = '0.7.0' +__version__ = '0.7.1' __root_dir__ = os.path.dirname(os.path.abspath(__file__)) From 66530262baed2f62c2998eb4f86bb65af94c1aae Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Thu, 5 Dec 2024 13:13:03 +0800 Subject: [PATCH 11/14] pytest hot fix from master branch --- tests/test_optimizer_dryruns.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index f1af9a0d9ee..7d00f4df2bc 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -663,7 +663,8 @@ def 
test_infer_cloud_from_region_or_zone(monkeypatch): _test_resources_launch(monkeypatch, zone='us-west2-a') # Maps to AWS. - _test_resources_launch(monkeypatch, region='us-east-2') + # Not use us-east-2 or us-west-1 as it is also supported by Lambda. + _test_resources_launch(monkeypatch, region='eu-south-1') _test_resources_launch(monkeypatch, zone='us-west-2a') # `sky launch` From 77b11f32bf69407aee3420c3015e3b71b68669f3 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Sat, 7 Dec 2024 17:08:21 -0800 Subject: [PATCH 12/14] [robustness] cover some potential resource leakage cases (#4443) * if a newly-created cluster is missing from the cloud, wait before deleting Addresses #4431. * confirm cluster actually terminates before deleting from the db * avoid deleting cluster data outside the primary provision loop * tweaks * Apply suggestions from code review Co-authored-by: Zhanghao Wu * use usage_intervals for new cluster detection get_cluster_duration will include the total duration of the cluster since its initial launch, while launched_at may be reset by sky launch on an existing cluster. So this is a more accurate method to check. * fix terminating/stopping state for Lambda and Paperspace * Revert "use usage_intervals for new cluster detection" This reverts commit aa6d2e9f8462c4e68196e9a6420c6781c9ff116b. * check cloud.STATUS_VERSION before calling query_instances * avoid try/catch when querying instances * update comments --------- Co-authored-by: Zhanghao Wu --- sky/backends/backend_utils.py | 49 ++++++++++++++++++---- sky/backends/cloud_vm_ray_backend.py | 58 ++++++++++++++++++++++---- sky/provision/azure/instance.py | 2 +- sky/provision/gcp/instance.py | 2 + sky/provision/lambda_cloud/instance.py | 2 +- sky/provision/paperspace/instance.py | 3 +- 6 files changed, 95 insertions(+), 21 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index a411ffa8344..c45524cbe5a 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -111,6 +111,16 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' 'please retry after a while.') +# If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't +# see any instances in the cloud, the instances might be in the proccess of +# being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double +# check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY +# should be set longer than the delay between (sending the create instance +# request) and (the instances appearing on the cloud). +# See https://github.com/skypilot-org/skypilot/issues/4431. +_LAUNCH_DOUBLE_CHECK_WINDOW = 60 +_LAUNCH_DOUBLE_CHECK_DELAY = 1 + # Include the fields that will be used for generating tags that distinguishes # the cluster in ray, to avoid the stopped cluster being discarded due to # updates in the yaml template. @@ -1732,13 +1742,12 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool: logger.debug( f'Refreshing status ({cluster_name!r}) failed to get IPs.') except RuntimeError as e: - logger.debug(str(e)) + logger.debug(common_utils.format_exception(e)) except Exception as e: # pylint: disable=broad-except # This can be raised by `external_ssh_ports()`, due to the # underlying call to kubernetes API. 
- logger.debug( - f'Refreshing status ({cluster_name!r}) failed: ' - f'{common_utils.format_exception(e, use_bracket=True)}') + logger.debug(f'Refreshing status ({cluster_name!r}) failed: ', + exc_info=e) return False # Determining if the cluster is healthy (UP): @@ -1765,6 +1774,24 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool: return record # All cases below are transitioning the cluster to non-UP states. + + if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >= + clouds.StatusVersion.SKYPILOT): + # Note: launched_at is set during sky launch, even on an existing + # cluster. This will catch the case where the cluster was terminated on + # the cloud and restarted by sky launch. + time_since_launch = time.time() - record['launched_at'] + if (record['status'] == status_lib.ClusterStatus.INIT and + time_since_launch < _LAUNCH_DOUBLE_CHECK_WINDOW): + # It's possible the instances for this cluster were just created, + # and haven't appeared yet in the cloud API/console. Wait for a bit + # and check again. This is a best-effort leak prevention check. + # See https://github.com/skypilot-org/skypilot/issues/4431. + time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY) + node_statuses = _query_cluster_status_via_cloud_api(handle) + # Note: even if all the node_statuses are UP now, we will still + # consider this cluster abnormal, and its status will be INIT. + if len(node_statuses) > handle.launched_nodes: # Unexpected: in the queried region more than 1 cluster with the same # constructed name tag returned. This will typically not happen unless @@ -1793,13 +1820,15 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool: f'{colorama.Style.RESET_ALL}') assert len(node_statuses) <= handle.launched_nodes - # If the node_statuses is empty, all the nodes are terminated. We can - # safely set the cluster status to TERMINATED. This handles the edge case - # where the cluster is terminated by the user manually through the UI. + # If the node_statuses is empty, it should mean that all the nodes are + # terminated and we can set the cluster status to TERMINATED. This handles + # the edge case where the cluster is terminated by the user manually through + # the UI. to_terminate = not node_statuses - # A cluster is considered "abnormal", if not all nodes are TERMINATED or - # not all nodes are STOPPED. We check that with the following logic: + # A cluster is considered "abnormal", if some (but not all) nodes are + # TERMINATED, or not all nodes are STOPPED. We check that with the following + # logic: # * Not all nodes are terminated and there's at least one node # terminated; or # * Any of the non-TERMINATED nodes is in a non-STOPPED status. @@ -1811,6 +1840,8 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool: # cluster is probably down. # * The cluster is partially terminated or stopped should be considered # abnormal. + # * The cluster is partially or completely in the INIT state, which means + # that provisioning was interrupted. This is considered abnormal. # # An abnormal cluster will transition to INIT and have any autostop setting # reset (unless it's autostopping/autodowning). diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a256277085f..07bdaa5cfb9 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -98,6 +98,11 @@ # The maximum retry count for fetching IP address. 
_FETCH_IP_MAX_ATTEMPTS = 3 +# How many times to query the cloud provider to make sure instances are +# stopping/terminating, and how long to wait between each query. +_TEARDOWN_WAIT_MAX_ATTEMPTS = 10 +_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS = 1 + _TEARDOWN_FAILURE_MESSAGE = ( f'\n{colorama.Fore.RED}Failed to terminate ' '{cluster_name}. {extra_reason}' @@ -2357,15 +2362,17 @@ def get_command_runners(self, zip(ip_list, port_list), **ssh_credentials) return runners if self.cached_cluster_info is None: - # We have `or self.cached_external_ips is None` here, because + # We have `and self.cached_external_ips is None` here, because # when a cluster's cloud is just upgraded to the new provsioner, # although it has the cached_external_ips, the cached_cluster_info # can be None. We need to update it here, even when force_cached is # set to True. # TODO: We can remove `self.cached_external_ips is None` after # version 0.8.0. - assert not force_cached or self.cached_external_ips is not None, ( - force_cached, self.cached_external_ips) + if force_cached and self.cached_external_ips is None: + raise RuntimeError( + 'Tried to use cached cluster info, but it\'s missing for ' + f'cluster "{self.cluster_name}"') self._update_cluster_info() assert self.cached_cluster_info is not None, self runners = provision_lib.get_command_runners( @@ -2784,9 +2791,6 @@ def _provision( if e.no_failover: error_message = str(e) else: - # Clean up the cluster's entry in `sky status`. - global_user_state.remove_cluster(cluster_name, - terminate=True) usage_lib.messages.usage.update_final_cluster_status( None) error_message = ( @@ -3928,7 +3932,6 @@ def teardown_no_lock(self, limit=1000).get_result()['items'] vpc_id = None try: - # pylint: disable=line-too-long vpc_id = vpcs_filtered_by_tags_and_region[0]['crn'].rsplit( ':', 1)[-1] vpc_found = True @@ -3937,7 +3940,6 @@ def teardown_no_lock(self, returncode = -1 if vpc_found: - # pylint: disable=line-too-long E1136 # Delete VPC and it's associated resources vpc_provider = IBMVPCProvider( config_provider['resource_group_id'], region, @@ -4058,6 +4060,7 @@ def post_teardown_cleanup(self, * Removing the terminated cluster's scripts and ray yaml files. """ cluster_name_on_cloud = handle.cluster_name_on_cloud + cloud = handle.launched_resources.cloud if (terminate and handle.launched_resources.is_image_managed is True): # Delete the image when terminating a "cloned" cluster, i.e., @@ -4078,7 +4081,6 @@ def post_teardown_cleanup(self, 'remove it manually to avoid image leakage. Details: ' f'{common_utils.format_exception(e, use_bracket=True)}') if terminate: - cloud = handle.launched_resources.cloud config = common_utils.read_yaml(handle.cluster_yaml) try: cloud.check_features_are_supported( @@ -4105,6 +4107,44 @@ def post_teardown_cleanup(self, config = common_utils.read_yaml(handle.cluster_yaml) backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name) + # Confirm that instances have actually transitioned state before + # updating the state database. We do this immediately before removing + # the state from the database, so that we can guarantee that this is + # always called before the state is removed. We considered running this + # check as part of provisioner.teardown_cluster or + # provision.terminate_instances, but it would open the door code paths + # that successfully call this function but do not first call + # teardown_cluster or terminate_instances. 
See + # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032 + attempts = 0 + while True: + logger.debug(f'instance statuses attempt {attempts + 1}') + node_status_dict = provision_lib.query_instances( + repr(cloud), + cluster_name_on_cloud, + config['provider'], + non_terminated_only=False) + + unexpected_node_state: Optional[Tuple[str, str]] = None + for node_id, node_status in node_status_dict.items(): + logger.debug(f'{node_id} status: {node_status}') + # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish + # between "stopping/stopped" and "terminating/terminated", so we + # allow for either status instead of casing on `terminate`. + if node_status not in [None, status_lib.ClusterStatus.STOPPED]: + unexpected_node_state = (node_id, node_status) + + if unexpected_node_state is None: + break + + attempts += 1 + if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS: + time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS) + else: + (node_id, node_status) = unexpected_node_state + raise RuntimeError(f'Instance {node_id} in unexpected state ' + f'{node_status}.') + global_user_state.remove_cluster(handle.cluster_name, terminate=terminate) diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 700d31c597f..229d7361e22 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -101,8 +101,8 @@ def cluster_status_map( ) -> Dict['AzureInstanceStatus', Optional[status_lib.ClusterStatus]]: return { cls.PENDING: status_lib.ClusterStatus.INIT, - cls.STOPPING: status_lib.ClusterStatus.INIT, cls.RUNNING: status_lib.ClusterStatus.UP, + cls.STOPPING: status_lib.ClusterStatus.STOPPED, cls.STOPPED: status_lib.ClusterStatus.STOPPED, cls.DELETING: None, } diff --git a/sky/provision/gcp/instance.py b/sky/provision/gcp/instance.py index 9872ad73dc7..6c09dbc6816 100644 --- a/sky/provision/gcp/instance.py +++ b/sky/provision/gcp/instance.py @@ -52,6 +52,8 @@ def _filter_instances( # non_terminated_only=True? # Will there be callers who would want this to be False? # stop() and terminate() for example already implicitly assume non-terminated. +# Currently, even with non_terminated_only=False, we may not have a dict entry +# for terminated instances, if they have already been fully deleted. 
@common_utils.retry def query_instances( cluster_name_on_cloud: str, diff --git a/sky/provision/lambda_cloud/instance.py b/sky/provision/lambda_cloud/instance.py index d10c36496ab..d33c97df95c 100644 --- a/sky/provision/lambda_cloud/instance.py +++ b/sky/provision/lambda_cloud/instance.py @@ -233,7 +233,7 @@ def query_instances( 'booting': status_lib.ClusterStatus.INIT, 'active': status_lib.ClusterStatus.UP, 'unhealthy': status_lib.ClusterStatus.INIT, - 'terminating': status_lib.ClusterStatus.INIT, + 'terminating': None, } statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {} for instance_id, instance in instances.items(): diff --git a/sky/provision/paperspace/instance.py b/sky/provision/paperspace/instance.py index 5804362d102..6775304c8f7 100644 --- a/sky/provision/paperspace/instance.py +++ b/sky/provision/paperspace/instance.py @@ -286,12 +286,13 @@ def query_instances( assert provider_config is not None, (cluster_name_on_cloud, provider_config) instances = _filter_instances(cluster_name_on_cloud, None) + # https://docs.digitalocean.com/reference/paperspace/core/commands/machines/#show status_map = { 'starting': status_lib.ClusterStatus.INIT, 'restarting': status_lib.ClusterStatus.INIT, 'upgrading': status_lib.ClusterStatus.INIT, 'provisioning': status_lib.ClusterStatus.INIT, - 'stopping': status_lib.ClusterStatus.INIT, + 'stopping': status_lib.ClusterStatus.STOPPED, 'serviceready': status_lib.ClusterStatus.INIT, 'ready': status_lib.ClusterStatus.UP, 'off': status_lib.ClusterStatus.STOPPED, From c5ae5cae2d20d9b699a851ccdb0af4299e7ef435 Mon Sep 17 00:00:00 2001 From: zpoint Date: Mon, 9 Dec 2024 11:02:08 +0800 Subject: [PATCH 13/14] smoke tests support storage mount only (#4446) * smoke tests support storage mount only * fix verify command * rename to only_mount --- tests/test_smoke.py | 147 +++++++++--------- .../test_yamls/test_storage_mounting.yaml.j2 | 10 +- 2 files changed, 77 insertions(+), 80 deletions(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 16dde046281..a225c10566c 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -37,7 +37,7 @@ import tempfile import textwrap import time -from typing import Dict, List, NamedTuple, Optional, Tuple +from typing import Dict, List, NamedTuple, Optional, TextIO, Tuple import urllib.parse import uuid @@ -1239,34 +1239,68 @@ def test_using_file_mounts_with_env_vars(generic_cloud: str): # ---------- storage ---------- + + +def _storage_mounts_commands_generator(f: TextIO, cluster_name: str, + storage_name: str, ls_hello_command: str, + cloud: str, only_mount: bool): + template_str = pathlib.Path( + 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() + template = jinja2.Template(template_str) + content = template.render( + storage_name=storage_name, + cloud=cloud, + only_mount=only_mount, + ) + f.write(content) + f.flush() + file_path = f.name + test_commands = [ + *STORAGE_SETUP_COMMANDS, + f'sky launch -y -c {cluster_name} --cloud {cloud} {file_path}', + f'sky logs {cluster_name} 1 --status', # Ensure job succeeded. 
+ ls_hello_command, + f'sky stop -y {cluster_name}', + f'sky start -y {cluster_name}', + # Check if hello.txt from mounting bucket exists after restart in + # the mounted directory + f'sky exec {cluster_name} -- "set -ex; ls /mount_private_mount/hello.txt"', + ] + clean_command = f'sky down -y {cluster_name}; sky storage delete -y {storage_name}' + return test_commands, clean_command + + @pytest.mark.aws def test_aws_storage_mounts_with_stop(): name = _get_cluster_name() cloud = 'aws' storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) + ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'aws s3 ls {storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] + test_commands, clean_command = _storage_mounts_commands_generator( + f, name, storage_name, ls_hello_command, cloud, False) test = Test( 'aws_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', + clean_command, + timeout=20 * 60, # 20 mins + ) + run_one_test(test) + + +@pytest.mark.aws +def test_aws_storage_mounts_with_stop_only_mount(): + name = _get_cluster_name() + cloud = 'aws' + storage_name = f'sky-test-{int(time.time())}' + ls_hello_command = f'aws s3 ls {storage_name}/hello.txt' + with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: + test_commands, clean_command = _storage_mounts_commands_generator( + f, name, storage_name, ls_hello_command, cloud, True) + test = Test( + 'aws_storage_mounts_only_mount', + test_commands, + clean_command, timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -1277,29 +1311,14 @@ def test_gcp_storage_mounts_with_stop(): name = _get_cluster_name() cloud = 'gcp' storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) + ls_hello_command = f'gsutil ls gs://{storage_name}/hello.txt' with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'gsutil ls gs://{storage_name}/hello.txt', - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] + test_commands, clean_command = _storage_mounts_commands_generator( + f, name, storage_name, ls_hello_command, cloud, False) test = Test( 'gcp_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', + clean_command, timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -1315,31 +1334,19 @@ def test_azure_storage_mounts_with_stop(): get_default_storage_account_name(default_region)) storage_account_key = data_utils.get_az_storage_account_key( storage_account_name) - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name, cloud=cloud) + # if the file does not exist, az storage blob list returns '[]' + ls_hello_command = (f'output=$(az storage blob list -c {storage_name} ' + f'--account-name {storage_account_name} ' + f'--account-key {storage_account_key} ' + f'--prefix hello.txt) ' + f'[ "$output" = "[]" ] && exit 1 || exit 0') with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud {cloud} {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. - f'output=$(az storage blob list -c {storage_name} --account-name {storage_account_name} --account-key {storage_account_key} --prefix hello.txt)' - # if the file does not exist, az storage blob list returns '[]' - f'[ "$output" = "[]" ] && exit 1;' - f'sky stop -y {name}', - f'sky start -y {name}', - # Check if hello.txt from mounting bucket exists after restart in - # the mounted directory - f'sky exec {name} -- "set -ex; ls /mount_private_mount/hello.txt"' - ] + test_commands, clean_command = _storage_mounts_commands_generator( + f, name, storage_name, ls_hello_command, cloud, False) test = Test( 'azure_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', + clean_command, timeout=20 * 60, # 20 mins ) run_one_test(test) @@ -1352,25 +1359,15 @@ def test_kubernetes_storage_mounts(): # built for x86_64 only. name = _get_cluster_name() storage_name = f'sky-test-{int(time.time())}' - template_str = pathlib.Path( - 'tests/test_yamls/test_storage_mounting.yaml.j2').read_text() - template = jinja2.Template(template_str) - content = template.render(storage_name=storage_name) + ls_hello_command = (f'aws s3 ls {storage_name}/hello.txt || ' + f'gsutil ls gs://{storage_name}/hello.txt') with tempfile.NamedTemporaryFile(suffix='.yaml', mode='w') as f: - f.write(content) - f.flush() - file_path = f.name - test_commands = [ - *STORAGE_SETUP_COMMANDS, - f'sky launch -y -c {name} --cloud kubernetes {file_path}', - f'sky logs {name} 1 --status', # Ensure job succeeded. 
- f'aws s3 ls {storage_name}/hello.txt || ' - f'gsutil ls gs://{storage_name}/hello.txt', - ] + test_commands, clean_command = _storage_mounts_commands_generator( + f, name, storage_name, ls_hello_command, 'kubernetes', False) test = Test( 'kubernetes_storage_mounts', test_commands, - f'sky down -y {name}; sky storage delete -y {storage_name}', + clean_command, timeout=20 * 60, # 20 mins ) run_one_test(test) diff --git a/tests/test_yamls/test_storage_mounting.yaml.j2 b/tests/test_yamls/test_storage_mounting.yaml.j2 index 4241c63409e..651fb190012 100644 --- a/tests/test_yamls/test_storage_mounting.yaml.j2 +++ b/tests/test_yamls/test_storage_mounting.yaml.j2 @@ -20,13 +20,13 @@ file_mounts: /mount_private_copy: name: {{storage_name}} source: ~/tmp-workdir - mode: COPY + mode: {% if only_mount | default(false) %}MOUNT{% else %}COPY{% endif %} # Mounting private buckets in COPY mode with a list of files as source /mount_private_copy_lof: name: {{storage_name}} source: ['~/tmp-workdir/tmp file', '~/tmp-workdir/tmp file2'] - mode: COPY + mode: {% if only_mount | default(false) %}MOUNT{% else %}COPY{% endif %} {% if include_private_mount | default(True) %} # Mounting private buckets in MOUNT mode @@ -38,7 +38,7 @@ file_mounts: run: | set -ex - + # Check public bucket contents ls -ltr /mount_public_s3/corpora ls -ltr /mount_public_gcp/tiles @@ -55,12 +55,12 @@ run: | ls -ltr /mount_private_mount/foo ls -ltr /mount_private_mount/tmp\ file {% endif %} - + # Symlinks are not copied to buckets ! ls /mount_private_copy/circle-link {% if include_private_mount | default(True) %} ! ls /mount_private_mount/circle-link - + # Write to private bucket in MOUNT mode should pass echo "hello" > /mount_private_mount/hello.txt {% endif %} From 2448bf6d9cc0769985cc3e209137f3df1ef10260 Mon Sep 17 00:00:00 2001 From: ZePing Guo Date: Mon, 9 Dec 2024 12:43:16 +0800 Subject: [PATCH 14/14] hot fix for smoke tests --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index a225c10566c..58a33334f79 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1144,7 +1144,7 @@ def test_gcp_stale_job_manual_restart(): # Ensure the skylet updated the stale job status. _get_cmd_wait_until_job_status_contains_without_matching_job( cluster_name=name, - job_status=[JobStatus.FAILED.value], + job_status=[JobStatus.FAILED], timeout=events.JobSchedulerEvent.EVENT_INTERVAL_SECONDS) ], f'sky down -y {name}',
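The smoke tests above lean on a family of `_get_cmd_wait_until_*` helpers and a `_BUMP_UP_SECONDS` padding constant whose definitions are not included in these hunks. As a rough, hypothetical sketch of their shape (all names and details here are assumptions, not the real implementation), each helper returns a bash polling loop with a timeout instead of a fixed `sleep`:

    # Hypothetical sketch; the real helpers live elsewhere in tests/test_smoke.py.
    import enum
    from typing import List


    class ManagedJobStatus(enum.Enum):  # stand-in for the real enum
        RUNNING = 'RUNNING'
        SUCCEEDED = 'SUCCEEDED'
        CANCELLED = 'CANCELLED'


    def _get_cmd_wait_until_managed_job_status_contains_matching_job_name(
            job_name: str, job_status: List[ManagedJobStatus], timeout: int) -> str:
        statuses = '\\|'.join(s.value for s in job_status)
        return ('start=$SECONDS; '
                f'until sky jobs queue | grep {job_name} | head -n1 | grep "{statuses}"; '
                f'do if [ $((SECONDS - start)) -gt {timeout} ]; then exit 1; fi; '
                'sleep 10; done')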