skypilot-org · cblmemo · Sep 5, 2024 · Sep 5, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/sky/serve/constants.py b/sky/serve/constants.py
@@ -19,7 +19,7 @@
 # time the load balancer syncs with controller, it will update all available
 # replica ips for each service, also send the number of requests in last query
 # interval.
-LB_CONTROLLER_SYNC_INTERVAL_SECONDS = 20
+LB_CONTROLLER_SYNC_INTERVAL_SECONDS = 10
 
 # The maximum retry times for load balancer for each request. After changing to
 # proxy implementation, we do retry for failed requests.

diff --git a/sky/serve/load_balancer.py b/sky/serve/load_balancer.py
@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import threading
-from typing import Dict, Union
+from typing import Dict, List, Union
 
 import aiohttp
 import fastapi
@@ -160,11 +160,28 @@ async def _proxy_with_retries(
         # SkyServe supports serving on Spot Instances. To avoid preemptions
         # during request handling, we add a retry here.
         retry_cnt = 0
+        # Here we try to not retry those failed replicas for the case when the
+        # replica is in a NOT_READY state but does not sync-ed to the load
+        # balancer yet. However, we still maintain a per-request failed replica
+        # list instead of the global one to avoid the case for transient
+        # networking issues, and letting new requests to retry them. Since our
+        # LB policy is a global one, when the request rate is high, it is likely
+        # that multiple retries on a single request will use the same replica.
+        # Here we use the failed replica list to keep track of the failures
+        # happened on every request and try to avoid them in the next retry.
-        # Here we try to not retry those failed replicas for the case when the
-        # replica is in a NOT_READY state but does not sync-ed to the load
-        # balancer yet. However, we still maintain a per-request failed replica
-        # list instead of the global one to avoid the case for transient
-        # networking issues, and letting new requests to retry them. Since our
-        # LB policy is a global one, when the request rate is high, it is likely
-        # that multiple retries on a single request will use the same replica.
-        # Here we use the failed replica list to keep track of the failures
-        # happened on every request and try to avoid them in the next retry.
+        # We keep track of the failed replicas for the current request, because
+        # we have a global round-robin policy, and if there is a large load,
+        # all retries for the same request can go to the same replica by chance.
+        # If that replica is in `NOT_READY` state but the new state has not
+        # been synced from the controller, the current request will fail.
+        #
+        # We maintain a per-request failed replica list instead of a global
+        # one, so that a replica that failed one request because of a
+        # transient network issue can still be tried by other requests.
-        # Here we try to not retry those failed replicas for the case when the
-        # replica is in a NOT_READY state but does not sync-ed to the load
-        # balancer yet. However, we still maintain a per-request failed replica
-        # list instead of the global one to avoid the case for transient
-        # networking issues, and letting new requests to retry them. Since our
-        # LB policy is a global one, when the request rate is high, it is likely
-        # that multiple retries on a single request will use the same replica.
-        # Here we use the failed replica list to keep track of the failures
-        # happened on every request and try to avoid them in the next retry.
+        # We keep track of the failed replicas for the current request, because
+        # we have a global round-robin policy, and if there is a large load,
+        # all retries for the same request can go to the same replica by chance.
+        # If that replica is in `NOT_READY` state but the new state has not
+        # been synced from the controller, the current request will fail.
+        #
+        # We maintain a per-request failed replica list instead of a global
+        # one, so that a replica that failed one request because of a
+        # transient network issue can still be tried by other requests.
+        failed_replica_urls: List[str] = []
         while True:
             retry_cnt += 1
             with self._client_pool_lock:
                 ready_replica_url = self._load_balancing_policy.select_replica(
-                    request)
+                    request, failed_replica_urls)
+                # If all replicas have failed, retry them again, as some of
+                # the failures might be caused by transient networking issues.
+                if ready_replica_url is None and failed_replica_urls:
+                    failed_replica_urls = []
+                    ready_replica_url = (
+                        self._load_balancing_policy.select_replica(
+                            request, failed_replica_urls))
             if ready_replica_url is None:
                 response_or_exception = fastapi.HTTPException(
                     # 503 means that the server is currently
@@ -184,6 +201,8 @@ async def _proxy_with_retries(
                 # 499 means a client terminates the connection
                 # before the server is able to respond.
                 return fastapi.responses.Response(status_code=499)
+            assert ready_replica_url is not None
+            failed_replica_urls.append(ready_replica_url)
             # TODO(tian): Fail fast for errors like 404 not found.
             if retry_cnt == constants.LB_MAX_RETRY:
                 if isinstance(response_or_exception, fastapi.HTTPException):

diff --git a/sky/serve/load_balancing_policies.py b/sky/serve/load_balancing_policies.py
@@ -28,8 +28,9 @@ def __init__(self) -> None:
     def set_ready_replicas(self, ready_replicas: List[str]) -> None:
         raise NotImplementedError
 
-    def select_replica(self, request: 'fastapi.Request') -> Optional[str]:
-        replica = self._select_replica(request)
+    def select_replica(self, request: 'fastapi.Request',
+                       disabled_replicas: List[str]) -> Optional[str]:
+        replica = self._select_replica(request, disabled_replicas)
         if replica is not None:
             logger.info(f'Selected replica {replica} '
                         f'for request {_request_repr(request)}')
@@ -40,7 +41,8 @@ def select_replica(self, request: 'fastapi.Request') -> Optional[str]:
 
     # TODO(tian): We should have an abstract class for Request to
     # compatible with all frameworks.
-    def _select_replica(self, request: 'fastapi.Request') -> Optional[str]:
+    def _select_replica(self, request: 'fastapi.Request',
+                        disabled_replicas: List[str]) -> Optional[str]:
         raise NotImplementedError
 
 
@@ -61,10 +63,15 @@ def set_ready_replicas(self, ready_replicas: List[str]) -> None:
         self.ready_replicas = ready_replicas
         self.index = 0
 
-    def _select_replica(self, request: 'fastapi.Request') -> Optional[str]:
+    def _select_replica(self, request: 'fastapi.Request',
+                        disabled_replicas: List[str]) -> Optional[str]:
         del request  # Unused.
-        if not self.ready_replicas:
+        # Avoid an infinite loop below when every ready replica is disabled.
+        if not self.ready_replicas or all(
+                url in disabled_replicas for url in self.ready_replicas):
             return None
-        ready_replica_url = self.ready_replicas[self.index]
-        self.index = (self.index + 1) % len(self.ready_replicas)
-        return ready_replica_url
+        while True:
+            ready_replica_url = self.ready_replicas[self.index]
+            self.index = (self.index + 1) % len(self.ready_replicas)
+            if ready_replica_url not in disabled_replicas:
+                return ready_replica_url