[Serve] Not using previously failed replica when retrying a failed request #3916

Open · wants to merge 9 commits into base: master
2 changes: 1 addition & 1 deletion sky/serve/constants.py
@@ -19,7 +19,7 @@
# time the load balancer syncs with controller, it will update all available
# replica ips for each service, also send the number of requests in last query
# interval.
-LB_CONTROLLER_SYNC_INTERVAL_SECONDS = 20
+LB_CONTROLLER_SYNC_INTERVAL_SECONDS = 10

# The maximum retry times for load balancer for each request. After changing to
# proxy implementation, we do retry for failed requests.
21 changes: 19 additions & 2 deletions sky/serve/load_balancer.py
@@ -2,7 +2,7 @@
import asyncio
import logging
import threading
-from typing import Dict, Union
+from typing import Dict, Set, Union

import aiohttp
import fastapi
@@ -160,11 +160,26 @@ async def _proxy_with_retries(
# SkyServe supports serving on Spot Instances. To avoid preemptions
# during request handling, we add a retry here.
retry_cnt = 0
# We keep track of the replicas that have already failed for the current
# request: with a global round-robin policy and a large load, all retries
# for the same request can land on the same replica by chance. If that
# replica is in the `NOT_READY` state but the new state has not yet been
# synced from the controller, the current request will keep failing.
#
# We maintain a per-request failed replica set instead of a global one so
# that other requests can still try a failed replica once, in case the
# failure was caused by a transient network issue.
failed_replica_urls: Set[str] = set()
while True:
retry_cnt += 1
with self._client_pool_lock:
# If all replicas have failed, clear the record and retry them,
# as some of the failures might be due to transient networking issues.
if (len(failed_replica_urls) ==
Collaborator: I am wondering how effective the new failed_replica_urls is compared to the original globally incremented index. Can we simulate the case where a replica goes down under different loads and check the success rate / latency?

self._load_balancing_policy.num_ready_replicas()):
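
A rough, standalone sketch of the kind of simulation suggested above (every name and number below is made up for illustration; this is not SkyServe code):

import random

NUM_REPLICAS = 4      # hypothetical replica pool size
FAILED_REPLICA = 2    # the replica that went down but is not yet synced to the LB
MAX_RETRY = 3         # retries per request, mirroring a small LB_MAX_RETRY
NUM_REQUESTS = 10_000
LOAD = 50             # selections made by other in-flight requests between retries

def run(skip_failed: bool) -> float:
    """Success rate when retries share a global round-robin index."""
    index = 0
    successes = 0
    for _ in range(NUM_REQUESTS):
        failed = set()
        for _ in range(MAX_RETRY):
            # Other concurrent requests also advance the shared index.
            index += random.randint(0, LOAD)
            replica = index % NUM_REPLICAS
            index += 1
            # With the per-request failed set, skip known-bad replicas.
            while skip_failed and replica in failed:
                replica = index % NUM_REPLICAS
                index += 1
            if replica != FAILED_REPLICA:
                successes += 1
                break
            failed.add(replica)
    return successes / NUM_REQUESTS

print('global index only     :', run(skip_failed=False))
print('per-request failed set:', run(skip_failed=True))
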
Comment on lines +178 to +179
Collaborator: Just curious, is it possible that num_ready_replicas is updated while we are retrying in this loop, for example if the replicas are scaled down? There could be a concurrency issue here.

Should we instead check if not (set(self._load_balancing_policy.ready_replicas) - failed_replica_urls):
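
A toy illustration of the race this comment describes (the URLs are made up, and len(ready_replicas) stands in for num_ready_replicas()):

# Hypothetical state: the request already failed on three replicas, then one
# of them was scaled down before the next retry.
ready_replicas = ['http://r1', 'http://r2']                    # r3 was just removed
failed_replica_urls = {'http://r1', 'http://r2', 'http://r3'}

# The equality on counts never fires here (3 != 2), so the set is never cleared,
# even though every remaining ready replica has already failed this request.
print(len(failed_replica_urls) == len(ready_replicas))         # False

# The set-difference form still detects that nothing selectable is left.
print(not (set(ready_replicas) - failed_replica_urls))         # True
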

failed_replica_urls.clear()
ready_replica_url = self._load_balancing_policy.select_replica(
-request)
+request, failed_replica_urls)
Collaborator: It seems that in this case we will never retry on a transient network issue. How about allowing retries on previously failed URLs if retry_cnt has not reached the maximum number of retries?

If that causes too much retry overhead, we could probably reduce the interval between retries instead.

Collaborator (Author): This only affects the retry logic within one request; the replica will still be selected by subsequent requests. This PR mainly targets the case where the replica is in the NOT_READY state on the controller but that state has not been synced to the load balancer yet; for a transient network issue, the replica will go back to the READY state soon. Besides, if all possible replicas are in failed_replica_urls, we will still choose from them.

Collaborator: Oh, I am talking about a single request. Should that specific request fail outright if replicas are available but a transient network issue occurs while the load balancer is forwarding the request to the replica? I think the original purpose of this retry is to retry on network issues. I am proposing the following to allow retries on the same replica if we have not reached the max retry count yet:

if ready_replica_url is None and failed_replica_urls:
    failed_replica_urls = []

Collaborator (Author):
Fixed! PTAL again
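
For reference, a minimal standalone sketch of the retry flow this thread converged on, with hypothetical names and a random pick standing in for the real round-robin policy: skip replicas that already failed this request, and clear the per-request record once every ready replica has failed.

import random
from typing import List, Optional, Set

def proxy(url: str) -> bool:
    """Stand-in for forwarding the request; 'bad' replicas always fail here."""
    return 'bad' not in url

def handle_request(ready: List[str], max_retry: int = 3) -> Optional[str]:
    """Return the replica URL that served the request, or None if retries ran out."""
    failed: Set[str] = set()
    for _ in range(max_retry):
        # If every ready replica already failed this request, clear the
        # per-request record so transient failures get another chance.
        if not set(ready) - failed:
            failed.clear()
        candidates = [url for url in ready if url not in failed]
        if not candidates:
            return None  # no ready replicas at all
        url = random.choice(candidates)
        if proxy(url):
            return url
        failed.add(url)
    return None

print(handle_request(['http://bad-replica', 'http://good-replica']))
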

if ready_replica_url is None:
response_or_exception = fastapi.HTTPException(
# 503 means that the server is currently
@@ -184,6 +199,8 @@ async def _proxy_with_retries(
# 499 means a client terminates the connection
# before the server is able to respond.
return fastapi.responses.Response(status_code=499)
assert ready_replica_url is not None
failed_replica_urls.add(ready_replica_url)
# TODO(tian): Fail fast for errors like 404 not found.
if retry_cnt == constants.LB_MAX_RETRY:
if isinstance(response_or_exception, fastapi.HTTPException):
24 changes: 16 additions & 8 deletions sky/serve/load_balancing_policies.py
@@ -1,7 +1,7 @@
"""LoadBalancingPolicy: Policy to select endpoint."""
import random
import typing
-from typing import List, Optional
+from typing import List, Optional, Set

from sky import sky_logging

@@ -28,8 +28,9 @@ def __init__(self) -> None:
def set_ready_replicas(self, ready_replicas: List[str]) -> None:
raise NotImplementedError

-def select_replica(self, request: 'fastapi.Request') -> Optional[str]:
-    replica = self._select_replica(request)
+def select_replica(self, request: 'fastapi.Request',
+                   disabled_replicas: Set[str]) -> Optional[str]:
+    replica = self._select_replica(request, disabled_replicas)
if replica is not None:
logger.info(f'Selected replica {replica} '
f'for request {_request_repr(request)}')
@@ -38,9 +39,13 @@ def select_replica(self, request: 'fastapi.Request') -> Optional[str]:
f'{_request_repr(request)}')
return replica

def num_ready_replicas(self) -> int:
return len(self.ready_replicas)

# TODO(tian): We should have an abstract class for Request to be
# compatible with all frameworks.
-def _select_replica(self, request: 'fastapi.Request') -> Optional[str]:
+def _select_replica(self, request: 'fastapi.Request',
+                    disabled_replicas: Set[str]) -> Optional[str]:
raise NotImplementedError


@@ -61,10 +66,13 @@ def set_ready_replicas(self, ready_replicas: List[str]) -> None:
self.ready_replicas = ready_replicas
self.index = 0

-def _select_replica(self, request: 'fastapi.Request') -> Optional[str]:
+def _select_replica(self, request: 'fastapi.Request',
+                    disabled_replicas: Set[str]) -> Optional[str]:
del request # Unused.
if not self.ready_replicas:
return None
-ready_replica_url = self.ready_replicas[self.index]
-self.index = (self.index + 1) % len(self.ready_replicas)
-return ready_replica_url
+while True:
+    ready_replica_url = self.ready_replicas[self.index]
+    self.index = (self.index + 1) % len(self.ready_replicas)
+    if ready_replica_url not in disabled_replicas:
+        return ready_replica_url
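
A standalone toy mirror of the updated policy interface above (illustration only; the class name is made up, and it drops the request argument and the logging that the real class in this file has):

from typing import List, Optional, Set

class ToyRoundRobin:
    """Standalone mirror of the round-robin selection in this diff (not SkyServe code)."""

    def __init__(self) -> None:
        self.ready_replicas: List[str] = []
        self.index = 0

    def set_ready_replicas(self, ready_replicas: List[str]) -> None:
        self.ready_replicas = ready_replicas
        self.index = 0

    def num_ready_replicas(self) -> int:
        return len(self.ready_replicas)

    def select_replica(self, disabled_replicas: Set[str]) -> Optional[str]:
        if not self.ready_replicas:
            return None
        # The caller is expected to clear `disabled_replicas` when it covers
        # every ready replica; otherwise this loop would never terminate.
        while True:
            url = self.ready_replicas[self.index]
            self.index = (self.index + 1) % len(self.ready_replicas)
            if url not in disabled_replicas:
                return url

policy = ToyRoundRobin()
policy.set_ready_replicas(['http://r1', 'http://r2', 'http://r3'])
failed = {'http://r2'}
print([policy.select_replica(failed) for _ in range(4)])
# ['http://r1', 'http://r3', 'http://r1', 'http://r3']: r2 is skipped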