v1: use multiprocessing by default
Previously, this code forced the use of the `spawn` multiprocessing
method. Since we know this causes problems in some configurations,
multiprocessing was off by default.

This change turns it on by default and makes use of existing code that
tries to choose the best multiprocessing method based on what we can
detect.

- use `fork` by default
- use `spawn` if CUDA has already been initialized, but give a warning

This same logic is already in use for spawning multiple workers for v1
tensor parallelism support.

The design doc `docs/design/multiprocessing.md` covers this topic in
more detail.
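
For illustration, a minimal sketch of that selection heuristic. The helper
name and warning text below are illustrative only, and the "fork" fallback
default is assumed; the real logic lives in
`vllm/executor/multiproc_worker_utils.py` and appears in the diff further down.

import multiprocessing
import os
import warnings

import torch


def pick_start_method() -> str:
    # Hypothetical helper mirroring the heuristic described above: prefer
    # "fork", but fall back to "spawn" once CUDA has been initialized in
    # the parent process, since forking after CUDA init is unsafe.
    method = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    if torch.cuda.is_initialized() and method != "spawn":
        warnings.warn("CUDA already initialized; falling back to the "
                      "'spawn' multiprocessing start method.")
        method = "spawn"
    return method


ctx = multiprocessing.get_context(pick_start_method())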

Signed-off-by: Russell Bryant <[email protected]>
russellb committed Dec 10, 2024
1 parent ec4477c commit be8449c
Showing 3 changed files with 15 additions and 14 deletions.
4 changes: 2 additions & 2 deletions vllm/envs.py
@@ -68,7 +68,7 @@
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: List[str] = []
     VLLM_USE_V1: bool = False
-    VLLM_ENABLE_V1_MULTIPROCESSING: bool = False
+    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
     VLLM_LOG_BATCHSIZE_INTERVAL: float = -1

@@ -452,7 +452,7 @@ def get_default_config_root():

     # If set, enable multiprocessing in LLM for the V1 code path.
     "VLLM_ENABLE_V1_MULTIPROCESSING":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))),
+    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
     "VLLM_LOG_BATCHSIZE_INTERVAL":
     lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 }
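
As a quick check on how the flag above is parsed (standard os.getenv
semantics, nothing vLLM-specific assumed), an unset variable now defaults to
"1", and V1 multiprocessing can still be disabled by exporting
VLLM_ENABLE_V1_MULTIPROCESSING=0 before launching:

import os

# Mirrors the lambda in envs.py: unset or "1" -> True, "0" -> False.
enabled = bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")))
print("V1 multiprocessing enabled:", enabled)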
17 changes: 11 additions & 6 deletions vllm/executor/multiproc_worker_utils.py
@@ -274,7 +274,17 @@ def write_with_prefix(s: str):
     file.write = write_with_prefix  # type: ignore[method-assign]


+def _check_multiproc_method():
+    if (cuda_is_initialized()
+            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+        logger.warning("CUDA was previously initialized. We must use "
+                       "the `spawn` multiprocessing start method. Setting "
+                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
 def get_mp_context():
+    _check_multiproc_method()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)


@@ -284,12 +294,7 @@ def set_multiprocessing_worker_envs(parallel_config):
     in a multiprocessing environment. This should be called by the parent
     process before worker processes are created"""

-    if (cuda_is_initialized()
-            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
-        logger.warning("CUDA was previously initialized. We must use "
-                       "the `spawn` multiprocessing start method. Setting "
-                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+    _check_multiproc_method()

     # Configure thread parallelism if OMP_NUM_THREADS isn't set
     #
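
A hedged usage sketch of get_mp_context() from the caller's side (worker_main
and the two-process count are placeholders, not vLLM APIs): the returned
context behaves like the multiprocessing module itself, so Process/Queue code
works the same whichever start method was selected.

from vllm.executor.multiproc_worker_utils import get_mp_context


def worker_main(rank: int) -> None:
    # Placeholder worker entry point for the sketch.
    print(f"worker {rank} started")


if __name__ == "__main__":
    ctx = get_mp_context()  # "fork" normally, "spawn" if CUDA is already up
    procs = [ctx.Process(target=worker_main, args=(i,)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()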
8 changes: 2 additions & 6 deletions vllm/v1/engine/core.py
@@ -1,4 +1,3 @@
-import multiprocessing
 import pickle
 import queue
 import signal

@@ -13,6 +12,7 @@
 from msgspec import msgpack

 from vllm.config import CacheConfig, VllmConfig
+from vllm.executor.multiproc_worker_utils import get_mp_context
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.core.scheduler import Scheduler

@@ -197,11 +197,7 @@ def make_engine_core_process(
         ready_path: str,
         should_shutdown: Synchronized,
     ) -> BaseProcess:
-        # The current process might have CUDA context,
-        # so we need to spawn a new process.
-        # NOTE(rob): this is a problem for using EngineCoreProc w/
-        # LLM, since we need a if __name__ == "__main__" guard.
-        context = multiprocessing.get_context("spawn")
+        context = get_mp_context()

        process_kwargs = {
            "input_path": input_path,
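
One practical consequence of this switch, noted here as a usage example
(standard Python multiprocessing behavior, not something this diff adds):
with "spawn" the child re-imports the entry module, so process creation must
sit behind an if __name__ == "__main__" guard, which is the constraint the
removed NOTE referred to; "fork" does not re-import the module, so plain
scripts keep working without it. A minimal illustration:

import multiprocessing


def child() -> None:
    print("child running")


if __name__ == "__main__":
    # Required when the start method is "spawn" (the child re-imports this
    # module); harmless and still good practice when it is "fork".
    ctx = multiprocessing.get_context("spawn")
    p = ctx.Process(target=child)
    p.start()
    p.join()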
