Skip to content

Commit

Permalink
example
Browse files (browse the repository at this point in the history)
  • Loading branch information
robertgshaw2-neuralmagic committed Aug 10, 2024
1 parent 3015a4b commit 8957faf
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 7 deletions.
2 changes: 1 addition & 1 deletion examples/openai_completion_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8001/v1"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
Expand Down
37 changes: 31 additions & 6 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import asyncio
import time
import zmq
import zmq.asyncio
import pickle
from functools import partial
from typing import (AsyncGenerator, Callable, Dict, Iterable, List, Mapping,
Optional, Set, Tuple, Type, Union)
Expand Down Expand Up @@ -252,9 +255,24 @@ def has_new_requests(self):
class _AsyncLLMEngine(LLMEngine):
"""Extension of LLMEngine to add async methods."""

async def do_log_stats_async(self, scheduler_outputs, model_output):
self.do_log_stats(scheduler_outputs, model_output)
def __init__(self, *args, **kwargs):
    """Set up the engine and start a background stats-logging loop.

    An inproc ZeroMQ PUSH/PULL pair decouples the hot ``step_async``
    path from stat logging: the engine pushes pickled payloads and a
    long-lived asyncio task pulls them and calls ``do_log_stats``.
    """
    super().__init__(*args, **kwargs)

    # One asyncio-aware zmq context owns both ends of the pipe.
    self.logger_ctx = zmq.asyncio.Context()

    # In-process transport; both endpoints live in this object.
    endpoint = "inproc://doesitwork"

    # Producer end: step_async() pushes stats payloads here.  Bound
    # first so the inproc endpoint exists before the consumer connects.
    self.to_logger = self.logger_ctx.socket(zmq.constants.PUSH)
    self.to_logger.bind(endpoint)

    # Consumer end: drained by run_logging_loop().
    self.from_engine = self.logger_ctx.socket(zmq.constants.PULL)
    self.from_engine.connect(endpoint)

    # NOTE(review): asyncio.create_task requires a running event loop,
    # so this constructor must run inside async context — confirm
    # callers.  The task/sockets are never explicitly closed; presumably
    # they live for the process lifetime — verify.
    self.logging_task = asyncio.create_task(self.run_logging_loop())


async def run_logging_loop(self):
    """Background consumer: forever pull a stats payload dict off the
    engine's PULL socket and forward it to ``do_log_stats``.

    Each payload is a mapping of keyword arguments (as sent by the
    producer via ``send_pyobj``), expanded directly into the call.
    """
    while True:
        payload = await self.from_engine.recv_pyobj()
        self.do_log_stats(**payload)

async def step_async(
self, virtual_engine: int
Expand Down Expand Up @@ -294,15 +312,22 @@ async def step_async(
scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)

# Log stats.
log_task = asyncio.create_task(self.do_log_stats_async(
scheduler_outputs, output))
_running_tasks.add(log_task)
log_task.add_done_callback(_running_tasks.discard)
# log_task = asyncio.create_task(self.do_log_stats_async(
# scheduler_outputs, output))
# _running_tasks.add(log_task)
# log_task.add_done_callback(_running_tasks.discard)
# self.do_log_stats(scheduler_outputs, output)
await self.to_logger.send_pyobj(
{
"scheduler_outputs": scheduler_outputs,
"model_output": output
}
)

# Tracing
self.do_tracing(scheduler_outputs)


return request_outputs

async def stop_remote_worker_execution_loop_async(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def __init__(
),
))


def _initialize_kv_caches(self) -> None:
"""Initialize the KV cache in the worker(s).
Expand Down

0 comments on commit 8957faf

Please sign in to comment.