Use separate test with chunked prefill; skip empty chunks
Showing 4 changed files with 138 additions and 60 deletions.
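Only the new test file is shown below; the server-side half of the change, which skips the empty chunks produced by prefill-only engine steps, lives in the other changed files and is not visible here. As a minimal sketch of the idea, assuming a hypothetical per-step delta generator (`iterate_deltas`, `send_chunk`, and `delta.text` are illustrative names, not vLLM APIs):

    async def stream_response(iterate_deltas, send_chunk):
        # Hypothetical sketch, not the code from this commit: with chunked
        # prefill, an engine step may only consume a prompt chunk and produce
        # no new token, leaving an empty text delta. Skip those steps rather
        # than emitting an empty SSE chunk to the client.
        async for delta in iterate_deltas():
            if not delta.text:  # empty delta from a prefill-only step
                continue
            await send_chunk(delta.text)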
@@ -0,0 +1,126 @@
import openai  # use the official client for correctness checks
import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
def server():
    args = [
        # use half precision for speed and memory savings in the CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # scheduler and chunked prefill settings
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
        # large prompts create a lot of request-log output
        "--disable-log-requests",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    # Test streaming with a prompt long enough to be split by chunked prefill
    prompt = "What is the capital of France?" * 400

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    tokens_received = 0
    finished = False
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if not finished:
            tokens_received += 1
            assert chunk.choices[0].text

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received


@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    # Test streaming with a prompt long enough to be split by chunked prefill
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role": "user",
        "content": "What is the capital of France?" * 400
    }]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        assert chunk.usage.prompt_tokens >= 0
        assert chunk.usage.completion_tokens >= 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)

        if not finished:
            if chunk.choices[0].delta.content == "":
                # no tokens have been generated yet
                assert chunk.usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert chunk.usage.completion_tokens == tokens_received

    assert empty_chunks_received <= 1
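Why the test tolerates at most one empty chunk: with --max-num-batched-tokens 1000, the long prompt is prefilled across several scheduler steps before any completion token exists, and an OpenAI-style chat stream typically opens with a role-bearing delta whose content is empty. After this change, prefill-only steps should no longer surface as chunks, leaving at most that one empty delta. Back-of-the-envelope arithmetic (the tokens-per-repetition figure is an assumption, not measured):

    # rough sketch: estimate how many prefill steps the long prompt needs
    tokens_per_repeat = 8                     # assumed size of the repeated question
    prompt_tokens = 400 * tokens_per_repeat   # ~3200 prompt tokens
    chunk_size = 1000                         # --max-num-batched-tokens
    prefill_steps = -(-prompt_tokens // chunk_size)  # ceil division -> ~4 steps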