From 164ca81b263a288397581bf837981689e9b57fc1 Mon Sep 17 00:00:00 2001
From: Piotr Marcinkiewicz
Date: Thu, 14 Nov 2024 16:30:59 +0100
Subject: [PATCH] Remove use_beam_search parameter from vLLM example

---
 examples/vllm/server.py                         |  1 -
 .../L1_vllm_integration/benchmark_serving.py    | 11 ++---------
 tests/functional/L1_vllm_integration/server.py  |  1 -
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/examples/vllm/server.py b/examples/vllm/server.py
index 62568b23..cdee9adc 100755
--- a/examples/vllm/server.py
+++ b/examples/vllm/server.py
@@ -97,7 +97,6 @@ async def generate_fn(requests: List[Request]) -> AsyncGenerator[List[Dict[str,
         Tensor(name="prompt", dtype=bytes, shape=(1,)),
         Tensor(name="n", dtype=np.int32, shape=(1,), optional=True),
         Tensor(name="best_of", dtype=np.int32, shape=(1,), optional=True),
-        Tensor(name="use_beam_search", dtype=np.bool_, shape=(1,), optional=True),
         Tensor(name="temperature", dtype=np.float32, shape=(1,), optional=True),
         Tensor(name="top_p", dtype=np.float32, shape=(1,), optional=True),
         Tensor(name="max_tokens", dtype=np.int32, shape=(1,), optional=True),
diff --git a/tests/functional/L1_vllm_integration/benchmark_serving.py b/tests/functional/L1_vllm_integration/benchmark_serving.py
index c93613db..410016f8 100755
--- a/tests/functional/L1_vllm_integration/benchmark_serving.py
+++ b/tests/functional/L1_vllm_integration/benchmark_serving.py
@@ -102,7 +102,6 @@ async def send_request(
     prompt_len: int,
     output_len: int,
     best_of: int,
-    use_beam_search: bool,
 ) -> None:
     request_start_time = time.perf_counter()

@@ -112,7 +111,6 @@ async def send_request(
             "prompt": prompt,
             "n": 1,
             "best_of": best_of,
-            "use_beam_search": use_beam_search,
             "temperature": 0.0,  # force greedy decoding for same results
             "top_p": 1.0,
             "max_tokens": output_len,
@@ -120,7 +118,6 @@ async def send_request(
             "stream": False,
         }
     elif backend == "tgi":
-        assert not use_beam_search
         params = {
             "best_of": best_of,
             "max_new_tokens": output_len,
@@ -163,15 +160,12 @@ async def benchmark(
     api_url: str,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
 ) -> None:
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(
-            send_request(backend, api_url, prompt, prompt_len, output_len, best_of, use_beam_search)
-        )
+        task = asyncio.create_task(send_request(backend, api_url, prompt, prompt_len, output_len, best_of))
         tasks.append(task)
     await asyncio.gather(*tasks)

@@ -192,7 +186,7 @@ def main(args: argparse.Namespace):
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

     benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of, args.use_beam_search, args.request_rate))
+    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of, args.request_rate))
     benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")  # noqa: T201
@@ -222,7 +216,6 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--best-of", type=int, default=1, help="Generates `best_of` sequences per prompt and returns the best one."
     )
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.")
     parser.add_argument(
         "--request-rate",
diff --git a/tests/functional/L1_vllm_integration/server.py b/tests/functional/L1_vllm_integration/server.py
index 1f4de43a..cbaeac9b 100755
--- a/tests/functional/L1_vllm_integration/server.py
+++ b/tests/functional/L1_vllm_integration/server.py
@@ -95,7 +95,6 @@ async def generate_fn(requests):
         Tensor(name="prompt", dtype=bytes, shape=(1,)),
         Tensor(name="n", dtype=np.int32, shape=(1,)),
         Tensor(name="best_of", dtype=np.int32, shape=(1,)),
-        Tensor(name="use_beam_search", dtype=np.bool_, shape=(1,)),
         Tensor(name="temperature", dtype=np.float32, shape=(1,)),
         Tensor(name="top_p", dtype=np.float32, shape=(1,)),
         Tensor(name="max_tokens", dtype=np.int32, shape=(1,)),
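---

Note (outside the patch proper): recent vLLM releases removed the
`use_beam_search` flag from `SamplingParams`, which is presumably why the
tensor and the `--use-beam-search` CLI flag are dropped above. As a rough
sketch of the sampling configuration the example servers can still build
from the remaining request fields (the values below are illustrative, not
taken from the patch):

    from vllm import SamplingParams

    # Mirrors the benchmark's settings: temperature 0.0 plus top_p 1.0
    # forces greedy decoding, so repeated runs produce identical output.
    sampling_params = SamplingParams(
        n=1,
        best_of=1,
        temperature=0.0,
        top_p=1.0,
        max_tokens=128,  # illustrative; the servers read this per request
    )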