From c90927b8be4bc8f47deecd531819651ee0b55e98 Mon Sep 17 00:00:00 2001
From: "rshaw@neuralmagic.com" <rshaw@neuralmagic.com>
Date: Wed, 28 Aug 2024 20:10:34 +0000
Subject: [PATCH] fixt

---
 benchmarks/benchmark_throughput.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 0c30f5877aae8..eaf256f7cb8c2 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -87,7 +87,6 @@ def run_vllm(
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
-    max_num_seqs: Optional[int] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -241,7 +240,7 @@ def main(args: argparse.Namespace):
             args.max_num_batched_tokens, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.num_scheduler_steps,
             args.use_v2_block_manager, args.download_dir, args.load_format,
-            args.disable_async_output_proc, args.max_num_seqs)
+            args.disable_async_output_proc)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -399,11 +398,6 @@ def main(args: argparse.Namespace):
         help='Backend to use for distributed serving. When more than 1 GPU '
         'is used, will be automatically set to "ray" if installed '
         'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--max-num-seqs',
-        type=int,
-        default=None,
-        help='Max num seqs.')
     parser.add_argument(
         '--load-format',
         type=str,