
Commit 164ca81
Remove use_beam_search parameter from vLLM example
piotrm-nvidia committed Nov 14, 2024
1 parent 75798ba commit 164ca81
Showing 3 changed files with 2 additions and 11 deletions.
1 change: 0 additions & 1 deletion examples/vllm/server.py
@@ -97,7 +97,6 @@ async def generate_fn(requests: List[Request]) -> AsyncGenerator[List[Dict[str,
         Tensor(name="prompt", dtype=bytes, shape=(1,)),
         Tensor(name="n", dtype=np.int32, shape=(1,), optional=True),
         Tensor(name="best_of", dtype=np.int32, shape=(1,), optional=True),
-        Tensor(name="use_beam_search", dtype=np.bool_, shape=(1,), optional=True),
         Tensor(name="temperature", dtype=np.float32, shape=(1,), optional=True),
         Tensor(name="top_p", dtype=np.float32, shape=(1,), optional=True),
         Tensor(name="max_tokens", dtype=np.int32, shape=(1,), optional=True),
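
For context, recent vLLM releases have been phasing out the use_beam_search flag in SamplingParams, which is presumably why the example drops the input. Below is a minimal sketch, not the example's actual code, of how the remaining optional inputs might be mapped to vLLM sampling parameters after this change; the build_sampling_params helper and its default values are assumptions for illustration only.

from vllm import SamplingParams


def build_sampling_params(request: dict) -> SamplingParams:
    # Assumption: each optional input arrives as a shape-(1,) NumPy array;
    # fall back to illustrative defaults when it is absent.
    def scalar(name, default):
        value = request.get(name)
        return default if value is None else value.item()

    return SamplingParams(
        n=int(scalar("n", 1)),
        best_of=int(scalar("best_of", 1)),
        temperature=float(scalar("temperature", 1.0)),
        top_p=float(scalar("top_p", 1.0)),
        max_tokens=int(scalar("max_tokens", 16)),
    )
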
11 changes: 2 additions & 9 deletions tests/functional/L1_vllm_integration/benchmark_serving.py
@@ -102,7 +102,6 @@ async def send_request(
     prompt_len: int,
     output_len: int,
     best_of: int,
-    use_beam_search: bool,
 ) -> None:
     request_start_time = time.perf_counter()
 
@@ -112,15 +111,13 @@
             "prompt": prompt,
             "n": 1,
             "best_of": best_of,
-            "use_beam_search": use_beam_search,
             "temperature": 0.0, # force greedy decoding for same results
             "top_p": 1.0,
             "max_tokens": output_len,
             "ignore_eos": True,
             "stream": False,
         }
     elif backend == "tgi":
-        assert not use_beam_search
         params = {
             "best_of": best_of,
             "max_new_tokens": output_len,
@@ -163,15 +160,12 @@ async def benchmark(
     api_url: str,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
-    use_beam_search: bool,
     request_rate: float,
 ) -> None:
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(
-            send_request(backend, api_url, prompt, prompt_len, output_len, best_of, use_beam_search)
-        )
+        task = asyncio.create_task(send_request(backend, api_url, prompt, prompt_len, output_len, best_of))
         tasks.append(task)
     await asyncio.gather(*tasks)
 
@@ -192,7 +186,7 @@ def main(args: argparse.Namespace):
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of, args.use_beam_search, args.request_rate))
+    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of, args.request_rate))
     benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s") # noqa: T201
@@ -222,7 +216,6 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--best-of", type=int, default=1, help="Generates `best_of` sequences per prompt and returns the best one."
     )
-    parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument("--num-prompts", type=int, default=1000, help="Number of prompts to process.")
     parser.add_argument(
         "--request-rate",
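
The get_request helper used in the benchmark loop above is not shown in this diff. A common way to implement such a generator, sketched below as an assumption rather than the repository's code, is to pace requests with exponentially distributed delays so that --request-rate sets a Poisson arrival rate.

import asyncio
import random
from typing import AsyncGenerator, List, Tuple


async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    for request in input_requests:
        yield request
        if request_rate == float("inf"):
            # No throttling: issue all requests back to back.
            continue
        # Exponential inter-arrival times give a Poisson request process.
        await asyncio.sleep(random.expovariate(request_rate))
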
1 change: 0 additions & 1 deletion tests/functional/L1_vllm_integration/server.py
@@ -95,7 +95,6 @@ async def generate_fn(requests):
         Tensor(name="prompt", dtype=bytes, shape=(1,)),
         Tensor(name="n", dtype=np.int32, shape=(1,)),
         Tensor(name="best_of", dtype=np.int32, shape=(1,)),
-        Tensor(name="use_beam_search", dtype=np.bool_, shape=(1,)),
         Tensor(name="temperature", dtype=np.float32, shape=(1,)),
         Tensor(name="top_p", dtype=np.float32, shape=(1,)),
         Tensor(name="max_tokens", dtype=np.int32, shape=(1,)),
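
To illustrate the updated interface, a request to this test server now carries only the tensors listed above, with use_beam_search gone. The arrays below are an illustrative assumption with arbitrary values, and the truncated hunk may hide additional inputs.

import numpy as np

# Illustrative request inputs after the change; dtypes and shapes follow the
# Tensor definitions above.
inputs = {
    "prompt": np.array([b"What is machine learning?"]),  # bytes, shape (1,)
    "n": np.array([1], dtype=np.int32),
    "best_of": np.array([1], dtype=np.int32),
    "temperature": np.array([0.0], dtype=np.float32),
    "top_p": np.array([1.0], dtype=np.float32),
    "max_tokens": np.array([64], dtype=np.int32),
}
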
