Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
Benchmarking update - phase 1 (#339)
Browse files Browse the repository at this point in the history
This PR updates the benchmarking performed in remote-push and nightly
runs according to the first set of deliverables from our recent meeting:

* Only the `benchmark_serving.json` config is run
* This is accomplished with a new list,
`nm_benchmark_base_config_list.txt`; other lists are untouched
* The `benchmark_serving.json` has various reductions:
  * Model list reduced to `facebook/opt-350m` and
    `meta-llama/Meta-Llama-3-8B-Instruct`
  * `nr-qps-pair_` list reduced to `300,1`
* Metric tracking reduced to mean TPOT and mean TTFT (the other metrics
are still recorded and logged as usual)

There is also a small fix related to server startup: the benchmark host was
changed from `localhost` to `127.0.0.1`, because on the benchmark machines
`localhost` resolves to the IPv6 address `::1`, which the server stack does
not appear to handle correctly.

In a commit prior to opening the PR with all functional changes, the
full `benchmark` job took <30 min:

https://github.com/neuralmagic/nm-vllm/actions/runs/9669361155/job/26709082658
  • Loading branch information
dbarbuzzi authored Jun 28, 2024
1 parent b42a940 commit 569c905
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 93 deletions.
1 change: 1 addition & 0 deletions .github/data/nm_benchmark_base_config_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
2 changes: 1 addition & 1 deletion .github/workflows/nm-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
test_timeout: 480

benchmark_label: gcp-k8s-l4-solo
benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
benchmark_timeout: 480
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nm-remote-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
test_timeout: 480

benchmark_label: gcp-k8s-l4-solo
benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
benchmark_timeout: 480

lm_eval_label: gcp-k8s-l4-solo
Expand Down
89 changes: 17 additions & 72 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
Original file line number Diff line number Diff line change
@@ -1,74 +1,19 @@
{
"configs": [
{
"description": "VLLM Serving - Dense",
"models": [
"teknium/OpenHermes-2.5-Mistral-7B",
"NousResearch/Llama-2-7b-chat-hf",
"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"300,1",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"300,1",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - 2:4 Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["semi_structured_sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
}
]
"configs": [
{
"description": "VLLM Serving - Dense",
"models": [
"facebook/opt-350m",
"meta-llama/Meta-Llama-3-8B-Instruct"
],
"use_all_available_gpus": "",
"max_model_lens": [4096],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": ["300,1"],
"dataset": ["sharegpt"]
}
}
]
}
2 changes: 1 addition & 1 deletion neuralmagic/benchmarks/run_benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
max_model_length_from_model_id, script_args_to_cla)
from .scripts.common import num_available_gpus, warmup_server

BENCH_SERVER_HOST = "localhost"
BENCH_SERVER_HOST = "127.0.0.1"
BENCH_SERVER_PORT = 9000


Expand Down
19 changes: 1 addition & 18 deletions neuralmagic/benchmarks/scripts/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,25 +92,8 @@ def update_benchmark_result_metadata(
def update_benchmark_result_metrics(
self, result: BenchmarkResult) -> BenchmarkResult:
rmt = ResultMetricTemplates
result.add_metric(rmt.request_throughput,
self.metrics.request_throughput)
result.add_metric(rmt.input_throughput, self.metrics.input_throughput)
result.add_metric(rmt.output_throughput,
self.metrics.output_throughput)
result.add_metric(rmt.median_request_latency,
self.metrics.median_request_latency)
result.add_metric(rmt.p90_request_latency,
self.metrics.p90_request_latency)
result.add_metric(rmt.p99_request_latency,
self.metrics.p99_request_latency)
result.add_metric(rmt.mean_ttft_ms, self.metrics.mean_ttft_ms)
result.add_metric(rmt.median_ttft_ms, self.metrics.median_ttft_ms)
result.add_metric(rmt.p90_ttft_ms, self.metrics.p90_ttft_ms)
result.add_metric(rmt.p99_ttft_ms, self.metrics.p99_ttft_ms)
result.add_metric(rmt.mean_tpot_ms, self.metrics.mean_tpot_ms)
result.add_metric(rmt.median_tpot_ms, self.metrics.median_tpot_ms)
result.add_metric(rmt.p90_tpot_ms, self.metrics.p90_tpot_ms)
result.add_metric(rmt.p99_tpot_ms, self.metrics.p99_tpot_ms)
return result

def update_benchmark_result(self,
Expand Down Expand Up @@ -509,7 +492,7 @@ def from_str(arg: str):
help="""
First argument in the pair is num_prompts to process.
Second argument in the pair is request_rate per second.
If this is inf, then all the requests are sent at time 0.
If this is inf, then all the requests are sent at time 0.
Otherwise, we use Poisson process to synthesize""",
default=None)

Expand Down

0 comments on commit 569c905

Please sign in to comment.