From 569c9051e46243e36d0b6fbbe5c6e3f9df2956f3 Mon Sep 17 00:00:00 2001 From: Domenic Barbuzzi Date: Fri, 28 Jun 2024 16:39:56 -0400 Subject: [PATCH] Benchmarking update - phase 1 (#339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR updates the benchmarking performed in remote-push and nightly runs according to the first set of deliverables from our recent meeting: * Only the `benchmark_serving.json` config is run * This is accomplished with a new list, `nm_benchmark_base_config_list.txt`; other lists are untouched * The `benchmark_serving.json` has various reductions: * Model list reduced to `facebook/opt-350m` and `meta-llama/Meta-Llama-3-8B-Instruct` * `nr-qps-pair_` list reduced to `300,1` * Metric tracking reduced to mean TPOT and mean TTFT (other metrics are still recorded/logged as usual) There is also a small fix related to server startup: the benchmark server host changes from `localhost` to `127.0.0.1`, because `localhost` on the machines resolves to the IPv6 address `::1`, which something in the server stack doesn’t seem to handle. 
In a commit prior to opening the PR with all functional changes, the full `benchmark` job took <30 min: https://github.com/neuralmagic/nm-vllm/actions/runs/9669361155/job/26709082658 --- .../data/nm_benchmark_base_config_list.txt | 1 + .github/workflows/nm-nightly.yml | 2 +- .github/workflows/nm-remote-push.yml | 2 +- .../benchmarks/configs/benchmark_serving.json | 89 ++++--------------- .../benchmarks/run_benchmark_serving.py | 2 +- .../benchmarks/scripts/benchmark_serving.py | 19 +--- 6 files changed, 22 insertions(+), 93 deletions(-) create mode 100644 .github/data/nm_benchmark_base_config_list.txt diff --git a/.github/data/nm_benchmark_base_config_list.txt b/.github/data/nm_benchmark_base_config_list.txt new file mode 100644 index 0000000000000..8945192390c2a --- /dev/null +++ b/.github/data/nm_benchmark_base_config_list.txt @@ -0,0 +1 @@ +neuralmagic/benchmarks/configs/benchmark_serving.json diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 841f71ef71914..434f2b9032b1b 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -40,7 +40,7 @@ jobs: test_timeout: 480 benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt benchmark_timeout: 480 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 41398f0625d16..a44274d9e8a11 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -26,7 +26,7 @@ jobs: test_timeout: 480 benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt benchmark_timeout: 480 lm_eval_label: 
gcp-k8s-l4-solo diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json index 4e7b4e50c3e4a..5c51b98b64453 100644 --- a/neuralmagic/benchmarks/configs/benchmark_serving.json +++ b/neuralmagic/benchmarks/configs/benchmark_serving.json @@ -1,74 +1,19 @@ { - "configs": [ - { - "description": "VLLM Serving - Dense", - "models": [ - "teknium/OpenHermes-2.5-Mistral-7B", - "NousResearch/Llama-2-7b-chat-hf", - "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin", - "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ" - ], - "use_all_available_gpus" : "", - "max_model_lens": [ - 4096 - ], - "sparsity": [], - "script_name": "benchmark_serving", - "script_args": { - "nr-qps-pair_": [ - "150,0.5", - "300,1", - "750,2.5", - "1500,5" - ], - "dataset": [ - "sharegpt" - ] - } - }, - { - "description": "VLLM Serving - Sparse", - "models": [ - "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50" - ], - "use_all_available_gpus" : "", - "max_model_lens": [ - 4096 - ], - "sparsity": ["sparse_w16a16"], - "script_name": "benchmark_serving", - "script_args": { - "nr-qps-pair_": [ - "300,1", - "750,2.5", - "1500,5" - ], - "dataset": [ - "sharegpt" - ] - } - }, - { - "description": "VLLM Serving - 2:4 Sparse", - "models": [ - "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4" - ], - "use_all_available_gpus" : "", - "max_model_lens": [ - 4096 - ], - "sparsity": ["semi_structured_sparse_w16a16"], - "script_name": "benchmark_serving", - "script_args": { - "nr-qps-pair_": [ - "150,0.5", - "750,2.5", - "1500,5" - ], - "dataset": [ - "sharegpt" - ] - } - } - ] + "configs": [ + { + "description": "VLLM Serving - Dense", + "models": [ + "facebook/opt-350m", + "meta-llama/Meta-Llama-3-8B-Instruct" + ], + "use_all_available_gpus": "", + "max_model_lens": [4096], + "sparsity": [], + "script_name": "benchmark_serving", + "script_args": { + "nr-qps-pair_": ["300,1"], + "dataset": ["sharegpt"] + } + } + ] } diff --git 
a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index e16da7d0b5e38..74f5b641a8166 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -13,7 +13,7 @@ max_model_length_from_model_id, script_args_to_cla) from .scripts.common import num_available_gpus, warmup_server -BENCH_SERVER_HOST = "localhost" +BENCH_SERVER_HOST = "127.0.0.1" BENCH_SERVER_PORT = 9000 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index 3a78e2b850c39..62d3cc3f1231f 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -92,25 +92,8 @@ def update_benchmark_result_metadata( def update_benchmark_result_metrics( self, result: BenchmarkResult) -> BenchmarkResult: rmt = ResultMetricTemplates - result.add_metric(rmt.request_throughput, - self.metrics.request_throughput) - result.add_metric(rmt.input_throughput, self.metrics.input_throughput) - result.add_metric(rmt.output_throughput, - self.metrics.output_throughput) - result.add_metric(rmt.median_request_latency, - self.metrics.median_request_latency) - result.add_metric(rmt.p90_request_latency, - self.metrics.p90_request_latency) - result.add_metric(rmt.p99_request_latency, - self.metrics.p99_request_latency) result.add_metric(rmt.mean_ttft_ms, self.metrics.mean_ttft_ms) - result.add_metric(rmt.median_ttft_ms, self.metrics.median_ttft_ms) - result.add_metric(rmt.p90_ttft_ms, self.metrics.p90_ttft_ms) - result.add_metric(rmt.p99_ttft_ms, self.metrics.p99_ttft_ms) result.add_metric(rmt.mean_tpot_ms, self.metrics.mean_tpot_ms) - result.add_metric(rmt.median_tpot_ms, self.metrics.median_tpot_ms) - result.add_metric(rmt.p90_tpot_ms, self.metrics.p90_tpot_ms) - result.add_metric(rmt.p99_tpot_ms, self.metrics.p99_tpot_ms) return result def update_benchmark_result(self, @@ -509,7 +492,7 @@ def 
from_str(arg: str): help=""" First argument in the pair is num_prompts to process. Second argument in the pair is request_rate per second. - If this is inf, then all the requests are sent at time 0. + If this is inf, then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize""", default=None)