Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
Benchmarking update - phase 1 (#339)
Browse files Browse the repository at this point in the history
This PR updates the benchmarking performed in remote-push and nightly
runs according to the first set of deliverables from our recent meeting:

* Only the `benchmark_serving.json` config is run
* This is accomplished with a new list,
`nm_benchmark_base_config_list.txt`; other lists are untouched
* The `benchmark_serving.json` has various reductions:
  * Model list reduced to `facebook/opt-350m` and
    `meta-llama/Meta-Llama-3-8B-Instruct`
  * `nr-qps-pair_` list reduced to `300,1`
* Metric tracking reduced to mean TPOT and mean TTFT (the other metrics
are still recorded and logged as usual)

There is also a small fix related to server startup: the benchmark host was
changed from `localhost` to `127.0.0.1`, because on the benchmark machines
`localhost` resolves to the IPv6 address `::1`, which the server stack does
not appear to handle correctly.

In a commit prior to opening the PR with all functional changes, the
full `benchmark` job took <30 min:

https://github.com/neuralmagic/nm-vllm/actions/runs/9669361155/job/26709082658
  • Loading branch information
dbarbuzzi authored Jun 28, 2024
1 parent b42a940 commit 569c905
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 93 deletions.
1 change: 1 addition & 0 deletions .github/data/nm_benchmark_base_config_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
2 changes: 1 addition & 1 deletion .github/workflows/nm-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
test_timeout: 480

benchmark_label: gcp-k8s-l4-solo
benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
benchmark_timeout: 480
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nm-remote-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
test_timeout: 480

benchmark_label: gcp-k8s-l4-solo
benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
benchmark_timeout: 480

lm_eval_label: gcp-k8s-l4-solo
Expand Down
89 changes: 17 additions & 72 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
Original file line number Diff line number Diff line change
@@ -1,74 +1,19 @@
{
"configs": [
{
"description": "VLLM Serving - Dense",
"models": [
"teknium/OpenHermes-2.5-Mistral-7B",
"NousResearch/Llama-2-7b-chat-hf",
"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"300,1",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"300,1",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - 2:4 Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["semi_structured_sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"750,2.5",
"1500,5"
],
"dataset": [
"sharegpt"
]
}
}
]
"configs": [
{
"description": "VLLM Serving - Dense",
"models": [
"facebook/opt-350m",
"meta-llama/Meta-Llama-3-8B-Instruct"
],
"use_all_available_gpus": "",
"max_model_lens": [4096],
"sparsity": [],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": ["300,1"],
"dataset": ["sharegpt"]
}
}
]
}
2 changes: 1 addition & 1 deletion neuralmagic/benchmarks/run_benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
max_model_length_from_model_id, script_args_to_cla)
from .scripts.common import num_available_gpus, warmup_server

BENCH_SERVER_HOST = "localhost"
BENCH_SERVER_HOST = "127.0.0.1"
BENCH_SERVER_PORT = 9000


Expand Down
19 changes: 1 addition & 18 deletions neuralmagic/benchmarks/scripts/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,25 +92,8 @@ def update_benchmark_result_metadata(
def update_benchmark_result_metrics(
self, result: BenchmarkResult) -> BenchmarkResult:
rmt = ResultMetricTemplates
result.add_metric(rmt.request_throughput,
self.metrics.request_throughput)
result.add_metric(rmt.input_throughput, self.metrics.input_throughput)
result.add_metric(rmt.output_throughput,
self.metrics.output_throughput)
result.add_metric(rmt.median_request_latency,
self.metrics.median_request_latency)
result.add_metric(rmt.p90_request_latency,
self.metrics.p90_request_latency)
result.add_metric(rmt.p99_request_latency,
self.metrics.p99_request_latency)
result.add_metric(rmt.mean_ttft_ms, self.metrics.mean_ttft_ms)
result.add_metric(rmt.median_ttft_ms, self.metrics.median_ttft_ms)
result.add_metric(rmt.p90_ttft_ms, self.metrics.p90_ttft_ms)
result.add_metric(rmt.p99_ttft_ms, self.metrics.p99_ttft_ms)
result.add_metric(rmt.mean_tpot_ms, self.metrics.mean_tpot_ms)
result.add_metric(rmt.median_tpot_ms, self.metrics.median_tpot_ms)
result.add_metric(rmt.p90_tpot_ms, self.metrics.p90_tpot_ms)
result.add_metric(rmt.p99_tpot_ms, self.metrics.p99_tpot_ms)
return result

def update_benchmark_result(self,
Expand Down Expand Up @@ -509,7 +492,7 @@ def from_str(arg: str):
help="""
First argument in the pair is num_prompts to process.
Second argument in the pair is request_rate per second.
If this is inf, then all the requests are sent at time 0.
If this is inf, then all the requests are sent at time 0.
Otherwise, we use Poisson process to synthesize""",
default=None)

Expand Down

0 comments on commit 569c905

Please sign in to comment.