From 569c9051e46243e36d0b6fbbe5c6e3f9df2956f3 Mon Sep 17 00:00:00 2001
From: Domenic Barbuzzi <dbarbuzzi@gmail.com>
Date: Fri, 28 Jun 2024 16:39:56 -0400
Subject: [PATCH] Benchmarking update - phase 1 (#339)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR updates the benchmarking performed in remote-push and nightly
runs according to the first set of deliverables from our recent meeting:

* Only the `benchmark_serving.json` config is run
  * This is accomplished with a new list,
    `nm_benchmark_base_config_list.txt`; other lists are untouched
* The `benchmark_serving.json` has various reductions:
  * Model list reduced to `facebook/opt-350m` and
    `meta-llama/Meta-Llama-3-8B-Instruct`
  * `nr-qps` list reduced to `300,1`
* Metric tracking reduced to mean TPOT and mean TTFT (other metrics
are still recorded/logged as usual)

There is also a small fix related to server startup: the benchmark
server host is changed from `localhost` to `127.0.0.1`, because
`localhost` on the machines is mapped to the IPv6 address `::1`, which
something in the server stack doesn’t seem to handle.

In a commit prior to opening the PR with all functional changes, the
full `benchmark` job took <30 min:

https://github.com/neuralmagic/nm-vllm/actions/runs/9669361155/job/26709082658
---
 .../data/nm_benchmark_base_config_list.txt    |  1 +
 .github/workflows/nm-nightly.yml              |  2 +-
 .github/workflows/nm-remote-push.yml          |  2 +-
 .../benchmarks/configs/benchmark_serving.json | 89 ++++---------------
 .../benchmarks/run_benchmark_serving.py       |  2 +-
 .../benchmarks/scripts/benchmark_serving.py   | 19 +---
 6 files changed, 22 insertions(+), 93 deletions(-)
 create mode 100644 .github/data/nm_benchmark_base_config_list.txt

diff --git a/.github/data/nm_benchmark_base_config_list.txt b/.github/data/nm_benchmark_base_config_list.txt
new file mode 100644
index 0000000000000..8945192390c2a
--- /dev/null
+++ b/.github/data/nm_benchmark_base_config_list.txt
@@ -0,0 +1 @@
+neuralmagic/benchmarks/configs/benchmark_serving.json
diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
index 841f71ef71914..434f2b9032b1b 100644
--- a/.github/workflows/nm-nightly.yml
+++ b/.github/workflows/nm-nightly.yml
@@ -40,7 +40,7 @@ jobs:
             test_timeout: 480
 
             benchmark_label: gcp-k8s-l4-solo
-            benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
+            benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
             push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
 
diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
index 41398f0625d16..a44274d9e8a11 100644
--- a/.github/workflows/nm-remote-push.yml
+++ b/.github/workflows/nm-remote-push.yml
@@ -26,7 +26,7 @@ jobs:
             test_timeout: 480
 
             benchmark_label: gcp-k8s-l4-solo
-            benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
+            benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
 
             lm_eval_label: gcp-k8s-l4-solo
diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json
index 4e7b4e50c3e4a..5c51b98b64453 100644
--- a/neuralmagic/benchmarks/configs/benchmark_serving.json
+++ b/neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -1,74 +1,19 @@
 {
-	"configs": [
-		{
-			"description": "VLLM Serving - Dense",
-			"models": [
-                          "teknium/OpenHermes-2.5-Mistral-7B",
-                          "NousResearch/Llama-2-7b-chat-hf",
-                          "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
-                          "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
-			],
-			"use_all_available_gpus" : "",
-			"max_model_lens": [
-				4096
-			],
-			"sparsity": [],
-			"script_name": "benchmark_serving",
-			"script_args": {
-				"nr-qps-pair_": [
-                                        "150,0.5",
-                                        "300,1",
-                                        "750,2.5",
-                                        "1500,5"
-				],
-				"dataset": [
-					"sharegpt"
-				]
-			}
-		},
-                {
-			"description": "VLLM Serving - Sparse",
-			"models": [
-                          "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
-			],
-			"use_all_available_gpus" : "",
-			"max_model_lens": [
-				4096
-			],
-			"sparsity": ["sparse_w16a16"],
-			"script_name": "benchmark_serving",
-			"script_args": {
-				"nr-qps-pair_": [
-                                        "300,1",
-                                        "750,2.5",
-                                        "1500,5"
-				],
-				"dataset": [
-					"sharegpt"
-				]
-			}
-		},
-                {
-			"description": "VLLM Serving - 2:4 Sparse",
-			"models": [
-                          "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
-			],
-			"use_all_available_gpus" : "",
-			"max_model_lens": [
-				4096
-			],
-			"sparsity": ["semi_structured_sparse_w16a16"],
-			"script_name": "benchmark_serving",
-			"script_args": {
-				"nr-qps-pair_": [
-                                        "150,0.5",
-                                        "750,2.5",
-                                        "1500,5"
-				],
-				"dataset": [
-					"sharegpt"
-				]
-			}
-		}
-	]
+  "configs": [
+    {
+      "description": "VLLM Serving - Dense",
+      "models": [
+        "facebook/opt-350m",
+        "meta-llama/Meta-Llama-3-8B-Instruct"
+      ],
+      "use_all_available_gpus": "",
+      "max_model_lens": [4096],
+      "sparsity": [],
+      "script_name": "benchmark_serving",
+      "script_args": {
+        "nr-qps-pair_": ["300,1"],
+        "dataset": ["sharegpt"]
+      }
+    }
+  ]
 }
diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py
index e16da7d0b5e38..74f5b641a8166 100644
--- a/neuralmagic/benchmarks/run_benchmark_serving.py
+++ b/neuralmagic/benchmarks/run_benchmark_serving.py
@@ -13,7 +13,7 @@
                      max_model_length_from_model_id, script_args_to_cla)
 from .scripts.common import num_available_gpus, warmup_server
 
-BENCH_SERVER_HOST = "localhost"
+BENCH_SERVER_HOST = "127.0.0.1"
 BENCH_SERVER_PORT = 9000
 
 
diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py
index 3a78e2b850c39..62d3cc3f1231f 100644
--- a/neuralmagic/benchmarks/scripts/benchmark_serving.py
+++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -92,25 +92,8 @@ def update_benchmark_result_metadata(
     def update_benchmark_result_metrics(
             self, result: BenchmarkResult) -> BenchmarkResult:
         rmt = ResultMetricTemplates
-        result.add_metric(rmt.request_throughput,
-                          self.metrics.request_throughput)
-        result.add_metric(rmt.input_throughput, self.metrics.input_throughput)
-        result.add_metric(rmt.output_throughput,
-                          self.metrics.output_throughput)
-        result.add_metric(rmt.median_request_latency,
-                          self.metrics.median_request_latency)
-        result.add_metric(rmt.p90_request_latency,
-                          self.metrics.p90_request_latency)
-        result.add_metric(rmt.p99_request_latency,
-                          self.metrics.p99_request_latency)
         result.add_metric(rmt.mean_ttft_ms, self.metrics.mean_ttft_ms)
-        result.add_metric(rmt.median_ttft_ms, self.metrics.median_ttft_ms)
-        result.add_metric(rmt.p90_ttft_ms, self.metrics.p90_ttft_ms)
-        result.add_metric(rmt.p99_ttft_ms, self.metrics.p99_ttft_ms)
         result.add_metric(rmt.mean_tpot_ms, self.metrics.mean_tpot_ms)
-        result.add_metric(rmt.median_tpot_ms, self.metrics.median_tpot_ms)
-        result.add_metric(rmt.p90_tpot_ms, self.metrics.p90_tpot_ms)
-        result.add_metric(rmt.p99_tpot_ms, self.metrics.p99_tpot_ms)
         return result
 
     def update_benchmark_result(self,
@@ -509,7 +492,7 @@ def from_str(arg: str):
                         help="""
         First argument in the pair is num_prompts to process.
         Second argument in the pair is request_rate per second.
-            If this is inf, then all the requests are sent at time 0. 
+            If this is inf, then all the requests are sent at time 0.
             Otherwise, we use Poisson process to synthesize""",
                         default=None)