From 998d93620b2a225a410987da174fe89ebbeb9435 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Wed, 9 Oct 2024 21:30:18 +0000
Subject: [PATCH] pull in all engine args

---
 examples/offline_profile.py | 78 ++++---------------------------------
 1 file changed, 8 insertions(+), 70 deletions(-)

diff --git a/examples/offline_profile.py b/examples/offline_profile.py
index a16e0abd5b687..c9fceb03848c1 100644
--- a/examples/offline_profile.py
+++ b/examples/offline_profile.py
@@ -9,7 +9,7 @@
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.engine.arg_utils import EngineArgs
 from vllm.profiler import layerwise_profile
 from vllm.utils import FlexibleArgumentParser
 
@@ -20,18 +20,10 @@
 
 @dataclass
 class ProfileContext:
-    model: str
-    tokenizer: str
-    model_revision: str
-    quantization: str
-    max_model_len: int
-    max_num_batched_tokens: int
+    engine_args: EngineArgs
     prompt_len: int
     output_len: int
     batch_size: int
-    dtype: str
-    tensor_parallel_size: int
-    allow_cuda_graphs: bool
     save_traces_folder: Optional[str]
 
 
@@ -55,18 +47,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
                                      ignore_eos=True)
 
     # Create LLM
-    llm = LLM(model=context.model,
-              tokenizer=context.tokenizer
-              if context.tokenizer is not None else context.model,
-              revision=context.model_revision,
-              enforce_eager=not context.allow_cuda_graphs,
-              tensor_parallel_size=context.tensor_parallel_size,
-              gpu_memory_utilization=0.9,
-              max_model_len=context.max_model_len,
-              quantization=context.quantization,
-              dtype=get_dtype(context.dtype),
-              max_num_batched_tokens=context.max_num_batched_tokens)
-
+    llm = LLM(**asdict(context.engine_args))
     batch_size = context.batch_size
     prompt_len = context.prompt_len
     output_len = context.output_len
@@ -232,8 +213,8 @@ def abort_requests():
 ```
 python examples/offline_profile.py \\
     --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
-    --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8
-
+    --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
+    --enforce-eager
 ```
 
 then you can use various tools to analyze the json output
@@ -250,18 +231,6 @@ def abort_requests():
 ```
 """,
                                     formatter_class=RawTextHelpFormatter)
-
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help='The name or path of a HuggingFace Transformers model.')
-    parser.add_argument("--tokenizer",
-                        type=str,
-                        default=None,
-                        help="path to the tokenizer")
-
-    parser.add_argument("--model-revision", type=str, default=None)
     parser.add_argument(
         "--csv",
         type=str,
         default=None,
         help="Export the results as multiple csv file. This should be the root "
         "filename, will create <filename>_prefill_model_table.csv, "
         "<filename>_prefill_summary_table.csv, "
         "<filename>_decode_model_table.csv, and "
         "<filename>_decode_summary_table.csv")
     parser.add_argument(
         "--json",
         type=str,
         default=None,
         help="Export the results as a json file. This should be the filename")
     parser.add_argument("--save-chrome-traces-folder",
                         type=str,
                         help="Save chrome traces for the prefill and decode "
                         "will save traces as prefill.json and decode_1.json, "
                         "etc. inside this folder")
-    parser.add_argument("--quantization",
-                        "-q",
-                        type=str,
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument("--dtype",
-                        type=str,
-                        default='auto',
-                        help="model dtype")
-    parser.add_argument(
-        "--max-model-len",
-        type=int,
-        default=None,
-        help="Maximum length of a sequence (including prompt and output)")
-    parser.add_argument(
-        "--max-num-batched-tokens",
-        type=int,
-        default=None,
-        help="Maximum number of tokens to be processed in a single iteration. "
-        " Should be greater than batch-size * prompt-len so the prefill can "
-        " run in a single iteration.")
     parser.add_argument(
         "--prompt-len",
         type=int,
@@ -313,19 +261,6 @@ def abort_requests():
                         default=BATCH_SIZE_DEFAULT,
                         help=f"Number of requests to run as a single batch, "
                         f"default={BATCH_SIZE_DEFAULT}")
-    parser.add_argument("--tensor-parallel-size",
-                        "-tp",
-                        type=int,
-                        default=1,
-                        help="Number of GPUs to use i.e. tensor parallelism, "
-                        "default=1")
-    parser.add_argument(
-        "--allow-cuda-graphs",
-        action='store_true',
-        help="Enables cuda graphs to be used. This will remove a lot of the "
-        "module level info in the profiler results since almost everything "
-        "runs in the graph where we do not have access to an informative stack "
-        "trace")
     parser.add_argument(
         "--output-len",
         type=int,
         default=OUTPUT_LEN_DEFAULT,
         help="Number of llm steps to run (includes prefill and decode) "
         "- default={OUTPUT_LEN_DEFAULT}")
 
+    EngineArgs.add_cli_args(parser)
+
     args = parser.parse_args()
 
     context = ProfileContext(
+        engine_args=EngineArgs.from_cli_args(args),
         **{
             k: v
             for k, v in vars(args).items()