"""Run Llama 2 inference benchmarks and append the results to results.jsonl.

The script clones and sets up the ``pytorch-tpu/llama`` inference branch inside
the given workspace, runs ``example_text_completion.py`` for every model size
and backend combination, parses the per-token latency from each run's log and
appends one JSON record per run to ``<workspace>/experiment_results/results.jsonl``.
"""
# Recovered from the patch "benchmarks: add script to run llama2 inference
# benchmarks" (commit 23c781d36bb9), which creates benchmarks/llama.py.

import argparse
import datetime
import json
import os
import re
import subprocess
import sys
from typing import Optional

# Model hyper-parameter files written into the llama checkout, keyed by
# filename. run_benchmarks() symlinks one of them to params.json per model size.
_MODEL_CONFIGS = {
    '7b.json': {
        "dim": 4096,
        "multiple_of": 256,
        "n_heads": 32,
        "n_layers": 32,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
    '13b.json': {
        "dim": 5120,
        "multiple_of": 256,
        "n_heads": 40,
        "n_layers": 40,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
    '70b.json': {
        "dim": 8192,
        "multiple_of": 4096,
        "ffn_dim_multiplier": 1.3,
        "n_heads": 64,
        "n_kv_heads": 8,
        "n_layers": 80,
        "norm_eps": 1e-05,
        "vocab_size": -1,
    },
}

_NUMBER = r'[0-9]+(?:\.[0-9]*)?(?:[eE][-+]?[0-9]+)?'
# Matches "Totally decoded <tokens> tokens in <seconds> seconds" anywhere in a
# log line, so a leading prefix on the line does not shift the fields.
_DECODE_RE = re.compile(r'Totally decoded\s+(?P<tokens>' + _NUMBER +
                        r')\s+tokens in\s+(?P<seconds>' + _NUMBER +
                        r')\s+seconds')


def parse_args():
  """Parse and return the command-line arguments."""
  parser = argparse.ArgumentParser(description='Run Llama inference benchmarks')
  parser.add_argument('--batch_size', type=int, default=1, help='Batch size.')
  parser.add_argument(
      '--repeat', type=int, default=8, help='Number of repetitions')
  parser.add_argument(
      '--workspace_dir', type=str, required=True, help='Workspace directory.')
  return parser.parse_args()


def get_info_from_result_file(results_dir: str) -> tuple[str, float]:
  """Read the accelerator model and timestamp from the first results record.

  Args:
    results_dir: Directory expected to contain ``results.jsonl``.

  Returns:
    A ``(accelerator_model, timestamp)`` tuple taken from the first line of
    the results file.

  Exits (via ``sys.exit``) when the file is missing or when its first line
  does not contain both fields.
  """
  results_file = os.path.join(results_dir, 'results.jsonl')
  if not os.path.exists(results_file):
    sys.exit(f"Results file {results_file} not found. "
             "Please run experiment_runner.py first.")
  with open(results_file, 'r') as f:
    first_line = f.readline()
  # The fields are pulled out with regexes rather than by parsing the record,
  # so their nesting level inside the JSON object does not matter here.
  acc_match = re.search(r'"accelerator_model": "([^"]+)"', first_line)
  time_match = re.search(r'"timestamp": ([0-9.]+)', first_line)
  if not (acc_match and time_match):
    # Both literals need the f-prefix; otherwise "{results_file}" is printed
    # verbatim instead of the path.
    sys.exit("Cannot find a timestamp and a matching accelerator "
             f"in {results_file}.")
  return acc_match.group(1), float(time_match.group(1))


def set_up_llama_repo(workspace_dir: str) -> str:
  """Clone and prepare the llama inference checkout, if not already present.

  Args:
    workspace_dir: Workspace directory under which ``llama-inference`` lives.

  Returns:
    The path of the llama checkout.

  Raises:
    subprocess.CalledProcessError: If cloning, installing or patching fails.
  """
  llama_dir = os.path.join(workspace_dir, 'llama-inference')
  # NOTE(review): an existing directory is taken as "fully set up". If an
  # earlier run failed after the clone, the directory has to be removed by
  # hand before this function will redo the remaining steps.
  if os.path.exists(llama_dir):
    return llama_dir

  subprocess.check_call([
      'git', 'clone', 'https://github.com/pytorch-tpu/llama.git', '--branch',
      'llama2-google-next-inference', llama_dir
  ])
  # Install through the running interpreter so the packages land in the same
  # environment that main() has validated, whatever `pip` is first on PATH.
  subprocess.check_call([
      sys.executable, '-m', 'pip', 'install', '-r',
      os.path.join(llama_dir, 'requirements.txt')
  ])
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-e', llama_dir])

  # TODO: remove once https://github.com/pytorch-tpu/llama/pull/47 is merged.
  # NOTE(review): the patch path is hard-coded; it must exist before running.
  subprocess.check_call(
      ['git', '-C', llama_dir, 'am', '/tmp/dynamo_flag.mail.patch'])

  # Create model JSON files
  for filename, config in _MODEL_CONFIGS.items():
    with open(os.path.join(llama_dir, filename), 'w') as f:
      json.dump(config, f)
  return llama_dir


def parse_log_file(log_file: str) -> list[float]:
  """Extract per-token latencies from a benchmark log.

  Every line containing "Totally decoded <tokens> tokens in <seconds> seconds"
  contributes ``seconds / tokens``. Lines that report zero tokens are skipped
  because they have no defined per-token latency.

  Args:
    log_file: Path of the log written by a benchmark run.

  Returns:
    The per-token latencies in seconds, in log order.
  """
  latencies = []
  with open(log_file, 'r') as f:
    for line in f:
      match = _DECODE_RE.search(line)
      if match is None:
        continue
      tokens = float(match['tokens'])
      if tokens == 0:
        continue
      latencies.append(float(match['seconds']) / tokens)
  return latencies


def benchmark_has_already_run(results_file: str, model_name: str,
                              xla: Optional[str], dynamo: Optional[str],
                              batch_size: int) -> bool:
  """Report whether results_file already holds a record for this configuration.

  A record matches when its model name, accelerator ('cuda'), batch size,
  dynamo backend, test kind ('eval'), xla setting and (absent) xla_flags all
  agree with the arguments. Records of failed runs match as well.

  Args:
    results_file: Path of the JSONL results file.
    model_name: Model name, e.g. "llama2.7b".
    xla: XLA setting recorded for the run, or None.
    dynamo: Dynamo backend recorded for the run, or None.
    batch_size: Batch size recorded for the run.

  Returns:
    True if a matching record exists, False otherwise.
  """
  expected = (
      ('experiment', 'accelerator', 'cuda'),
      ('experiment', 'batch_size', batch_size),
      ('experiment', 'dynamo', dynamo),
      ('experiment', 'test', 'eval'),
      ('experiment', 'xla', xla),
      ('experiment', 'xla_flags', None),
      ('model', 'model_name', model_name),
  )
  needle = f'"model_name": "{model_name}"'
  with open(results_file, 'r') as f:
    for line in f:
      # Grep for relevant lines to avoid parsing the entire JSONL file.
      if needle not in line:
        continue
      # json.loads tolerates the trailing newline, so no stripping is needed.
      record = json.loads(line)
      if all(
          record.get(section, {}).get(key) == value
          for section, key, value in expected):
        return True
  return False


def _build_command(args, llama_dir: str, dynamo: Optional[str]) -> list[str]:
  """Return the argv that runs example_text_completion.py for one backend."""
  # NOTE(review): no shell is involved, so the quote characters around the
  # --dynamo value reach the child process literally. Presumably the target
  # script's argument parser strips them -- confirm before changing.
  dynamo_arg = f'"{dynamo}"' if dynamo else "''"
  return [
      sys.executable, 'example_text_completion.py', '1', '--ckpt_dir', '.',
      '--tokenizer_path',
      os.path.join(llama_dir, 't5_tokenizer/spiece.model'), '--max_seq_len',
      '2048', '--max_gen_len', '1000', '--max_batch_size',
      f'{args.batch_size}', '--mp', 'True', '--repeat', f'{args.repeat}',
      '--dynamo', dynamo_arg
  ]


def _build_env(dynamo: Optional[str]) -> dict:
  """Return the child environment selecting a single CUDA device."""
  run_env = os.environ.copy()
  if dynamo == 'inductor':
    run_env['CUDA_VISIBLE_DEVICES'] = '0'
    run_env['USE_CUDA'] = '1'
  else:
    run_env['PJRT_DEVICE'] = 'CUDA'
    run_env['GPU_NUM_DEVICES'] = '1'
  return run_env


def run_benchmarks(args, llama_dir: str, results_dir: str,
                   accelerator_model: str, timestamp: float):
  """Run every model-size/backend combination and record the outcomes.

  Changes the process working directory to llama_dir. For each combination
  that is not yet present in ``results.jsonl`` the benchmark is run with its
  output captured in a log file under results_dir, and one JSON record
  (latencies on success, an error message on failure) is appended.

  Args:
    args: Parsed command-line arguments; ``batch_size`` and ``repeat`` are used.
    llama_dir: Path of the llama checkout.
    results_dir: Directory holding ``results.jsonl`` and receiving the logs.
    accelerator_model: Accelerator model string copied into each record.
    timestamp: Timestamp copied into each record.
  """
  os.chdir(llama_dir)
  results_file = os.path.join(results_dir, 'results.jsonl')
  params_json = 'params.json'
  for size in ['7b', '13b', '70b']:
    # lexists() also sees a dangling symlink, which exists() reports as
    # missing and which would make os.symlink() fail with FileExistsError.
    if os.path.lexists(params_json):
      os.remove(params_json)
    os.symlink(f'{size}.json', params_json)
    model_name = f"llama2.{size}"
    for dynamo in [None, 'inductor', 'openxla', 'openxla_eval']:
      backend = dynamo if dynamo else 'lazytensor'
      xla = None if dynamo == 'inductor' else 'PJRT'
      summary = f"{model_name} eval {backend} batch {args.batch_size}"

      if benchmark_has_already_run(results_file, model_name, xla, dynamo,
                                   args.batch_size):
        print(f"SKIP already completed benchmark -- {summary}")
        continue

      print(f"RUN {summary}")
      log_file = os.path.join(results_dir,
                              f'llama-inference.{backend}.{size}.log')

      cmd = _build_command(args, llama_dir, dynamo)
      run_env = _build_env(dynamo)

      run_ok = True
      with open(log_file, 'w') as f:
        try:
          subprocess.check_call(cmd, stdout=f, stderr=f, env=run_env)
        except subprocess.CalledProcessError:
          print(f"Run failed -- see {log_file}.", file=sys.stderr)
          run_ok = False

      if run_ok:
        metrics = {'total_time': parse_log_file(log_file)}
      else:
        metrics = {'error': f"Run failed -- see {log_file}."}

      result = {
          'model': {
              'suite_name': 'llama2',
              'model_name': model_name,
          },
          'experiment': {
              'accelerator': 'cuda',
              'accelerator_model': accelerator_model,
              'xla': xla,
              'xla_flags': None,
              'dynamo': dynamo,
              'test': 'eval',
              'batch_size': args.batch_size,
          },
          'repeat': args.repeat,
          'iterations_per_run': 1,
          'metrics': metrics,
          'timestamp': timestamp,
      }

      with open(results_file, mode="a", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False)
        f.write("\n")


def main():
  """Validate the workspace and venv, set up the repo and run the benchmarks."""
  args = parse_args()
  args.workspace_dir = os.path.expanduser(args.workspace_dir)
  if not os.path.exists(args.workspace_dir):
    sys.exit(f"Workspace directory {args.workspace_dir} not found.")

  # Sanity check: we should already be inside the appropriate venv.
  workspace_dir = os.path.abspath(args.workspace_dir)
  if sys.prefix != os.path.join(workspace_dir, 'env'):
    sys.exit(
        "Error: must run under the Python venv from the given --workspace_dir.")

  results_dir = os.path.join(workspace_dir, 'experiment_results')
  accelerator_model, timestamp = get_info_from_result_file(results_dir)
  llama_dir = set_up_llama_repo(workspace_dir)

  run_benchmarks(args, llama_dir, results_dir, accelerator_model, timestamp)


if __name__ == "__main__":
  main()