From 80b06af7e991afcb2431fe01791712bca2a05cd1 Mon Sep 17 00:00:00 2001 From: Milan Kordic Date: Sun, 8 Dec 2024 22:05:15 -0500 Subject: [PATCH] Update README, instructions, and setup script for Llama 3.1 70B on LoudBox for v0 release --- README.md | 63 +++++--- models/demos/t3000/llama3_70b/README.md | 123 ++++++++++++--- models/demos/t3000/llama3_70b/setup_llama3.sh | 140 ++++++++++++++++++ 3 files changed, 284 insertions(+), 42 deletions(-) create mode 100644 models/demos/t3000/llama3_70b/setup_llama3.sh diff --git a/README.md b/README.md index 2765715fea1..f47aaeb927d 100644 --- a/README.md +++ b/README.md @@ -21,29 +21,34 @@ --- ## LLMs -| Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | Release | -|---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------| -| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | -| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 17.6 | 26 | 563.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | -| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | -| [LLaMA-3.1-8B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 202 | 28.6 | 23 | 28.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [LLaMA-3.2-1B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 90.8 | 160 | 90.8 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [LLaMA-3.2-3B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 112 | 49.1 | 60 | 49.1 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 97 | 14.6 | 26 | 3737.6 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc36](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc36) | -| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 230 | 14.6 | 33 | 467.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc33) | -| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -> **Last Update:** December 2, 2024 +| Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | TT-Metalium Release | vLLM Release | +|---------------------------------------------------------------|-------|----------------------------------------------------------|-----------|-------|-----------------|--------|---------------------------------------------------|---------------------------------------------------------------------------------------------------| +| [Falcon 7B (decode only)](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | | +| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 17.6 | 26 | 563.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | +| [Mistral 7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | | +| [Mamba 2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | | +| [Llama 3.1 8B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 202 | 28.6 | 23 | 28.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Llama 3.2 1B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 90.8 | 160 | 90.8 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Llama 3.2 3B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 112 | 49.1 | 60 | 49.1 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Falcon 7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 97 | 14.6 | 26 | 3737.6 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | +| [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc36](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc36) | [384f179](https://github.com/tenstorrent/vllm/tree/384f1790c3be16e1d1b10de07252be2e66d00935) | +| [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 230 | 14.6 | 33 | 467.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | +| [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc33) | | +| [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | + +> **Last Update:** December 7, 2024 +> > **Notes:** +> +> - ttft = time to first token | t/s/u = tokens/second/user | t/s = tokens/second; where t/s = t/s/u * batch. > - TP = Tensor Parallel, DP = Data Parallel; Defines parallelization factors across multiple devices. 
> - The reported LLM performance is for an input sequence length (number of rows filled in the KV cache) of 128 for all models except Mamba (which can accept any sequence length). > - The t/s/u reported is the throughput of the first token generated after prefill, i.e. 1 / inter token latency. ## CNNs + | Model | Batch | Hardware | fps | Target fps | Release | |-----------------------------------------------------------------------------|-------|----------------------------------------------------------|---------|------------|-------------| | [ResNet-50 (224x224)](./models/demos/grayskull/resnet50) | 20 | [e150](https://tenstorrent.com/hardware/grayskull) | 5,100 | 10,000 | | @@ -55,11 +60,11 @@ | [ViT (224x224)](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | | | [ViT (224x224)](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | | | [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | | -| [Yolo V4 (320x320)](./models/demos/yolov4) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 95 | 300 | | -| [Segformer Semantic Segmentation (512x512)](./models/demos/segformer) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 90 | 300 | | - +| [YOLOv4 (320x320)](./models/demos/yolov4) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 95 | 300 | | +| [SegFormer Semantic Segmentation (512x512)](./models/demos/segformer) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 90 | 300 | | ## NLPs + | Model | Batch | Hardware | sen/sec | Target sen/sec | Release | |-----------------------------------------------------|-------|----------------------------------------------------|---------|----------------|---------| | [BERT-Large](./models/demos/metal_BERT_large_11/) | 12 | [e150](https://tenstorrent.com/hardware/grayskull) | 370 | 410 | | @@ -68,9 +73,11 @@ | [Bloom](.models/demos/grayskull/functional_bloom) | | [e150](https://tenstorrent.com/hardware/grayskull) | 70 | | | ## Model Updates + For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md) ## TT-NN Tech Reports + - [Advanced Performance Optimizations for Models](./tech_reports/AdvancedPerformanceOptimizationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Dec 4th) - [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th) - [ViT Implementation in TT-NN on GS](./tech_reports/ViT-TTNN/vit.md) (updated Sept 22nd) @@ -78,8 +85,8 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/ - [YOLOv4 Implementation in TT-NN on WH](./tech_reports/YoloV4-TTNN/yolov4.md) (updated November 8th) ## Benchmarks -- [Matrix Multiply FLOPS on WH](./tech_reports/GEMM_FLOPS/GEMM_FLOPS.md) (updated November 13th) +- [Matrix Multiply FLOPS on WH](./tech_reports/GEMM_FLOPS/GEMM_FLOPS.md) (updated November 13th) --- @@ -89,7 +96,6 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/ **TT-Metalium** is our low-level programming model, enabling kernel development for Tenstorrent hardware. -

 [Programming Guide](./METALIUM_GUIDE.md) | [API Reference](https://docs.tenstorrent.com/tt-metalium/latest/tt_metal/apis/index.html)
@@ -102,6 +108,7 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/
 Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/latest/tt_metal/examples/index.html).

 ## TT-Metalium Tech Reports
+
 - [Matrix Engine](./tech_reports/matrix_engine/matrix_engine.md) (updated Sept 6th)
 - [Data Formats](./tech_reports/data_formats/data_formats.md) (updated Sept 7th)
 - [Reconfiguring Data Formats](./tech_reports/data_formats/reconfig_data_format.md) (updated Oct 17th)
@@ -113,24 +120,36 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/lates
 - [CNNs on TT Architectures](./tech_reports/CNNs/ttcnn.md) (updated Sept 6th)
 - [Ethernet and Multichip Basics](./tech_reports/EthernetMultichip/BasicEthernetGuide.md) (Updated Sept 20th)
 - [Collective Communication Library (CCL)](./tech_reports/EthernetMultichip/CclDeveloperGuide.md) (Updated Sept 20th)
-- [Blackhole Bring-Up Prgramming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Oct 30th)
+- [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Oct 30th)

 ## TT-Metalium Programming Examples
+
 ### Hello World
+
 - [Hello World! Compute Kernel](./tech_reports/prog_examples/hello_world_compute/hello_world_compute.md)
 - [Hello World! Data Movement Kernel](./tech_reports/prog_examples/hello_world_data_movement/hello_world_data_movement.md)
+
 ### Add Integers
+
 - [Add 2 Integers in Baby RiscV](./tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md)
 - [Add 2 Integers in Compute Kernel](./tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md)
+
 ### Simple Tensor Manipulation
+
 - [Sharding](./tech_reports/prog_examples/shard_data_rm/shard_data_rm.md)
 - [Padding](./tech_reports/prog_examples/pad_multi_core/pad_multi_core.md)
+
 ### DRAM Data Movement
+
 - [Dram Loopback Data Movement](./tech_reports/prog_examples/dram_loopback/dram_loopback.md)
+
 ### Eltwise
+
 - [Eltwise Unary OP in Vector Engine (SFPU)](./tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md)
 - [Eltwise Binary OP in Matrix Engine (FPU)](./tech_reports/prog_examples/eltwise_binary/eltwise_binary.md)
+
 ### Matmul
+
 - [Matmul OP on a Single_core](./tech_reports/prog_examples/matmul_single_core/matmul_single_core.md)
 - [Matmul OP on Multi_core (Basic)](./tech_reports/prog_examples/matmul_multi_core/matmul_multi_core.md)
 - [Matmul Multi_core Reuse (Optimized)](./tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md)
diff --git a/models/demos/t3000/llama3_70b/README.md b/models/demos/t3000/llama3_70b/README.md
index 6555cd36dbf..66c44157e55 100644
--- a/models/demos/t3000/llama3_70b/README.md
+++ b/models/demos/t3000/llama3_70b/README.md
@@ -1,32 +1,74 @@
-# Llama3-70B Demo
+# Llama3/3.1-70B Demo
+
+## Table of Contents
+
+- [One command run](#one-command-run)
+- [How to Run](#how-to-run)
+  - [Running the demo from TT-Metalium](#running-the-demo-from-tt-metalium)
+  - [Serving the model from vLLM](#serving-the-model-from-vllm)
+
+## One command run
+
+```bash
+chmod +x ./models/demos/t3000/llama3_70b/setup_llama3.sh && ./models/demos/t3000/llama3_70b/setup_llama3.sh <TT_METAL_COMMIT_SHA_OR_TAG> <TT_VLLM_COMMIT_SHA_OR_TAG>
+```
+
+Here, `<TT_METAL_COMMIT_SHA_OR_TAG>` and `<TT_VLLM_COMMIT_SHA_OR_TAG>` are the TT-Metalium and vLLM "Release" versions, respectively, listed in the root [README](/README.md#llms).
+
+Example:
+
+```bash
+./models/demos/t3000/llama3_70b/setup_llama3.sh v0.53.0-rc36 384f1790c3be16e1d1b10de07252be2e66d00935
+```
+
+Follow the prompts in the CLI to select the appropriate weights for Llama 3.1 70B Instruct.
+
+Prerequisites:
+
+- Submit a request for access to the weights from Meta: [Llama Downloads](https://www.llama.com/llama-downloads)
+- Request access on HuggingFace and have an HF personal access token ready: [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
+
+The script performs the following steps:
+
+- Set up the environment
+- Build `tt-metal`
+- Download the Llama 3.1 70B Instruct weights
+- Install vLLM
+- Deploy the vLLM server

 ## How to Run

-1. **Download the Llama3-70B weights from Meta (https://llama.meta.com/):**
+Note: This guide requires `tt-metal` to be installed and built. Please refer to the [installation instructions](/INSTALLING.md) for the release version listed in the root [README](/README.md#llms).
+
+1. **Download the Llama3/3.1-70B weights from Meta (https://llama.meta.com/):**

 2. **Repack the weights:**
+
    ```bash
    # This concatenates the sharded checkpoints and makes it easier for us to load.
    python models/demos/t3000/llama2_70b/scripts/repack_weights.py <path_to_checkpoint_dir> <repacked_output_dir> <chunk_size>
    ```
+
    Note: Use `5` for `chunk_size`.

    Once the weights are repacked, move the `params.json` file from the `checkpoint_dir` to the `repacked_output_dir`.

-### Running the Demo
+### Running the demo from TT-Metalium

 After setting up the repacked weights and tokenizer, you can run the demo using the commands below:

 1. **Prepare the weight cache directory:**
+
    ```bash
    # Make a directory for us to cache weights into. This speeds up subsequent runs.
    mkdir <weight_cache_dir>
    ```

 2. **Set up environment variables:**
+
    ```bash
    export LLAMA3_CKPT_DIR=<repacked_output_dir>
-   export LLAMA3_TOKENIZER_PATH=  # Path needs to include the tokenizer.model file
+   export LLAMA3_TOKENIZER_PATH=<repacked_output_dir>/tokenizer.model  # Path needs to include the tokenizer.model file
    export LLAMA3_CACHE_PATH=<weight_cache_dir>

    export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
@@ -38,13 +80,11 @@ After setting up the repacked weights and tokenizer, you can run the demo using
    # export LLAMA3_CKPT_DIR="/home/llama-data-repacked/llama-3-70b/"
    # export LLAMA3_TOKENIZER_PATH="/home/llama-data-repacked/tokenizer.model"
    # export LLAMA3_CACHE_PATH="/home/llama-data-cache/weights-cache"
-
-
    ```

 3. **Run the demo:**

-   NOTE: Run the following comand twice.
+   Note: Run the following command twice.
+
    1. The first run will cache the weights. This will take some time.
    2. The second run will use the cached weights, thereby running much faster.
@@ -58,31 +98,74 @@ After setting up the repacked weights and tokenizer, you can run the demo using
 The above demo does not achieve peak performance because we log outputs to the screen. The following perf test will print an accurate end-to-end throughput number. For best performance, ensure that tt-metal is built in release mode (default), and ensure the host's CPU frequency governors are set to `performance` -- instructions for setting the frequency governor vary by machine. This performance test runs with sequence length 128 and batch size 32.
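+
+For example, on many Linux hosts the governor can be set with the `cpupower` utility or directly through sysfs. The snippet below is an illustrative sketch only (the exact tooling varies by distribution, and root privileges are required):
+
+```bash
+# Set every core's scaling governor to `performance` (requires cpupower,
+# usually packaged as linux-tools).
+sudo cpupower frequency-set -g performance
+
+# Alternative: write the governor directly through sysfs.
+echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+```
+
+With the governor set, run the perf test: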
+
 ```bash
 pytest -svv models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py::test_Llama_perf_host[wormhole_b0-True-device_params0-gen128-llama3]
 ```

-## Details
+#### Details

 Supported context lengths and batch sizes for the Llama3.1-70B demo are as follows:

 | Context Length | Max Batch Size |
-|----------------|------------|
-| 2k | 32 |
-| 8k | 16 |
-| 128k | 1 |
+|----------------|----------------|
+| 2k             | 32             |
+| 8k             | 16             |
+| 128k           | 1              |

 - **Input File:** Uses `./demo/data/multi_prompt.json`.
 - **Model Configuration:** Utilizes a pretrained model.
 - **Hardware Requirements:** Runs on an 8-chip T3000 machine using tensor parallelism. The host machine must have at least 512 GB of memory.
 - **Demo arguments:**
-  - `context: [short_context, long_context, 128k_context]`: Select between short context (batch 32, sequence_length 2k) and long context (batch 16, sequence length 8k) and full context (batch 1, sequence length 128k)
-  - `ground_truth: [check_disabled, check_enabled]`: Enable or disable ground truth checking, used for testing
-  - `sampling: [greedy, sampling]`: Select between greedy decoding and top-k/top-p sampling
-  - `implementation: [tt-70b-T3000]`: Run the 70B model on the Tenstorrent backend
-  - `num_layers: [1L, 2L, 10L, 80L]`: Select 80L to run the full model
-  - `decode_only: [decode_only, prefill_decode]`: Use `prefill_decode`. Alternately, `decode_only` implements prefill via decode.
-  - `chat: [text_completion, chat_completion]`: Run in text_completion mode for the pretrained model or chat_completion for the finetuned model
-  - `llama_version: [llama3, llama2]`: Select the Llama3 model
+  - `context: [short_context, long_context, 128k_context]`: Select between short context (batch 32, sequence length 2k), long context (batch 16, sequence length 8k), and full context (batch 1, sequence length 128k)
+  - `ground_truth: [check_disabled, check_enabled]`: Enable or disable ground-truth checking, used for testing
+  - `sampling: [greedy, sampling]`: Select between greedy decoding and top-k/top-p sampling
+  - `implementation: [tt-70b-T3000]`: Run the 70B model on the Tenstorrent backend
+  - `num_layers: [1L, 2L, 10L, 80L]`: Select 80L to run the full model
+  - `decode_only: [decode_only, prefill_decode]`: Use `prefill_decode`. Alternatively, `decode_only` implements prefill via decode.
+  - `chat: [text_completion, chat_completion]`: Run in `text_completion` mode for the pretrained model or `chat_completion` for the fine-tuned model
+  - `llama_version: [llama3, llama2]`: Select the Llama3 model

 Ensure you follow these guidelines to successfully run the Llama3-70B demo.
+
+### Serving the model from vLLM
+
+1. Complete Step 1 and Step 2 of [Running the demo from TT-Metalium](#running-the-demo-from-tt-metalium).
+
+2. **Install vLLM**
+
+   ```bash
+   # Installing from within `tt-metal`
+   export VLLM_TARGET_DEVICE="tt"
+   git clone https://github.com/tenstorrent/vllm.git
+   cd vllm
+   git checkout <TT_VLLM_COMMIT_SHA_OR_TAG>
+   pip install -e .
+   cd ..
+   ```
+
+   > **Note:** `<TT_VLLM_COMMIT_SHA_OR_TAG>` is the vLLM "Release" version from the root [README](/README.md#llms).
+
+3. **Running the server**
+
+   ```bash
+   python vllm/examples/server_example_tt.py
+   ```
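+
+   The example server exposes an OpenAI-compatible HTTP API on port 8000. As a quick sanity check (an optional extra step; the endpoint is assumed from the OpenAI-compatible completions API used in the next step), you can list the served models from another terminal:
+
+   ```bash
+   curl http://localhost:8000/v1/models
+   ```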
+
+4. **Interact with server**
+
+   In a separate terminal window, run:
+
+   ```bash
+   curl http://localhost:8000/v1/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+       "model": "meta-llama/Meta-Llama-3.1-70B",
+       "prompt": "Write a poem about RISC-V",
+       "max_tokens": 128,
+       "temperature": 1,
+       "top_p": 0.9,
+       "top_k": 10,
+       "stream": false
+     }'
+   ```
diff --git a/models/demos/t3000/llama3_70b/setup_llama3.sh b/models/demos/t3000/llama3_70b/setup_llama3.sh
new file mode 100644
index 00000000000..e3e042849a6
--- /dev/null
+++ b/models/demos/t3000/llama3_70b/setup_llama3.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: Ā© 2024 Tenstorrent AI ULC
+#
+# Purpose: Set up and deploy the Llama 3.1 70B Instruct model with its dependencies.
+# Note: Run this script from the root of the tt-metal repository.
+
+set -euo pipefail
+
+# Model information
+MODEL=llama-3.1-70b-instruct
+META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct"
+META_DIR_FILTER="llama3_1"
+
+# Require a commit SHA or tag for tt-metal and vLLM
+TT_METAL_COMMIT_SHA_OR_TAG=${1:-""}
+TT_VLLM_COMMIT_SHA_OR_TAG=${2:-""}
+
+# Ensure required arguments are passed
+if [[ -z "${TT_METAL_COMMIT_SHA_OR_TAG}" || -z "${TT_VLLM_COMMIT_SHA_OR_TAG}" ]]; then
+    echo "āŒ Error: Both TT_METAL_COMMIT_SHA_OR_TAG and TT_VLLM_COMMIT_SHA_OR_TAG are required."
+    echo "Usage: $0 <TT_METAL_COMMIT_SHA_OR_TAG> <TT_VLLM_COMMIT_SHA_OR_TAG>"
+    exit 1
+fi
+
+# Default paths
+DEFAULT_PERSISTENT_VOLUME_ROOT=~/persistent_volume
+DEFAULT_LLAMA_REPO=~/llama-models
+
+# Helper functions
+error_exit() {
+    echo "ā›” Error: $1" >&2
+    exit 1
+}
+
+print_step() {
+    echo -e "\nšŸ‘‰ $1...\n"
+}
+
+setup_environment() {
+    print_step "Setting up environment"
+    export LLAMA3_CKPT_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_weights/repacked-${MODEL}"
+    export LLAMA3_TOKENIZER_PATH="${LLAMA3_CKPT_DIR}/tokenizer.model"
+    export LLAMA3_CACHE_PATH="${DEFAULT_PERSISTENT_VOLUME_ROOT}/tt_metal_cache/cache_repacked-${MODEL}"
+    export ARCH_NAME=wormhole_b0
+    # Assumes the script is invoked from the tt-metal repository root.
+    export TT_METAL_HOME=$(pwd)
+    export PYTHONPATH=$(pwd)
+    echo "Environment variables set."
+}
+
+check_and_build_tt_metal() {
+    print_step "Checking and building tt-metal"
+    pushd "${TT_METAL_HOME}" >/dev/null
+    if [[ ! -d "python_env" ]]; then
+        git checkout "${TT_METAL_COMMIT_SHA_OR_TAG}"
+        git submodule update --init --recursive
+        git submodule foreach 'git lfs fetch --all && git lfs pull'
+        ./build_metal.sh
+        ./create_venv.sh
+        source python_env/bin/activate
+        pip install -r models/demos/t3000/llama2_70b/reference/llama/requirements.txt
+    else
+        echo "šŸ”” tt-metal Python environment already exists. Skipping build."
+        source python_env/bin/activate
+    fi
+    popd >/dev/null
+}
+
+clone_repo() {
+    local REPO_PATH=$1
+    local REPO_URL=$2
+    local COMMIT_HASH=$3
+
+    print_step "Cloning Llama repository"
+    if [[ ! -d "${REPO_PATH}" ]]; then
+        git clone "${REPO_URL}" "${REPO_PATH}"
+        pushd "${REPO_PATH}" >/dev/null
+        git checkout "${COMMIT_HASH}"
+        popd >/dev/null
+    else
+        echo "šŸ”” Repository already exists at ${REPO_PATH}, skipping clone."
+    fi
+}
+
+setup_weights() {
+    print_step "Setting up weights"
+    local LLAMA_REPO=$1
+    local LLAMA_DIR="${LLAMA_REPO}/models/${META_DIR_FILTER}"
+    local LLAMA_WEIGHTS_DIR="${LLAMA_DIR}/${META_MODEL_NAME}"
+    local WEIGHTS_DIR="${LLAMA3_CKPT_DIR}"
+
+    mkdir -p "${WEIGHTS_DIR}" "${LLAMA3_CACHE_PATH}"
+
+    if [[ -d "${LLAMA_WEIGHTS_DIR}" && -n "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]]; then
+        echo "Weights already downloaded in ${LLAMA_WEIGHTS_DIR}"
+    else
+        print_step "Downloading weights"
+        pushd "${LLAMA_DIR}" >/dev/null
+        # Run Meta's interactive download script; report a clear error if it is
+        # missing. (An `A && B || C` one-liner would also run C when B fails,
+        # mislabeling a failed download as "script not found".)
+        if [[ -x "./download.sh" ]]; then
+            ./download.sh
+        else
+            error_exit "Download script not found!"
+        fi
+        popd >/dev/null
+    fi
+
+    print_step "Repacking weights"
+    source python_env/bin/activate
+    # Requires a Hugging Face personal access token (see the README prerequisites).
+    huggingface-cli login
+    cp "${LLAMA_WEIGHTS_DIR}/tokenizer.model" "${WEIGHTS_DIR}/tokenizer.model"
+    cp "${LLAMA_WEIGHTS_DIR}/params.json" "${WEIGHTS_DIR}/params.json"
+    # Repack with chunk_size 5, matching the README instructions.
+    python models/demos/t3000/llama2_70b/scripts/repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5
+}
+
+install_vllm() {
+    print_step "Installing vLLM"
+    if [[ ! -d "vllm" ]]; then
+        source python_env/bin/activate
+        export VLLM_TARGET_DEVICE="tt"
+        git clone https://github.com/tenstorrent/vllm.git
+        pushd vllm >/dev/null
+        git checkout "${TT_VLLM_COMMIT_SHA_OR_TAG}"
+        pip install -e .
+        popd >/dev/null
+    else
+        echo "šŸ”” vLLM already installed. Skipping install."
+    fi
+}
+
+deploy_server() {
+    print_step "Deploying Llama server"
+    source python_env/bin/activate
+    export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml
+    # The server runs in the foreground, so print the endpoint before starting it.
+    echo "āœ… Starting vLLM server. Interact via http://localhost:8000 once it reports ready."
+    python vllm/examples/server_example_tt.py
+}
+
+# ---- MAIN ----
+setup_environment
+check_and_build_tt_metal
+clone_repo "${DEFAULT_LLAMA_REPO}" "https://github.com/meta-llama/llama-models.git" "685ac4c107c75ce8c291248710bf990a876e1623"
+setup_weights "${DEFAULT_LLAMA_REPO}"
+install_vllm
+deploy_server