From dd2287de4e364bedf1ada01112737111de771047 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 13 Dec 2023 20:23:36 +0000
Subject: [PATCH] vllm mixtral

---
 06_gpu_and_ml/vllm_mixtral.py | 206 ++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 06_gpu_and_ml/vllm_mixtral.py

diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
new file mode 100644
index 000000000..efdd1e532
--- /dev/null
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -0,0 +1,206 @@
+# # Fast inference with vLLM (Mixtral 8x7B)
+#
+# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
+# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
+#
+# `vLLM` also supports a use case as a FastAPI server, which we will explore in a future guide. This example
+# walks through setting up an environment that works with `vLLM` for basic inference.
+#
+# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, which is a mixture-of-experts model finetuned for conversation.
+# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
+# For example, with the 60 prompts below, we can produce 19k tokens in 15 seconds, which is around 1.25k tokens/second.
+#
+# To run
+# [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html),
+# simply replace the model name in the download step. You may also need to enable `trust_remote_code` for MPT models (see comment below).
+#
+# ## Setup
+#
+# First we import the components we need from `modal`.
import os
import time

from modal import Image, Stub, gpu, method

# Where the weights live inside the image, and which model to fetch.
MODEL_DIR = "/model"
BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Run on two 80 GB A100s.
GPU_CONFIG = gpu.A100(memory=80, count=2)


# ## Define a container image
#
# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
# advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
#
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
def download_model_to_folder():
    """Download the BASE_MODEL weights from the Hugging Face Hub into MODEL_DIR."""
    from huggingface_hub import snapshot_download

    os.makedirs(MODEL_DIR, exist_ok=True)
    snapshot_download(
        repo_id=BASE_MODEL,
        local_dir=MODEL_DIR,
        # vLLM doesn't support Mixtral safetensors anyway, so skip them.
        ignore_patterns="*.safetensors",
    )


# ### Image definition
# We’ll start from a Dockerhub image recommended by `vLLM`, and use
# run_function to run the function defined above to ensure the weights of
# the model are saved within the container image.
# Pin the vLLM base image by digest so rebuilds are reproducible.
VLLM_HASH = "89523c8293bc02a4dfaaa80079a5347dc3952464a33a501d5de329921eea7ec7"

image = (
    Image.from_registry(
        f"vllm/vllm-openai@sha256:{VLLM_HASH}",
        setup_dockerfile_commands=[
            # NOTE(review): `apt-get install` without `-y` may hang or fail if a
            # confirmation prompt appears during the non-interactive build — confirm.
            "RUN apt-get install python-is-python3",
            # Move vLLM's code out of /workspace into /root, where Modal mounts code.
            "RUN mv /workspace/* /root",
        ],
    )
    # Clear the upstream image's OpenAI-server entrypoint so Modal controls startup.
    .dockerfile_commands("ENTRYPOINT []")
    .pip_install("huggingface_hub==0.19.4", "hf-transfer==0.1.4")
    # hf-transfer speeds up the weight download inside run_function below.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Bake the weights into the image; allow up to 20 minutes for the download.
    .run_function(download_model_to_folder, timeout=60 * 20)
)

stub = Stub("example-vllm-inference", image=image)


# ## The model class
#
# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `__enter__` method.
# This enables us to load the model into memory just once every time a container starts up, and keep it cached
# on the GPU for each subsequent invocation of the function.
#
# The `vLLM` library allows the code to remain quite clean. There are, however, some
# outstanding issues and performance improvements that we patch here, such as multi-GPU setup and
# suboptimal Ray CPU pinning.
@stub.cls(gpu=GPU_CONFIG, timeout=60 * 10, container_idle_timeout=60 * 10)
class Model:
    """Container-lifetime wrapper around a vLLM engine for Mixtral 8x7B.

    The engine is loaded once per container in ``__enter__`` and reused by
    every subsequent ``generate`` call.
    """

    def __enter__(self):
        """Load the vLLM engine (and apply multi-GPU Ray workarounds) at container start."""
        import subprocess

        from vllm import LLM

        if GPU_CONFIG.count > 1:
            # Patch issue from https://github.com/vllm-project/vllm/issues/1116:
            # re-initialize Ray with the GPU count before vLLM spawns its workers.
            import ray

            ray.shutdown()
            ray.init(num_gpus=GPU_CONFIG.count)

        # Load the model. Tip: MPT models may require `trust_remote_code=true`.
        self.llm = LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
        # Mistral-instruct chat template; `{user}` is filled per prompt in generate().
        self.template = " [INST] {user} [/INST] "

        if GPU_CONFIG.count > 1:
            # Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529:
            # pin each ray:: worker process to its own CPU core.
            RAY_CORE_PIN_OVERRIDE = "cpuid=0 ; for pid in $(ps xo '%p %c' | grep ray:: | awk '{print $1;}') ; do taskset -cp $cpuid $pid ; cpuid=$(($cpuid + 1)) ; done"
            subprocess.call(RAY_CORE_PIN_OVERRIDE, shell=True)

    @method()
    def generate(self, user_questions: list[str]) -> None:
        """Batch-generate completions for *user_questions* and print them.

        Each question is wrapped in the instruct template, sampled with fixed
        parameters, and printed along with a tokens/second summary. Returns
        nothing; output goes to stdout (visible in Modal logs).
        """
        from vllm import SamplingParams

        prompts = [self.template.format(user=q) for q in user_questions]

        sampling_params = SamplingParams(
            temperature=0.75,
            top_p=1,
            max_tokens=800,
            presence_penalty=1.15,
        )

        t0 = time.time()
        result = self.llm.generate(prompts, sampling_params)
        num_tokens = 0
        for output in result:
            # Count only generated (output) tokens for the throughput figure.
            num_tokens += len(output.outputs[0].token_ids)
            print(output.prompt, output.outputs[0].text, "\n\n", sep="")

        print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s")


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# sequentially for a list of inputs. You can run this locally with `modal run vllm_mixtral.py`.
@stub.local_entrypoint()
def main():
    """Send one large batch of prompts through the remote Mixtral model.

    The prompts are grouped by category purely for readability; they are
    concatenated into a single list and submitted in one `generate` call.
    """
    coding_questions = [
        "Implement a Python function to compute the Fibonacci numbers.",
        "Write a Rust function that performs binary exponentiation.",
        "How do I allocate memory in C?",
        "What are the differences between Javascript and Python?",
        "How do I find invalid indices in Postgres?",
        "How can you implement a LRU (Least Recently Used) cache in Python?",
        "What approach would you use to detect and prevent race conditions in a multithreaded application?",
        "Can you explain how a decision tree algorithm works in machine learning?",
        "How would you design a simple key-value store database from scratch?",
        "How do you handle deadlock situations in concurrent programming?",
        "What is the logic behind the A* search algorithm, and where is it used?",
        "How can you design an efficient autocomplete system?",
        "What approach would you take to design a secure session management system in a web application?",
        "How would you handle collision in a hash table?",
        "How can you implement a load balancer for a distributed system?",
    ]
    literature_questions = [
        "What is the fable involving a fox and grapes?",
        "Write a story in the style of James Joyce about a trip to the Australian outback in 2083, to see robots in the beautiful desert.",
        "Who does Harry turn into a balloon?",
        "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.",
        "Describe a day in the life of a secret agent who's also a full-time parent.",
        "Create a story about a detective who can communicate with animals.",
        "What is the most unusual thing about living in a city floating in the clouds?",
        "In a world where dreams are shared, what happens when a nightmare invades a peaceful dream?",
        "Describe the adventure of a lifetime for a group of friends who found a map leading to a parallel universe.",
        "Tell a story about a musician who discovers that their music has magical powers.",
        "In a world where people age backwards, describe the life of a 5-year-old man.",
        "Create a tale about a painter whose artwork comes to life every night.",
        "What happens when a poet's verses start to predict future events?",
        "Imagine a world where books can talk. How does a librarian handle them?",
        "Tell a story about an astronaut who discovered a planet populated by plants.",
        "Describe the journey of a letter traveling through the most sophisticated postal service ever.",
        "Write a tale about a chef whose food can evoke memories from the eater's past.",
    ]
    history_questions = [
        "What were the major contributing factors to the fall of the Roman Empire?",
        "How did the invention of the printing press revolutionize European society?",
        "What are the effects of quantitative easing?",
        "How did the Greek philosophers influence economic thought in the ancient world?",
        "What were the economic and philosophical factors that led to the fall of the Soviet Union?",
        "How did decolonization in the 20th century change the geopolitical map?",
        "What was the influence of the Khmer Empire on Southeast Asia's history and culture?",
    ]
    thoughtfulness_questions = [
        "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.",
        "In a dystopian future where water is the most valuable commodity, how would society function?",
        "If a scientist discovers immortality, how could this impact society, economy, and the environment?",
        "What could be the potential implications of contact with an advanced alien civilization?",
    ]
    math_questions = [
        "What is the product of 9 and 8?",
        "If a train travels 120 kilometers in 2 hours, what is its average speed?",
        "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.",
        "Think through this step by step. Calculate the sum of an arithmetic series with first term 3, last term 35, and total terms 11.",
        "Think through this step by step. What is the area of a triangle with vertices at the points (1,2), (3,-4), and (-2,5)?",
        "Think through this step by step. Solve the following system of linear equations: 3x + 2y = 14, 5x - y = 15.",
    ]
    facts_questions = [
        "Who was Emperor Norton I, and what was his significance in San Francisco's history?",
        "What is the Voynich manuscript, and why has it perplexed scholars for centuries?",
        "What was Project A119 and what were its objectives?",
        "What is the 'Dyatlov Pass incident' and why does it remain a mystery?",
        "What is the 'Emu War' that took place in Australia in the 1930s?",
        "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?",
        "Who was the 'Green Children of Woolpit' as per 12th-century English legend?",
        "What are 'zombie stars' in the context of astronomy?",
        "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
        "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
    ]

    questions = (
        coding_questions
        + literature_questions
        + history_questions
        + thoughtfulness_questions
        + math_questions
        + facts_questions
    )

    Model().generate.remote(questions)