From dd2287de4e364bedf1ada01112737111de771047 Mon Sep 17 00:00:00 2001
From: Richard Gong
Date: Wed, 13 Dec 2023 20:23:36 +0000
Subject: [PATCH] vllm mixtral

---
 06_gpu_and_ml/vllm_mixtral.py | 206 ++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 06_gpu_and_ml/vllm_mixtral.py

diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
new file mode 100644
index 000000000..efdd1e532
--- /dev/null
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -0,0 +1,206 @@
+# # Fast inference with vLLM (Mixtral 8x7B)
+#
+# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
+# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
+#
+# `vLLM` also supports a use case as a FastAPI server, which we will explore in a future guide. This example
+# walks through setting up an environment that works with `vLLM` for basic inference.
+#
+# We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, which is a mixture-of-experts model finetuned for conversation.
+# You can expect 20 second cold starts and well over 100 tokens/second. The larger the batch of prompts, the higher the throughput.
+# For example, with the 60 prompts below, we can produce 19k tokens in 15 seconds, which is around 1.25k tokens/second.
+#
+# To run
+# [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html),
+# simply replace the model name in the download step. You may also need to enable `trust_remote_code` for MPT models (see comment below).
+#
+# ## Setup
+#
+# First we import the components we need from `modal`.
import os
import time

from modal import Image, Stub, gpu, method

# Where the weights live inside the image, and which model to fetch.
MODEL_DIR = "/model"
BASE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Run on two 80 GB A100s.
GPU_CONFIG = gpu.A100(memory=80, count=2)


# ## Define a container image
#
# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
# advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
#
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
def download_model_to_folder():
    """Download the BASE_MODEL weights from the Hugging Face Hub into MODEL_DIR."""
    from huggingface_hub import snapshot_download

    os.makedirs(MODEL_DIR, exist_ok=True)
    snapshot_download(
        repo_id=BASE_MODEL,
        local_dir=MODEL_DIR,
        # vLLM doesn't support Mixtral safetensors anyway, so skip them.
        ignore_patterns="*.safetensors",
    )


# ### Image definition
# We’ll start from a Dockerhub image recommended by `vLLM`, and use
# run_function to run the function defined above to ensure the weights of
# the model are saved within the container image.
# Pin the vLLM base image by digest so rebuilds are reproducible.
VLLM_HASH = "89523c8293bc02a4dfaaa80079a5347dc3952464a33a501d5de329921eea7ec7"

image = (
    Image.from_registry(
        f"vllm/vllm-openai@sha256:{VLLM_HASH}",
        setup_dockerfile_commands=[
            # NOTE(review): `apt-get install` without `-y` may hang or fail if a
            # confirmation prompt appears during the non-interactive build — confirm.
            "RUN apt-get install python-is-python3",
            # Move vLLM's code out of /workspace into /root, where Modal mounts code.
            "RUN mv /workspace/* /root",
        ],
    )
    # Clear the upstream image's OpenAI-server entrypoint so Modal controls startup.
    .dockerfile_commands("ENTRYPOINT []")
    .pip_install("huggingface_hub==0.19.4", "hf-transfer==0.1.4")
    # hf-transfer speeds up the weight download inside run_function below.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Bake the weights into the image; allow up to 20 minutes for the download.
    .run_function(download_model_to_folder, timeout=60 * 20)
)

stub = Stub("example-vllm-inference", image=image)


# ## The model class
#
# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `__enter__` method.
# This enables us to load the model into memory just once every time a container starts up, and keep it cached
# on the GPU for each subsequent invocation of the function.
#
# The `vLLM` library allows the code to remain quite clean. There are, however, some
# outstanding issues and performance improvements that we patch here, such as multi-GPU setup and
# suboptimal Ray CPU pinning.
@stub.cls(gpu=GPU_CONFIG, timeout=60 * 10, container_idle_timeout=60 * 10)
class Model:
    """Container-lifetime wrapper around a vLLM engine for Mixtral 8x7B.

    The engine is loaded once per container in ``__enter__`` and reused by
    every subsequent ``generate`` call.
    """

    def __enter__(self):
        """Load the vLLM engine (and apply multi-GPU Ray workarounds) at container start."""
        import subprocess

        from vllm import LLM

        if GPU_CONFIG.count > 1:
            # Patch issue from https://github.com/vllm-project/vllm/issues/1116:
            # re-initialize Ray with the GPU count before vLLM spawns its workers.
            import ray

            ray.shutdown()
            ray.init(num_gpus=GPU_CONFIG.count)

        # Load the model. Tip: MPT models may require `trust_remote_code=true`.
        self.llm = LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count)
        # Mistral-instruct chat template; `{user}` is filled per prompt in generate().
        self.template = " [INST] {user} [/INST] "

        if GPU_CONFIG.count > 1:
            # Performance improvement from https://github.com/vllm-project/vllm/issues/2073#issuecomment-1853422529:
            # pin each ray:: worker process to its own CPU core.
            RAY_CORE_PIN_OVERRIDE = "cpuid=0 ; for pid in $(ps xo '%p %c' | grep ray:: | awk '{print $1;}') ; do taskset -cp $cpuid $pid ; cpuid=$(($cpuid + 1)) ; done"
            subprocess.call(RAY_CORE_PIN_OVERRIDE, shell=True)

    @method()
    def generate(self, user_questions: list[str]) -> None:
        """Batch-generate completions for *user_questions* and print them.

        Each question is wrapped in the instruct template, sampled with fixed
        parameters, and printed along with a tokens/second summary. Returns
        nothing; output goes to stdout (visible in Modal logs).
        """
        from vllm import SamplingParams

        prompts = [self.template.format(user=q) for q in user_questions]

        sampling_params = SamplingParams(
            temperature=0.75,
            top_p=1,
            max_tokens=800,
            presence_penalty=1.15,
        )

        t0 = time.time()
        result = self.llm.generate(prompts, sampling_params)
        num_tokens = 0
        for output in result:
            # Count only generated (output) tokens for the throughput figure.
            num_tokens += len(output.outputs[0].token_ids)
            print(output.prompt, output.outputs[0].text, "\n\n", sep="")

        print(f"Generated {num_tokens} tokens in {time.time() - t0:.2f}s")


# ## Run the model
# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
# sequentially for a list of inputs. You can run this locally with `modal run vllm_mixtral.py`.
@stub.local_entrypoint()
def main():
    """Send one large batch of prompts through the remote Mixtral model.

    The prompts are grouped by category purely for readability; they are
    concatenated into a single list and submitted in one `generate` call.
    """
    coding_questions = [
        "Implement a Python function to compute the Fibonacci numbers.",
        "Write a Rust function that performs binary exponentiation.",
        "How do I allocate memory in C?",
        "What are the differences between Javascript and Python?",
        "How do I find invalid indices in Postgres?",
        "How can you implement a LRU (Least Recently Used) cache in Python?",
        "What approach would you use to detect and prevent race conditions in a multithreaded application?",
        "Can you explain how a decision tree algorithm works in machine learning?",
        "How would you design a simple key-value store database from scratch?",
        "How do you handle deadlock situations in concurrent programming?",
        "What is the logic behind the A* search algorithm, and where is it used?",
        "How can you design an efficient autocomplete system?",
        "What approach would you take to design a secure session management system in a web application?",
        "How would you handle collision in a hash table?",
        "How can you implement a load balancer for a distributed system?",
    ]
    literature_questions = [
        "What is the fable involving a fox and grapes?",
        "Write a story in the style of James Joyce about a trip to the Australian outback in 2083, to see robots in the beautiful desert.",
        "Who does Harry turn into a balloon?",
        "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.",
        "Describe a day in the life of a secret agent who's also a full-time parent.",
        "Create a story about a detective who can communicate with animals.",
        "What is the most unusual thing about living in a city floating in the clouds?",
        "In a world where dreams are shared, what happens when a nightmare invades a peaceful dream?",
        "Describe the adventure of a lifetime for a group of friends who found a map leading to a parallel universe.",
        "Tell a story about a musician who discovers that their music has magical powers.",
        "In a world where people age backwards, describe the life of a 5-year-old man.",
        "Create a tale about a painter whose artwork comes to life every night.",
        "What happens when a poet's verses start to predict future events?",
        "Imagine a world where books can talk. How does a librarian handle them?",
        "Tell a story about an astronaut who discovered a planet populated by plants.",
        "Describe the journey of a letter traveling through the most sophisticated postal service ever.",
        "Write a tale about a chef whose food can evoke memories from the eater's past.",
    ]
    history_questions = [
        "What were the major contributing factors to the fall of the Roman Empire?",
        "How did the invention of the printing press revolutionize European society?",
        "What are the effects of quantitative easing?",
        "How did the Greek philosophers influence economic thought in the ancient world?",
        "What were the economic and philosophical factors that led to the fall of the Soviet Union?",
        "How did decolonization in the 20th century change the geopolitical map?",
        "What was the influence of the Khmer Empire on Southeast Asia's history and culture?",
    ]
    thoughtfulness_questions = [
        "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.",
        "In a dystopian future where water is the most valuable commodity, how would society function?",
        "If a scientist discovers immortality, how could this impact society, economy, and the environment?",
        "What could be the potential implications of contact with an advanced alien civilization?",
    ]
    math_questions = [
        "What is the product of 9 and 8?",
        "If a train travels 120 kilometers in 2 hours, what is its average speed?",
        "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.",
        "Think through this step by step. Calculate the sum of an arithmetic series with first term 3, last term 35, and total terms 11.",
        "Think through this step by step. What is the area of a triangle with vertices at the points (1,2), (3,-4), and (-2,5)?",
        "Think through this step by step. Solve the following system of linear equations: 3x + 2y = 14, 5x - y = 15.",
    ]
    facts_questions = [
        "Who was Emperor Norton I, and what was his significance in San Francisco's history?",
        "What is the Voynich manuscript, and why has it perplexed scholars for centuries?",
        "What was Project A119 and what were its objectives?",
        "What is the 'Dyatlov Pass incident' and why does it remain a mystery?",
        "What is the 'Emu War' that took place in Australia in the 1930s?",
        "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?",
        "Who was the 'Green Children of Woolpit' as per 12th-century English legend?",
        "What are 'zombie stars' in the context of astronomy?",
        "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
        "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
    ]

    questions = (
        coding_questions
        + literature_questions
        + history_questions
        + thoughtfulness_questions
        + math_questions
        + facts_questions
    )

    Model().generate.remote(questions)