From 55497dff801a906b4587f99d260eb128b1ea3552 Mon Sep 17 00:00:00 2001
From: Preston Badeer <467756+pbadeer@users.noreply.github.com>
Date: Fri, 26 Jan 2024 15:38:17 -0600
Subject: [PATCH] Correct small typos in vllm_mixtral.py (#563)

Co-authored-by: Preston Badeer <467756+pbadeer@users.noreply.github.com>
Co-authored-by: Eric Zhang
---
 06_gpu_and_ml/vllm_mixtral.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/06_gpu_and_ml/vllm_mixtral.py b/06_gpu_and_ml/vllm_mixtral.py
index cdfbc573c..f28caa750 100644
--- a/06_gpu_and_ml/vllm_mixtral.py
+++ b/06_gpu_and_ml/vllm_mixtral.py
@@ -7,7 +7,7 @@
 # walks through setting up an environment that works with `vLLM ` for basic inference.
 #
 # We are running the [Mixtral 8x7B Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) model here, which is a mixture-of-experts model finetuned for conversation.
-# You can expect 3 minute second cold starts
+# You can expect 3 minute cold starts.
 # For a single request, the throughput is about 11 tokens/second, but there are upcoming `vLLM` optimizations to improve this.
 # The larger the batch of prompts, the higher the throughput (up to about 300 tokens/second).
 # For example, with the 60 prompts below, we can produce 30k tokens in 100 seconds.