diff --git a/06_gpu_and_ml/llm-serving/text_generation_inference.py b/06_gpu_and_ml/llm-serving/text_generation_inference.py
deleted file mode 100644
index 79a85d18b..000000000
--- a/06_gpu_and_ml/llm-serving/text_generation_inference.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# ---
-# tags: ["use-case-lm-inference"]
-# ---
-# # Hosting any LLaMA 3 model with Text Generation Inference (TGI)
-#
-# In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
-# with performance advantages over standard text generation pipelines including:
-# - continuous batching, so multiple generations can take place at the same time on a single container
-# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
-#
-# This example deployment, [accessible here](https://modal.chat), can serve LLaMA 3 70B with
-# 70 second cold starts, up to 200 tokens/s of throughput, and a per-token latency of 55ms.
-
-# ## Setup
-#
-# First we import the components we need from `modal`.
-
-import subprocess
-from pathlib import Path
-
-import modal
-
-# Next, we set which model to serve, taking care to specify the GPU configuration required
-# to fit the model into VRAM, and the quantization method (`bitsandbytes` or `gptq`) if desired.
-# Note that quantization does degrade token generation performance significantly.
-#
-# Any model supported by TGI can be chosen here.
-
-MODEL_ID = "NousResearch/Meta-Llama-3-8B"
-MODEL_REVISION = "315b20096dc791d381d514deb5f8bd9c8d6d3061"
-# Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
-LAUNCH_FLAGS = [
-    "--model-id",
-    MODEL_ID,
-    "--port",
-    "8000",
-    "--revision",
-    MODEL_REVISION,
-]
-
-# ## Define a container image
-#
-# We want to create a Modal Image which has the Huggingface model cache pre-populated.
-# The benefit of this is that the container no longer has to re-download the model from Huggingface -
-# instead, it will take advantage of Modal's internal filesystem for faster cold starts. On
-# the largest 70B model, the 135GB model can be loaded in as little as 70 seconds.
-#
-# ### Download the weights
-# We can use the included utilities to download the model weights (and convert to safetensors, if necessary)
-# as part of the image build.
-
-
-def download_model():
-    subprocess.run(
-        [
-            "text-generation-server",
-            "download-weights",
-            MODEL_ID,
-            "--revision",
-            MODEL_REVISION,
-        ],
-    )
-
-
-# ### Image definition
-# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
-# Modal to run its own which enables seamless serverless deployments.
-#
-# Next we run the download function above to pre-populate the image with our model weights.
-#
-# If you adapt this example to run another model,
-# note that for this step to work on a [gated model](https://github.com/huggingface/text-generation-inference#using-a-private-or-gated-model)
-# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets).
-#
-# Finally, we install the `text-generation` client to interface with TGI's Rust webserver over `localhost`.
-
-app = modal.App("example-tgi-" + MODEL_ID.split("/")[-1])
-
-tgi_image = (
-    modal.Image.from_registry(
-        "ghcr.io/huggingface/text-generation-inference:1.4"
-    )
-    .dockerfile_commands("ENTRYPOINT []")
-    .run_function(
-        download_model,
-        timeout=3600,
-    )
-    .pip_install("fastapi[standard]", "text-generation")
-)
-
-
-# ## The model class
-#
-# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).
-# The class syntax is a special representation for a Modal function which splits logic into two parts:
-# 1. the `@enter()` function, which runs once per container when it starts up, and
-# 2. the `@method()` function, which runs per inference request.
-#
-# This means the model is loaded into the GPUs, and the backend for TGI is launched just once when each
-# container starts, and this state is cached for each subsequent invocation of the function.
-# Note that on start-up, we must wait for the Rust webserver to accept connections before considering the
-# container ready.
-#
-# Here, we also
-# - specify the container image, which has the TGI server and model weights baked in
-# - specify how many H100s we need per container
-# - specify that each container is allowed to handle up to 15 inputs (i.e. requests) simultaneously
-# - keep idle containers for 10 minutes before spinning down
-# - increase the timeout limit
-
-
-GPU_CONFIG = modal.gpu.H100(count=2)  # 2 H100s
-
-
-@app.cls(
-    gpu=GPU_CONFIG,
-    allow_concurrent_inputs=15,
-    container_idle_timeout=60 * 10,
-    timeout=60 * 60,
-    image=tgi_image,
-)
-class Model:
-    @modal.enter()
-    def start_server(self):
-        import socket
-        import time
-
-        from text_generation import AsyncClient
-
-        self.launcher = subprocess.Popen(
-            ["text-generation-launcher"] + LAUNCH_FLAGS,
-        )
-        self.client = AsyncClient("http://127.0.0.1:8000", timeout=60)
-        self.template = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>
-
-{user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-"""
-
-        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
-        def webserver_ready():
-            try:
-                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
-                return True
-            except (socket.timeout, ConnectionRefusedError):
-                # Check if launcher webserving process has exited.
-                # If so, a connection can never be made.
-                retcode = self.launcher.poll()
-                if retcode is not None:
-                    raise RuntimeError(
-                        f"launcher exited unexpectedly with code {retcode}"
-                    )
-                return False
-
-        while not webserver_ready():
-            time.sleep(1.0)
-
-        print("Webserver ready!")
-
-    @modal.exit()
-    def terminate_server(self):
-        self.launcher.terminate()
-
-    @modal.method()
-    async def generate(self, question: str):
-        prompt = self.template.format(user=question)
-        result = await self.client.generate(
-            prompt, max_new_tokens=1024, stop_sequences=["<|eot_id|>"]
-        )
-
-        return result.generated_text
-
-    @modal.method()
-    async def generate_stream(self, question: str):
-        prompt = self.template.format(user=question)
-
-        async for response in self.client.generate_stream(
-            prompt, max_new_tokens=1024, stop_sequences=["<|eot_id|>"]
-        ):
-            if (
-                not response.token.special
-                and response.token.text != "<|eot_id|>"
-            ):
-                yield response.token.text
-
-
-# ## Run the model
-# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
-# our remote function. You can run this script locally with `modal run text_generation_inference.py`.
-@app.local_entrypoint()
-def main(prompt: str = None):
-    if prompt is None:
-        prompt = "Implement a Python function to compute the Fibonacci numbers."
-    print(Model().generate.remote(prompt))
-
-
-# ## Serve the model
-# Once we deploy this model with `modal deploy text_generation_inference.py`, we can serve it
-# behind an ASGI app front-end. The front-end code (a single file of Alpine.js) is available
-# [here](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-frontend/index.html).
-#
-# You can try our deployment [here](https://modal.chat).
-
-frontend_path = Path(__file__).parent.parent / "llm-frontend"
-
-
-@app.function(
-    mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")],
-    keep_warm=1,
-    allow_concurrent_inputs=10,
-    timeout=60 * 10,
-)
-@modal.asgi_app(label="llama3")
-def tgi_app():
-    import json
-
-    import fastapi
-    import fastapi.staticfiles
-    from fastapi.responses import StreamingResponse
-
-    web_app = fastapi.FastAPI()
-
-    @web_app.get("/stats")
-    async def stats():
-        stats = await Model().generate_stream.get_current_stats.aio()
-        return {
-            "backlog": stats.backlog,
-            "num_total_runners": stats.num_total_runners,
-            "model": MODEL_ID,
-        }
-
-    @web_app.get("/completion/{question}")
-    async def completion(question: str):
-        from urllib.parse import unquote
-
-        async def generate():
-            async for text in Model().generate_stream.remote_gen.aio(
-                unquote(question)
-            ):
-                yield f"data: {json.dumps(dict(text=text), ensure_ascii=False)}\n\n"
-
-        return StreamingResponse(generate(), media_type="text/event-stream")
-
-    web_app.mount(
-        "/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True)
-    )
-    return web_app
-
-
-# ## Invoke the model from other apps
-# Once the model is deployed, we can invoke inference from other apps, sharing the same pool
-# of GPU containers with all other apps we might need.
-#
-# ```
-# $ python
-# >>> import modal
-# >>> f = modal.Function.lookup("example-tgi-Meta-Llama-3-70B-Instruct", "Model.generate")
-# >>> f.remote("What is the story about the fox and grapes?")
-# 'The story about the fox and grapes ...
-# ```
diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py
deleted file mode 100644
index 1efbd9b1c..000000000
--- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# ---
-# deploy: true
-# tags: ["use-case-lm-inference"]
-# ---
-# # Hosting Mixtral 8x7B with Text Generation Inference (TGI)
-#
-# In this example, we show how to run an optimized inference server using [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
-# with performance advantages over standard text generation pipelines including:
-# - continuous batching, so multiple generations can take place at the same time on a single container
-# - PagedAttention, which applies memory paging to the attention mechanism's key-value cache, increasing throughput
-#
-# This example deployment, [accessible here](https://modal-labs--tgi-mixtral.modal.run), can serve Mixtral 8x7B on four 40GB A100s, with
-# up to 500 tokens/s of throughput and per-token latency of 78ms.
-
-# ## Setup
-#
-# First we import the components we need from `modal`.
-
-import subprocess
-from pathlib import Path
-
-import modal
-
-# Next, we set which model to serve, taking care to specify the GPU configuration required
-# to fit the model into VRAM, and the quantization method (`bitsandbytes` or `gptq`) if desired.
-# Note that quantization does degrade token generation performance significantly.
-#
-# Any model supported by TGI can be chosen here.
-
-GPU_CONFIG = modal.gpu.A100(size="40GB", count=4)
-MODEL_ID = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
-MODEL_REVISION = "286ae6737d048ad1d965c2e830864df02db50f2f"
-# Add `["--quantize", "gptq"]` for TheBloke GPTQ models.
-LAUNCH_FLAGS = [
-    "--model-id",
-    MODEL_ID,
-    "--revision",
-    MODEL_REVISION,
-    "--port",
-    "8000",
-]
-
-# ## Define a container image
-#
-# We want to create a Modal image which has the Hugging Face model cache pre-populated.
-# The benefit of this is that the container no longer has to re-download the model from Huggingface -
-# instead, it will take advantage of Modal's internal filesystem for faster cold starts.
-# The 95GB model can be loaded in as little as 70 seconds.
-#
-# ### Download the weights
-# We can use the included utilities to download the model weights (and convert to safetensors, if necessary)
-# as part of the image build.
-
-
-def download_model():
-    subprocess.run(
-        [
-            "text-generation-server",
-            "download-weights",
-            MODEL_ID,
-            "--revision",
-            MODEL_REVISION,
-        ]
-    )
-
-
-# ### Image definition
-# We’ll start from a Docker Hub image recommended by TGI, and override the default `ENTRYPOINT` for
-# Modal to run its own which enables seamless serverless deployments.
-#
-# Next we run the download step to pre-populate the image with our model weights.
-#
-# Finally, we install the `text-generation` client to interface with TGI's Rust webserver over `localhost`.
-
-tgi_image = (
-    modal.Image.from_registry(
-        "ghcr.io/huggingface/text-generation-inference:1.3.3"
-    )
-    .dockerfile_commands("ENTRYPOINT []")
-    .run_function(
-        download_model,
-        timeout=60 * 20,
-    )
-    .pip_install("text-generation")
-)
-
-app = modal.App("example-tgi-mixtral")
-
-
-# ## The model class
-#
-# The inference function is best represented with Modal's [class syntax](https://modal.com/docs/guide/lifecycle-functions).
-# The class syntax is a special representation for a Modal function which splits logic into two parts:
-# 1. the `@enter()` function, which runs once per container when it starts up, and
-# 2. the `@method()` function, which runs per inference request.
-#
-# This means the model is loaded into the GPUs, and the backend for TGI is launched just once when each
-# container starts, and this state is cached for each subsequent invocation of the function.
-# Note that on start-up, we must wait for the Rust webserver to accept connections before considering the
-# container ready.
-#
-# Here, we also
-# - specify how many A100s we need per container
-# - specify that each container is allowed to handle up to 10 inputs (i.e. requests) simultaneously
-# - keep idle containers for 10 minutes before spinning down
-# - lift the timeout of each request.
-
-
-@app.cls(
-    gpu=GPU_CONFIG,
-    allow_concurrent_inputs=10,
-    container_idle_timeout=60 * 10,
-    timeout=60 * 60,
-    image=tgi_image,
-)
-class Model:
-    @modal.enter()
-    def start_server(self):
-        import socket
-        import time
-
-        from text_generation import AsyncClient
-
-        self.launcher = subprocess.Popen(
-            ["text-generation-launcher"] + LAUNCH_FLAGS
-        )
-        self.client = AsyncClient("http://127.0.0.1:8000", timeout=60)
-        self.template = "[INST] {user} [/INST]"
-
-        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
-        webserver_ready = False
-        while not webserver_ready:
-            try:
-                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
-                webserver_ready = True
-                print("Webserver ready!")
-            except (socket.timeout, ConnectionRefusedError):
-                # If launcher process exited, a connection can never be made.
-                if (retcode := self.launcher.poll()) is not None:
-                    raise RuntimeError(f"launcher exited with code {retcode}")
-                time.sleep(1.0)
-
-    @modal.exit()
-    def terminate_server(self):
-        self.launcher.terminate()
-
-    @modal.method()
-    async def generate(self, question: str):
-        prompt = self.template.format(user=question)
-        result = await self.client.generate(prompt, max_new_tokens=1024)
-
-        return result.generated_text
-
-    @modal.method()
-    async def generate_stream(self, question: str):
-        prompt = self.template.format(user=question)
-
-        async for response in self.client.generate_stream(
-            prompt, max_new_tokens=1024
-        ):
-            if not response.token.special:
-                yield response.token.text
-
-
-# ## Run the model
-# We define a [`local_entrypoint`](https://modal.com/docs/guide/apps#entrypoints-for-ephemeral-apps) to invoke
-# our remote function. You can run this script locally with `modal run tgi_mixtral.py`.
-@app.local_entrypoint()
-def main():
-    print(
-        Model().generate.remote(
-            "Implement a Python function to compute the Fibonacci numbers."
-        )
-    )
-
-
-# ## Serve the model
-# Once we deploy this model with `modal deploy tgi_mixtral.py`, we can serve it
-# behind an ASGI app front-end. The front-end code (a single file of Alpine.js) is available
-# [here](https://github.com/modal-labs/modal-examples/blob/main/06_gpu_and_ml/llm-frontend/index.html).
-#
-# You can try our deployment [here](https://modal-labs--tgi-mixtral.modal.run).
-
-frontend_path = Path(__file__).parent.parent / "llm-frontend"
-frontend_image = modal.Image.debian_slim().pip_install("fastapi[standard]")
-
-
-@app.function(
-    image=frontend_image,
-    mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")],
-    keep_warm=1,
-    allow_concurrent_inputs=20,
-    timeout=60 * 10,
-)
-@modal.asgi_app(label="tgi-mixtral")
-def tgi_mixtral():
-    import json
-
-    import fastapi
-    import fastapi.staticfiles
-    from fastapi.responses import StreamingResponse
-
-    web_app = fastapi.FastAPI()
-
-    @web_app.get("/stats")
-    async def stats():
-        stats = await Model().generate_stream.get_current_stats.aio()
-        return {
-            "backlog": stats.backlog,
-            "num_total_runners": stats.num_total_runners,
-            "model": MODEL_ID + " (TGI)",
-        }
-
-    @web_app.get("/completion/{question}")
-    async def completion(question: str):
-        from urllib.parse import unquote
-
-        async def generate():
-            async for text in Model().generate_stream.remote_gen.aio(
-                unquote(question)
-            ):
-                yield f"data: {json.dumps(dict(text=text), ensure_ascii=False)}\n\n"
-
-        return StreamingResponse(generate(), media_type="text/event-stream")
-
-    web_app.mount(
-        "/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True)
-    )
-    return web_app
-
-
-# ## Invoke the model from other apps
-# Once the model is deployed, we can invoke inference from other apps, sharing the same pool
-# of GPU containers with all other apps we might need.
-#
-# ```
-# $ python
-# >>> import modal
-# >>> f = modal.Function.lookup("example-tgi-mixtral", "Model.generate")
-# >>> f.remote("What is the story about the fox and grapes?")
-# 'The story about the fox and grapes ...
-# ```
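For reference, the streaming method in these deleted examples can also be consumed directly from another Python process, without going through the ASGI front-end. The snippet below is an illustrative sketch rather than code from the diff: it assumes `tgi_mixtral.py` has been deployed under its default app name `example-tgi-mixtral`, and it reuses the `modal.Function.lookup` and `remote_gen` calls that the examples themselves rely on.

```python
# Minimal client-side sketch (not part of the deleted files): stream tokens from the
# deployed Mixtral app. Assumes `modal deploy tgi_mixtral.py` has already been run,
# so the app "example-tgi-mixtral" and its `Model.generate_stream` method exist.
import modal

generate_stream = modal.Function.lookup("example-tgi-mixtral", "Model.generate_stream")

# `remote_gen` iterates over the remote generator's yielded tokens as they arrive.
for token in generate_stream.remote_gen("Tell the story of the fox and the grapes."):
    print(token, end="", flush=True)
print()
```

The same pattern would apply to the LLaMA 3 example by swapping in its app name (`example-tgi-` plus the model's short name).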