From 69e2a46cb16bf30755009b5e438f62585e8d80bb Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 15:51:24 +0200
Subject: [PATCH 1/3] Add GPU support for TTS model

---
 src/document_to_podcast/inference/text_to_speech.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/document_to_podcast/inference/text_to_speech.py b/src/document_to_podcast/inference/text_to_speech.py
index 3822751..c49fceb 100644
--- a/src/document_to_podcast/inference/text_to_speech.py
+++ b/src/document_to_podcast/inference/text_to_speech.py
@@ -7,9 +7,10 @@ def _speech_generation_parler(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_description: str,
+    device: str,
 ) -> np.ndarray:
-    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids
-    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids.to(device)
+    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
     waveform = generation.cpu().numpy().squeeze()
@@ -22,6 +23,7 @@ def text_to_speech(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_profile: str,
+    device: str = "cpu",
 ) -> np.ndarray:
     """
     Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern.
@@ -34,11 +36,14 @@ def text_to_speech(
         model (PreTrainedModel): The model used for generating the waveform.
         tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model.
         speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile.
+        device (str): The device to compute the generation on, such as "cuda:0" or "cpu".
     Returns:
         numpy array: The waveform of the speech as a 2D numpy array
     """
     model_id = model.config.name_or_path
     if "parler" in model_id:
-        return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)
+        return _speech_generation_parler(
+            input_text, model, tokenizer, speaker_profile, device
+        )
     else:
         raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")

From 1a202422d1066d2c1f325a0705a428fa063cbdfb Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 15:51:41 +0200
Subject: [PATCH 2/3] Use GPU if available in demo app

---
 demo/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/demo/app.py b/demo/app.py
index 6356b74..3c4ca9a 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -2,6 +2,7 @@
 from pathlib import Path

 import streamlit as st
+import torch

 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -35,6 +36,8 @@
     "2": "Jon's voice is calm with very clear audio and no background noise.",
 }

+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+

 @st.cache_resource
 def load_text_to_text_model():
@@ -45,7 +48,7 @@ def load_text_to_speech_model_and_tokenizer():
-    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", device)


 st.title("Document To Podcast")
@@ -117,6 +120,7 @@ def load_text_to_speech_model_and_tokenizer():
                     speech_model,
                     speech_tokenizer,
                     SPEAKER_DESCRIPTIONS[speaker_id],
+                    device=device,
                 )
                 st.audio(speech, sample_rate=speech_model.config.sampling_rate)
                 text = ""

From bacb3fc32f11969a12dcff794f3e635129437180 Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 19:03:31 +0200
Subject: [PATCH 3/3] Use GPU if available for text-to-text in demo app

---
 demo/app.py                                         |  3 ++-
 src/document_to_podcast/inference/model_loaders.py  | 10 ++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/demo/app.py b/demo/app.py
index 3c4ca9a..61200db 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -42,7 +42,8 @@
 @st.cache_resource
 def load_text_to_text_model():
     return load_llama_cpp_model(
-        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
+        device=device,
     )


diff --git a/src/document_to_podcast/inference/model_loaders.py b/src/document_to_podcast/inference/model_loaders.py
index a72b2c3..f89a4ce 100644
--- a/src/document_to_podcast/inference/model_loaders.py
+++ b/src/document_to_podcast/inference/model_loaders.py
@@ -5,19 +5,18 @@
 from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase


-def load_llama_cpp_model(
-    model_id: str,
-) -> Llama:
+def load_llama_cpp_model(model_id: str, device: str = "cpu") -> Llama:
     """
     Loads the given model_id using Llama.from_pretrained.

     Examples:
         >>> model = load_llama_cpp_model(
-            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")
+            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf", "cpu")

     Args:
         model_id (str): The model id to load.
             Format is expected to be `{org}/{repo}/{filename}`.
+        device (str): The device to load the model on, such as "cuda:0" or "cpu".

     Returns:
         Llama: The loaded model.
@@ -26,8 +25,11 @@ def load_llama_cpp_model(
     model = Llama.from_pretrained(
         repo_id=f"{org}/{repo}",
         filename=filename,
+        # -1 means offload all layers to the GPU; set a positive number instead to offload only that many layers and keep the rest on the CPU
+        n_gpu_layers=0 if device == "cpu" else -1,
         # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
         n_ctx=0,
+        verbose=True,
     )
     return model
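
Reviewer note (not part of the patches): the sketch below shows how the new device argument threads through the TTS path end to end, roughly as the demo app now does it. It assumes load_parler_tts_model_and_tokenizer lives in document_to_podcast.inference.model_loaders and returns a (model, tokenizer) pair, as its name and the demo's usage suggest; the input text and speaker description are only examples.

    import torch

    from document_to_podcast.inference.model_loaders import (
        load_parler_tts_model_and_tokenizer,
    )
    from document_to_podcast.inference.text_to_speech import text_to_speech

    # Same device selection the demo app now performs at module level.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Load the Parler TTS model and tokenizer onto the chosen device.
    speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
        "parler-tts/parler-tts-mini-v1", device
    )

    # Pass the same device so the tokenized inputs end up next to the model weights.
    waveform = text_to_speech(
        "Welcome to Document To Podcast!",  # example input text
        speech_model,
        speech_tokenizer,
        "Jon's voice is calm with very clear audio and no background noise.",
        device=device,
    )
    print(waveform.shape)  # numpy array of audio samples

Keeping device as a plain string (rather than a torch.device) works here because both the tokenizer's .to(device) call and the module-level check in the demo accept strings like "cuda:0" or "cpu".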
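
Reviewer note (not part of the patches): on the llama.cpp side the device string is not used to move tensors; it is mapped onto llama-cpp-python's n_gpu_layers. The sketch below shows the all-or-nothing mapping the loader now uses, plus a hand-rolled partial offload for GPUs with limited VRAM, which the loader itself does not expose. It reuses the same OLMoE GGUF as the demo and assumes llama-cpp-python was built with GPU (e.g. CUDA) support, otherwise the layers stay on the CPU.

    from llama_cpp import Llama

    from document_to_podcast.inference.model_loaders import load_llama_cpp_model

    # device="cuda:0" -> n_gpu_layers=-1 (offload every layer); device="cpu" -> n_gpu_layers=0.
    model = load_llama_cpp_model(
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
        device="cuda:0",
    )

    # If the model does not fit in VRAM, llama-cpp-python also accepts a positive
    # layer count, keeping the remaining layers on the CPU. The loader does not
    # expose this, so call Llama.from_pretrained directly:
    partial = Llama.from_pretrained(
        repo_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
        filename="olmoe-1b-7b-0924-instruct-q8_0.gguf",
        n_gpu_layers=16,  # offload the first 16 layers, keep the rest on the CPU
        n_ctx=0,          # 0 = use the model's own context length, as in the patch
    )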