From 69e2a46cb16bf30755009b5e438f62585e8d80bb Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 15:51:24 +0200
Subject: [PATCH 1/3] Add GPU support for TTS model

---
 src/document_to_podcast/inference/text_to_speech.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/document_to_podcast/inference/text_to_speech.py b/src/document_to_podcast/inference/text_to_speech.py
index 3822751..c49fceb 100644
--- a/src/document_to_podcast/inference/text_to_speech.py
+++ b/src/document_to_podcast/inference/text_to_speech.py
@@ -7,9 +7,10 @@ def _speech_generation_parler(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_description: str,
+    device: str,
 ) -> np.ndarray:
-    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids
-    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids.to(device)
+    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
     waveform = generation.cpu().numpy().squeeze()
@@ -22,6 +23,7 @@ def text_to_speech(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_profile: str,
+    device: str = "cpu",
 ) -> np.ndarray:
     """
     Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern.
@@ -34,11 +36,14 @@ def text_to_speech(
         model (PreTrainedModel): The model used for generating the waveform.
         tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model.
         speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile.
+        device (str): The device to compute the generation on, such as "cuda:0" or "cpu".
     Returns:
         numpy array: The waveform of the speech as a 2D numpy array
     """
     model_id = model.config.name_or_path
     if "parler" in model_id:
-        return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)
+        return _speech_generation_parler(
+            input_text, model, tokenizer, speaker_profile, device
+        )
     else:
         raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")

From 1a202422d1066d2c1f325a0705a428fa063cbdfb Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 15:51:41 +0200
Subject: [PATCH 2/3] Use GPU if available in demo app

---
 demo/app.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/demo/app.py b/demo/app.py
index 6356b74..3c4ca9a 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -2,6 +2,7 @@
 from pathlib import Path

 import streamlit as st
+import torch

 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -35,6 +36,8 @@
     "2": "Jon's voice is calm with very clear audio and no background noise.",
 }

+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+

 @st.cache_resource
 def load_text_to_text_model():
@@ -45,7 +48,7 @@ def load_text_to_speech_model_and_tokenizer():
-    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", device)


 st.title("Document To Podcast")
@@ -117,6 +120,7 @@ def load_text_to_speech_model_and_tokenizer():
                     speech_model,
                     speech_tokenizer,
                     SPEAKER_DESCRIPTIONS[speaker_id],
+                    device=device,
                 )
                 st.audio(speech, sample_rate=speech_model.config.sampling_rate)
                 text = ""

From bacb3fc32f11969a12dcff794f3e635129437180 Mon Sep 17 00:00:00 2001
From: Kostis-S-Z
Date: Fri, 6 Dec 2024 19:03:31 +0200
Subject: [PATCH 3/3] Use GPU if available for text-to-text in demo app

---
 demo/app.py                                         |  3 ++-
 src/document_to_podcast/inference/model_loaders.py  | 10 ++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/demo/app.py b/demo/app.py
index 3c4ca9a..61200db 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -42,7 +42,8 @@
 @st.cache_resource
 def load_text_to_text_model():
     return load_llama_cpp_model(
-        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
+        device=device,
     )


diff --git a/src/document_to_podcast/inference/model_loaders.py b/src/document_to_podcast/inference/model_loaders.py
index a72b2c3..f89a4ce 100644
--- a/src/document_to_podcast/inference/model_loaders.py
+++ b/src/document_to_podcast/inference/model_loaders.py
@@ -5,19 +5,18 @@
 from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase


-def load_llama_cpp_model(
-    model_id: str,
-) -> Llama:
+def load_llama_cpp_model(model_id: str, device: str = "cpu") -> Llama:
     """
     Loads the given model_id using Llama.from_pretrained.

     Examples:
         >>> model = load_llama_cpp_model(
-            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")
+            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf", "cpu")

     Args:
         model_id (str): The model id to load.
             Format is expected to be `{org}/{repo}/{filename}`.
+        device (str): The device to load the model on, such as "cuda:0" or "cpu".

     Returns:
         Llama: The loaded model.
@@ -26,8 +25,11 @@ def load_llama_cpp_model(
     model = Llama.from_pretrained(
         repo_id=f"{org}/{repo}",
         filename=filename,
+        # -1 means offload all layers to the GPU; set a positive number instead to offload only that many layers and keep the rest on the CPU
+        n_gpu_layers=0 if device == "cpu" else -1,
         # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
         n_ctx=0,
+        verbose=True,
     )
     return model
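
Reviewer note (not part of the patches): the sketch below shows how the new device argument threads through the TTS path end to end, roughly as the demo app now does it. It assumes load_parler_tts_model_and_tokenizer lives in document_to_podcast.inference.model_loaders and returns a (model, tokenizer) pair, as its name and the demo's usage suggest; the input text and speaker description are only examples.

    import torch

    from document_to_podcast.inference.model_loaders import (
        load_parler_tts_model_and_tokenizer,
    )
    from document_to_podcast.inference.text_to_speech import text_to_speech

    # Same device selection the demo app now performs at module level.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Load the Parler TTS model and tokenizer onto the chosen device.
    speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
        "parler-tts/parler-tts-mini-v1", device
    )

    # Pass the same device so the tokenized inputs end up next to the model weights.
    waveform = text_to_speech(
        "Welcome to Document To Podcast!",  # example input text
        speech_model,
        speech_tokenizer,
        "Jon's voice is calm with very clear audio and no background noise.",
        device=device,
    )
    print(waveform.shape)  # numpy array of audio samples

Keeping device as a plain string (rather than a torch.device) works here because both the tokenizer's .to(device) call and the module-level check in the demo accept strings like "cuda:0" or "cpu".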
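
Reviewer note (not part of the patches): on the llama.cpp side the device string is not used to move tensors; it is mapped onto llama-cpp-python's n_gpu_layers. The sketch below shows the all-or-nothing mapping the loader now uses, plus a hand-rolled partial offload for GPUs with limited VRAM, which the loader itself does not expose. It reuses the same OLMoE GGUF as the demo and assumes llama-cpp-python was built with GPU (e.g. CUDA) support, otherwise the layers stay on the CPU.

    from llama_cpp import Llama

    from document_to_podcast.inference.model_loaders import load_llama_cpp_model

    # device="cuda:0" -> n_gpu_layers=-1 (offload every layer); device="cpu" -> n_gpu_layers=0.
    model = load_llama_cpp_model(
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
        device="cuda:0",
    )

    # If the model does not fit in VRAM, llama-cpp-python also accepts a positive
    # layer count, keeping the remaining layers on the CPU. The loader does not
    # expose this, so call Llama.from_pretrained directly:
    partial = Llama.from_pretrained(
        repo_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
        filename="olmoe-1b-7b-0924-instruct-q8_0.gguf",
        n_gpu_layers=16,  # offload the first 16 layers, keep the rest on the CPU
        n_ctx=0,          # 0 = use the model's own context length, as in the patch
    )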