Add gpu support #44

Draft · wants to merge 3 commits into main
demo/app.py (7 additions, 2 deletions)
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import streamlit as st
+import torch
 
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
@@ -35,17 +36,20 @@
     "2": "Jon's voice is calm with very clear audio and no background noise.",
 }
 
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
 
 @st.cache_resource
 def load_text_to_text_model():
     return load_llama_cpp_model(
-        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+        model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
+        device=device,
     )
 
 
 @st.cache_resource
 def load_text_to_speech_model_and_tokenizer():
-    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", device)
 
 
 st.title("Document To Podcast")
@@ -117,6 +121,7 @@ def load_text_to_speech_model_and_tokenizer():
                 speech_model,
                 speech_tokenizer,
                 SPEAKER_DESCRIPTIONS[speaker_id],
+                device=device,
             )
             st.audio(speech, sample_rate=speech_model.config.sampling_rate)
             text = ""
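
For reference, the device-selection idiom the app now relies on can be tested standalone; the print statement below is illustrative, not part of the PR:

import torch

# "cuda:0" targets the first visible GPU; torch.cuda.is_available() returns
# False when PyTorch was installed without CUDA support or no GPU is present,
# so the app transparently falls back to CPU inference.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Selected inference device: {device}")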
src/document_to_podcast/inference/model_loaders.py (6 additions, 4 deletions)
@@ -5,19 +5,18 @@
 from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase
 
 
-def load_llama_cpp_model(
-    model_id: str,
-) -> Llama:
+def load_llama_cpp_model(model_id: str, device: str = "cpu") -> Llama:
     """
     Loads the given model_id using Llama.from_pretrained.
 
     Examples:
         >>> model = load_llama_cpp_model(
-            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")
+            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf", "cpu")
 
     Args:
         model_id (str): The model id to load.
             Format is expected to be `{org}/{repo}/{filename}`.
+        device (str): The device to load the model on, such as "cuda:0" or "cpu".
 
     Returns:
         Llama: The loaded model.
@@ -26,8 +25,11 @@ def load_llama_cpp_model(
     model = Llama.from_pretrained(
         repo_id=f"{org}/{repo}",
         filename=filename,
+        # -1 offloads all layers to the GPU; a smaller positive value keeps some layers on the CPU
+        n_gpu_layers=0 if device == "cpu" else -1,
         # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
         n_ctx=0,
+        verbose=True,
     )
     return model
 
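A quick way to sanity-check the loader change locally is the sketch below. It assumes llama-cpp-python was built with GPU (e.g. CUDA) support; on a CPU-only build n_gpu_layers has no effect. The prompt and the create_chat_completion call are only illustrative:

import torch
from document_to_podcast.inference.model_loaders import load_llama_cpp_model

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = load_llama_cpp_model(
    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
    device=device,
)
# With verbose=True, llama.cpp logs how many layers were offloaded to the GPU.
response = model.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}]
)
print(response["choices"][0]["message"]["content"])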
src/document_to_podcast/inference/text_to_speech.py (8 additions, 3 deletions)
@@ -7,9 +7,10 @@ def _speech_generation_parler(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_description: str,
+    device: str,
 ) -> np.ndarray:
-    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids
-    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids.to(device)
+    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
 
     generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
     waveform = generation.cpu().numpy().squeeze()
@@ -22,6 +23,7 @@ def text_to_speech(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizerBase,
     speaker_profile: str,
+    device: str = "cpu",
 ) -> np.ndarray:
     """
     Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern.
@@ -34,11 +36,14 @@
         model (PreTrainedModel): The model used for generating the waveform.
         tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model.
         speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile.
+        device (str): The device to compute the generation on, such as "cuda:0" or "cpu".
     Returns:
         numpy array: The waveform of the speech as a 2D numpy array
     """
     model_id = model.config.name_or_path
     if "parler" in model_id:
-        return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)
+        return _speech_generation_parler(
+            input_text, model, tokenizer, speaker_profile, device
+        )
     else:
         raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")
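
End to end, the TTS path can be exercised with a sketch like the one below. The speaker description and the soundfile dependency are illustrative, not part of this PR:

import torch
import soundfile as sf  # illustrative choice for saving; any WAV writer works
from document_to_podcast.inference.model_loaders import (
    load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.inference.text_to_speech import text_to_speech

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model weights land on `device` at load time; text_to_speech then moves the
# tokenized inputs to the same device before calling model.generate.
model, tokenizer = load_parler_tts_model_and_tokenizer(
    "parler-tts/parler-tts-mini-v1", device
)
waveform = text_to_speech(
    "Welcome to Document To Podcast.",
    model,
    tokenizer,
    speaker_profile="Laura's voice is calm with very clear audio.",
    device=device,
)
sf.write("line_0.wav", waveform, samplerate=model.config.sampling_rate)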