mozilla-ai · Kostis-S-Z · Dec 4, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+# Generated audio files
+*.wav
diff --git a/demo/app.py b/demo/app.py
@@ -6,6 +6,7 @@
 from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from opennotebookllm.inference.model_loaders import load_llama_cpp_model
 from opennotebookllm.inference.text_to_text import text_to_text_stream
+from opennotebookllm.podcast_maker.script_to_audio import script_to_audio
 
 PODCAST_PROMPT = """
 You are a helpful podcast writer.
@@ -75,10 +76,19 @@
         if st.button("Generate Podcast Script"):
             with st.spinner("Generating Podcast Script..."):
                 text = ""
+                final_script = ""
                 for chunk in text_to_text_stream(
                     clean_text, model, system_prompt=system_prompt.strip()
                 ):
                     text += chunk
+                    final_script += chunk
                     if text.endswith("\n"):
                         st.write(text)
                         text = ""
+            # Streamlit reruns the whole script on every interaction and a button
+            # is only True during the run triggered by its own click, so the
+            # script has to be persisted for the "Generate Audio" run to see it.
+            st.session_state["final_script"] = final_script
+
+        # Kept outside the "Generate Podcast Script" branch: nested inside it,
+        # this button could never fire because the outer button is False again
+        # on the rerun caused by clicking this one.
+        if st.session_state.get("final_script") and st.button("Generate Audio"):
+            filename = "demo_podcast.wav"
+            with st.spinner("Generating Audio..."):
+                script_to_audio(st.session_state["final_script"], filename=filename)
+
+            st.audio(filename)
diff --git a/src/opennotebookllm/inference/model_loaders.py b/src/opennotebookllm/inference/model_loaders.py
@@ -1,4 +1,8 @@
+from typing import Tuple
+
 from llama_cpp import Llama
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedModel
 
 
 def load_llama_cpp_model(
@@ -26,3 +30,26 @@ def load_llama_cpp_model(
         n_ctx=0,
     )
     return model
+
+
+def load_parler_tts_model_and_tokenizer(
+    model_id: str, device: str = "cpu"
+) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+    """
+    Loads a Parler-TTS model together with its tokenizer.
+
+    Examples:
+        >>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+
+    Args:
+        model_id (str): The model id to load. It is passed unchanged to both
+            `ParlerTTSForConditionalGeneration.from_pretrained` and
+            `AutoTokenizer.from_pretrained`.
+        device (str): The device to load the model on, such as "cuda:0" or "cpu".
+
+    Returns:
+        Tuple[PreTrainedModel, PreTrainedTokenizer]: The loaded model, moved to
+            `device`, and the tokenizer loaded from the same model id.
+    """
+    tts_model = ParlerTTSForConditionalGeneration.from_pretrained(model_id)
+    tts_model = tts_model.to(device)
+    tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
+    return tts_model, tts_tokenizer
diff --git a/src/opennotebookllm/inference/text_to_speech.py b/src/opennotebookllm/inference/text_to_speech.py
@@ -0,0 +1,49 @@
+import numpy as np
+from src.opennotebookllm.inference.model_loaders import (
+    load_parler_tts_model_and_tokenizer,
+)
+
+
+default_speaker_1_description = "Laura's voice is exciting and fast in delivery with very clear audio and no background noise."
+default_speaker_2_description = (
+    "Jon's voice is calm with very clear audio and no background noise."
+)
+
+
+# Loaded (model, tokenizer) pairs keyed by model id, so that generating many
+# utterances in a row does not reload the weights on every single call.
+_PARLER_MODELS = {}
+
+
+def _speech_generation_parler(
+    input_text: str, model_id: str, speaker_description: str
+) -> np.ndarray:
+    """
+    Generates a speech waveform with a Parler-TTS model.
+
+    Args:
+        input_text (str): The text to be spoken.
+        model_id (str): The Parler-TTS model id to load.
+        speaker_description (str): A description in natural language of how the voice should sound.
+
+    Returns:
+        np.ndarray: The generated audio, moved to the CPU and with singleton
+            dimensions squeezed out.
+    """
+    if model_id not in _PARLER_MODELS:
+        _PARLER_MODELS[model_id] = load_parler_tts_model_and_tokenizer(model_id)
+    model, tokenizer = _PARLER_MODELS[model_id]
+
+    # The voice description is passed as `input_ids`, while the text to speak
+    # is passed as `prompt_input_ids`.
+    prompt_input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    waveform = generation.cpu().numpy().squeeze()
+
+    return waveform
+
+
+def text_to_speech(
+    input_text: str,
+    model_id: str,
+    speaker_description: str = default_speaker_1_description,
+) -> np.ndarray:
+    """
+    Generates a speech waveform using the input_text, a speaker description and a given model id.
+
+    Examples:
+        >>> waveform = text_to_speech("Welcome to our amazing podcast", "parler-tts/parler-tts-mini-v1", "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.")
+
+    Args:
+        input_text (str): The text to convert to speech.
+        model_id (str): The id of the TTS model to use. Only ids containing
+            "parler" are currently supported.
+        speaker_description (str): A description in natural language of how we want the voice to sound.
+
+    Returns:
+        np.ndarray: The waveform of the speech as a numpy array, as produced by
+            the model-specific generation function.
+
+    Raises:
+        NotImplementedError: If model_id does not contain "parler".
+    """
+    if "parler" not in model_id:
+        raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")
+    return _speech_generation_parler(input_text, model_id, speaker_description)
diff --git a/src/opennotebookllm/podcast_maker/__init__.py b/src/opennotebookllm/podcast_maker/__init__.py
diff --git a/src/opennotebookllm/podcast_maker/script_to_audio.py b/src/opennotebookllm/podcast_maker/script_to_audio.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+from src.opennotebookllm.inference.text_to_speech import (
+    text_to_speech,
+    default_speaker_1_description,
+    default_speaker_2_description,
+)
+from scipy.io.wavfile import write
+
+
+def script_to_audio(
+    script: str,
+    model_id: str = "parler-tts/parler-tts-mini-v1",
+    filename: str = "podcast.wav",
+    sampling_rate: int = 24_000,
+):
+    """
+    Converts a podcast script into speech and writes the result to a WAV file.
+
+    The script is split at every "Speaker <id>:" marker. The text following a
+    marker with id 1 or 2 is synthesized with the matching default speaker
+    description; text before the first marker and text of any other speaker id
+    is ignored. The resulting waveforms are concatenated in script order.
+
+    Examples:
+        >>> script_to_audio("Speaker 1: Welcome! Speaker 2: Glad to be here.", filename="podcast.wav")
+
+    Args:
+        script (str): The podcast script containing "Speaker <id>:" markers.
+        model_id (str): The TTS model id passed on to text_to_speech.
+        filename (str): The path of the WAV file to write.
+        sampling_rate (int): The sampling rate stored in the WAV file.
+
+    Raises:
+        ValueError: If the script contains no text for speaker 1 or speaker 2.
+    """
+    import re  # local import: the module's import block lies outside this function
+
+    speaker_descriptions = {
+        1: default_speaker_1_description,
+        2: default_speaker_2_description,
+    }
+
+    # Splitting on the full "Speaker <id>:" marker (instead of the bare word
+    # "Speaker" and then any colon) keeps colons and the word "Speaker" inside
+    # the spoken text from breaking the parse. With a capturing group the
+    # result alternates: [preamble, id, text, id, text, ...].
+    fields = re.split(r"Speaker\s*(\d+)\s*:", script)
+
+    podcast_waveform = []
+    for speaker_id, speaker_text in zip(fields[1::2], fields[2::2]):
+        description = speaker_descriptions.get(int(speaker_id))
+        speaker_text = speaker_text.strip()
+        if description is None or not speaker_text:
+            continue
+        podcast_waveform.append(text_to_speech(speaker_text, model_id, description))
+
+    if not podcast_waveform:
+        # np.concatenate would fail on an empty list with a far less helpful message.
+        raise ValueError(
+            "No speaker lines found in script; expected 'Speaker 1: ...' or 'Speaker 2: ...'"
+        )
+
+    write(filename, rate=sampling_rate, data=np.concatenate(podcast_waveform))
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,3 +6,18 @@
 @pytest.fixture(scope="session")
 def example_data():
     return Path(__file__).parent.parent / "example_data"
+
+
+@pytest.fixture()
+def tts_prompt():
+    """Provides a short sentence to be used as text-to-speech input."""
+    prompt = "Wow you are really good at writing unit tests!"
+    return prompt
+
+
+@pytest.fixture()
+def tts_speaker_description():
+    """Provides a natural language voice description for text-to-speech tests."""
+    description = "Laura's voice is enthusiastic and fast with a very close recording that has no background noise."
+    return description
+
+
+@pytest.fixture()
+def podcast_script():
+    """Provides a minimal two-speaker script in the "Speaker <id>: <text>" format."""
+    script = "Speaker 1: Welcome to our podcast. Speaker 2: It's great to be here!"
+    return script
diff --git a/tests/unit/inference/test_text_to_speech.py b/tests/unit/inference/test_text_to_speech.py
@@ -0,0 +1,12 @@
+from src.opennotebookllm.inference.text_to_speech import text_to_speech
+from scipy.io.wavfile import write
+
+
+def test_text_to_speech_parler(tts_prompt, tts_speaker_description, tmp_path):
+    """Checks that Parler-TTS yields a non-empty waveform that can be saved as WAV."""
+    waveform = text_to_speech(
+        input_text=tts_prompt,
+        speaker_description=tts_speaker_description,
+        model_id="parler-tts/parler-tts-mini-v1",
+    )
+
+    assert waveform.size > 0
+
+    # Write into pytest's per-test temporary directory so that no audio file is
+    # left behind in the working directory.
+    output_file = tmp_path / "test_parler_tts.wav"
+    write(str(output_file), rate=24_000, data=waveform)
+
+    assert output_file.is_file()
diff --git a/tests/unit/podcast_maker/test_script_to_audio.py b/tests/unit/podcast_maker/test_script_to_audio.py
@@ -0,0 +1,9 @@
+import os
+from src.opennotebookllm.podcast_maker.script_to_audio import script_to_audio
+
+
+def test_parse_script(podcast_script: str, tmp_path):
+    """Checks that script_to_audio writes a non-empty WAV file for a two-speaker script."""
+    # Write into pytest's per-test temporary directory so that no audio file is
+    # left behind in the working directory.
+    filename = str(tmp_path / "test_podcast.wav")
+    script_to_audio(podcast_script, filename=filename)
+
+    assert os.path.isfile(filename)
+    assert os.path.getsize(filename) > 0