mozilla-ai · Kostis-S-Z · Dec 4, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/.github/setup.sh b/.github/setup.sh
@@ -0,0 +1,5 @@
+python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
+git clone https://github.com/descriptinc/audiotools
+python -m pip install ./audiotools  # explicit path: a bare "audiotools" is resolved as an index package name, not the clone
+python -m pip install -e .
+rm -rf audiotools
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,7 @@ cython_debug/
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+# Generated audio files
+*.wav
diff --git a/demo/app.py b/demo/app.py
@@ -1,12 +1,17 @@
+import re
 from pathlib import Path
 
 import streamlit as st
-from huggingface_hub import list_repo_files
 
 from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
-from opennotebookllm.inference.model_loaders import load_llama_cpp_model
+from opennotebookllm.inference.model_loaders import (
+    load_llama_cpp_model,
+    load_parler_tts_model_and_tokenizer,
+)
+from opennotebookllm.inference.text_to_speech import _speech_generation_parler
 from opennotebookllm.inference.text_to_text import text_to_text_stream
 
+
 PODCAST_PROMPT = """
 You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
 Speaker 1: Laura, the main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
@@ -18,74 +23,100 @@
 - Format output as a JSON conversation.
 Example:
 {
-  "Speaker 1": "Welcome to our podcast! Today, we’re exploring...",
-  "Speaker 2": "Hi Laura! I’m excited to hear about this. Can you explain...",
+  "Speaker 1": "Welcome to our podcast! Today, we're exploring...",
+  "Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
   "Speaker 1": "Sure! Imagine it like this...",
-  "Speaker 2": "Oh, that’s cool! But how does..."
+  "Speaker 2": "Oh, that's cool! But how does..."
 }
 """
 
-CURATED_REPOS = [
-    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
-    "MaziyarPanahi/SmolLM2-1.7B-Instruct-GGUF",
-    # system prompt seems to be ignored for this model.
-    # "microsoft/Phi-3-mini-4k-instruct-gguf",
-    "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
-    "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
-    "Qwen/Qwen2.5-3B-Instruct-GGUF",
-]
+SPEAKER_DESCRIPTIONS = {
+    "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
+    "2": "Jon's voice is calm with very clear audio and no background noise.",
+}
+
+
+@st.cache_resource
+def load_text_to_text_model():
+    """Return the llama.cpp text-to-text model; st.cache_resource reuses it across reruns."""
+    model_id = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+    return load_llama_cpp_model(model_id=model_id)
+
+
+@st.cache_resource  # keep the loaded (model, tokenizer) pair across Streamlit reruns
+def load_text_to_speech_model_and_tokenizer():
+    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+
+
+st.title("Document To Podcast")
+
+st.header("Uploading Data")
 
 uploaded_file = st.file_uploader(
     "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
 )
 
+
 if uploaded_file is not None:
+    st.divider()
+    st.header("Loading and Cleaning Data")
+    st.markdown(
+        "[API Reference for data_cleaners](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.preprocessing.data_cleaners)"
+    )
+
     extension = Path(uploaded_file.name).suffix
 
     col1, col2 = st.columns(2)
 
     raw_text = DATA_LOADERS[extension](uploaded_file)
     with col1:
-        st.title("Raw Text")
+        st.subheader("Raw Text")
         st.text_area(f"Total Length: {len(raw_text)}", f"{raw_text[:500]} . . .")
 
     clean_text = DATA_CLEANERS[extension](raw_text)
     with col2:
-        st.title("Cleaned Text")
+        st.subheader("Cleaned Text")
         st.text_area(f"Total Length: {len(clean_text)}", f"{clean_text[:500]} . . .")
 
-    repo_name = st.selectbox("Select Repo", CURATED_REPOS)
-    model_name = st.selectbox(
-        "Select Model",
-        [
-            x
-            for x in list_repo_files(repo_name)
-            if ".gguf" in x.lower() and ("q8" in x.lower() or "fp16" in x.lower())
-        ],
-        index=None,
+    st.divider()
+    st.header("Downloading and Loading models")
+    st.markdown(
+        "[API Reference for model_loaders](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.inference.model_loaders)"
     )
-    if model_name:
-        with st.spinner("Downloading and Loading Model..."):
-            model = load_llama_cpp_model(model_id=f"{repo_name}/{model_name}")
-
-        # ~4 characters per token is considered a reasonable default.
-        max_characters = model.n_ctx() * 4
-        if len(clean_text) > max_characters:
-            st.warning(
-                f"Input text is too big ({len(clean_text)})."
-                f" Using only a subset of it ({max_characters})."
-            )
-            clean_text = clean_text[:max_characters]
-
-        system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)
-
-        if st.button("Generate Podcast Script"):
-            with st.spinner("Generating Podcast Script..."):
-                text = ""
-                for chunk in text_to_text_stream(
-                    clean_text, model, system_prompt=system_prompt.strip()
-                ):
-                    text += chunk
-                    if text.endswith("\n"):
-                        st.write(text)
-                        text = ""
+
+    text_model = load_text_to_text_model()
+    speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer()
+
+    # ~4 characters per token is considered a reasonable default.
+    max_characters = text_model.n_ctx() * 4
+    if len(clean_text) > max_characters:
+        st.warning(
+            f"Input text is too big ({len(clean_text)})."
+            f" Using only a subset of it ({max_characters})."
+        )
+        clean_text = clean_text[:max_characters]
+
+    st.divider()
+    st.header("Podcast generation")
+
+    system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)
+
+    if st.button("Generate Podcast"):
+        with st.spinner("Generating Podcast..."):
+            text = ""  # buffer of streamed chunks not yet rendered/spoken
+            for chunk in text_to_text_stream(
+                clean_text, text_model, system_prompt=system_prompt.strip()
+            ):
+                text += chunk
+                if text.endswith("\n") and "Speaker" in text:  # wait for a finished line that names a speaker
+                    st.write(text)
+                    speaker_id = re.search(r"Speaker (\d+)", text).group(1)  # NOTE(review): AttributeError if no digits follow "Speaker"
+                    with st.spinner("Generating Audio..."):
+                        speech = _speech_generation_parler(
+                            text.split(f'"Speaker {speaker_id}":')[-1],  # text after the last '"Speaker <id>":' key
+                            speech_model,
+                            speech_tokenizer,
+                            SPEAKER_DESCRIPTIONS[speaker_id],  # NOTE(review): KeyError for ids other than "1"/"2"
+                        )
+                    st.audio(speech, sample_rate=44_100)
+                    text = ""  # reset the buffer for the next line
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,8 @@ dependencies = [
   "huggingface-hub",
   "llama-cpp-python",
   "loguru",
+  "parler_tts @ git+https://github.com/daavoo/parler-tts.git",
+  "pydantic",
   "PyPDF2[crypto]",
   "python-docx",
   "streamlit",

diff --git a/src/opennotebookllm/inference/model_loaders.py b/src/opennotebookllm/inference/model_loaders.py
@@ -1,4 +1,8 @@
+from typing import Tuple
+
 from llama_cpp import Llama
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase
 
 
 def load_llama_cpp_model(
@@ -8,7 +12,7 @@ def load_llama_cpp_model(
     Loads the given model_id using Llama.from_pretrained.
 
     Examples:
-        >>> model = load_model(
+        >>> model = load_llama_cpp_model(
             "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")
 
     Args:
@@ -26,3 +30,26 @@ def load_llama_cpp_model(
         n_ctx=0,
     )
     return model
+
+
+def load_parler_tts_model_and_tokenizer(
+    model_id: str, device: str = "cpu"
+) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
+    """
+    Loads the ParlerTTS model and the tokenizer for the given model_id using from_pretrained.
+
+    Examples:
+        >>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+
+    Args:
+        model_id (str): The model id to load, passed unchanged to both `from_pretrained` calls
+            (for example "parler-tts/parler-tts-mini-v1").
+        device (str): The device to load the model on, such as "cuda:0" or "cpu".
+
+    Returns:
+        Tuple[PreTrainedModel, PreTrainedTokenizerBase]: The loaded model (moved to `device`) and its tokenizer.
+    """
+    model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    return model, tokenizer
diff --git a/src/opennotebookllm/inference/text_to_speech.py b/src/opennotebookllm/inference/text_to_speech.py
@@ -0,0 +1,44 @@
+import numpy as np
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+
+def _speech_generation_parler(
+    input_text: str,
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizerBase,
+    speaker_description: str,
+) -> np.ndarray:
+    """Synthesize input_text with ParlerTTS; speaker_description sets the voice profile."""
+    device = model.device  # token ids must sit on the model's device (e.g. "cuda:0")
+    input_ids = tokenizer(speaker_description, return_tensors="pt").input_ids.to(device)
+    prompt_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids)
+    return generation.cpu().numpy().squeeze()  # CPU numpy array, size-1 dims dropped
+
+
+def text_to_speech(
+    input_text: str,
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizerBase,
+    speaker_profile: str,
+) -> np.ndarray:
+    """
+    Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern.
+
+    Examples:
+        >>> waveform = text_to_speech("Welcome to our amazing podcast", model, tokenizer, "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.")
+
+    Args:
+        input_text (str): The text to convert to speech.
+        model (PreTrainedModel): The model used for generating the waveform. Only models whose `config.name_or_path` contains "parler" are handled; any other raises NotImplementedError.
+        tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model.
+        speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile.
+    Returns:
+        numpy array: The waveform of the speech (the ParlerTTS path squeezes out size-1 dimensions).
+    """
+    model_id = model.config.name_or_path
+    if "parler" in model_id:
+        return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)
+    else:
+        raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")
diff --git a/src/opennotebookllm/podcast_maker/__init__.py b/src/opennotebookllm/podcast_maker/__init__.py
diff --git a/src/opennotebookllm/podcast_maker/config.py b/src/opennotebookllm/podcast_maker/config.py
@@ -0,0 +1,20 @@
+from typing import Dict, Optional
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from pydantic import BaseModel, ConfigDict
+
+
+class SpeakerConfig(BaseModel):
+    # Per-speaker TTS settings; arbitrary_types_allowed admits the transformers classes below.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    model: PreTrainedModel
+    speaker_id: str
+    # ParlerTTS specific configuration
+    tokenizer: Optional[PreTrainedTokenizerBase] = None
+    # This description is used by the ParlerTTS model to configure the speaker profile
+    speaker_description: Optional[str] = None
+
+
+class PodcastConfig(BaseModel):
+    speakers: Dict[str, SpeakerConfig]  # speaker configurations keyed by a string id
+    sampling_rate: int = 44_100  # default sampling rate for the generated podcast audio
diff --git a/src/opennotebookllm/podcast_maker/script_to_audio.py b/src/opennotebookllm/podcast_maker/script_to_audio.py
@@ -0,0 +1,74 @@
+import numpy as np
+import soundfile as sf
+
+from opennotebookllm.inference.model_loaders import load_parler_tts_model_and_tokenizer
+from opennotebookllm.inference.text_to_speech import text_to_speech
+from opennotebookllm.podcast_maker.config import PodcastConfig, SpeakerConfig
+
+
+def parse_script_to_waveform(script: str, podcast_config: PodcastConfig) -> np.ndarray:
+    """
+    Given a script with speaker identifiers (such as "Speaker 1") parse it so that each speaker has its own unique
+    voice and concatenate all the voices in a sequence to form the complete podcast.
+    Args:
+        script: The podcast script; every turn is introduced by `"Speaker <id>":`.
+        podcast_config: Supplies the model, tokenizer and description for each speaker id.
+
+    Returns: A numpy array containing the whole podcast in waveform format.
+
+    """
+    parts = script.split("Speaker ")
+    podcast_waveform = []
+    for part in parts:
+        if ":" in part:
+            # Split on the first colon only, so colons inside the spoken text are kept.
+            speaker_id, speaker_text = part.replace('"', "").split(":", 1)
+            speaker = podcast_config.speakers[speaker_id]
+            speaker_waveform = text_to_speech(
+                speaker_text,
+                speaker.model,
+                speaker.tokenizer,
+                speaker.speaker_description,
+            )
+            podcast_waveform.append(speaker_waveform)
+
+    return np.concatenate(podcast_waveform)
+
+
+def save_waveform_as_file(
+    waveform: np.ndarray, sampling_rate: int, filename: str
+) -> None:
+    sf.write(filename, waveform, sampling_rate)  # thin wrapper: write the waveform to `filename` via soundfile
+
+
+if __name__ == "__main__":
+    # Smoke test: synthesize a short two-speaker script and write it to test_podcast.wav.
+    test_filename = "test_podcast.wav"
+    test_podcast_script = '{"Speaker 1": "Welcome to our podcast.", "Speaker 2": "It\'s great to be here!", "Speaker 1": "What do you want to talk about today?", "Speaker 2": "Wish I knew!"}'
+
+    model, tokenizer = load_parler_tts_model_and_tokenizer(
+        "parler-tts/parler-tts-mini-v1", "cpu"
+    )
+    # Both speakers share one model/tokenizer; only the voice description differs.
+    speaker_descriptions = {
+        "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
+        "2": "Jon's voice is calm with very clear audio and no background noise.",
+    }
+    demo_podcast_config = PodcastConfig(
+        speakers={
+            speaker_id: SpeakerConfig(
+                model=model,
+                speaker_id=speaker_id,
+                tokenizer=tokenizer,
+                speaker_description=description,
+            )
+            for speaker_id, description in speaker_descriptions.items()
+        }
+    )
+    waveform = parse_script_to_waveform(test_podcast_script, demo_podcast_config)
+
+    save_waveform_as_file(
+        waveform,
+        sampling_rate=demo_podcast_config.sampling_rate,
+        filename=test_filename,
+    )