mozilla-ai · Kostis-S-Z · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -27,9 +27,12 @@ jobs:
           python-version: '3.10'
           cache: "pip"
 
-      - name: Install
+      - name: Install test dependencies
         run: pip install -e '.[tests]'
 
+      - name: Install parler dependency
+        run: pip install -e '.[parler]'
+
       - name: Run Unit Tests
         run: pytest -v tests/unit
 

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ It is designed to work on most local setups or with [GitHub Codespaces](https://
 ### Built with
 - Python 3.10+ (use Python 3.12 for Apple M1/2/3 chips)
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 
@@ -57,6 +57,7 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
 
 ***NOTE***: The first time you run the demo app it might take a while to generate the script or the audio because it will download the models to the machine which are a few GBs in size.
 
+
 ## How it Works
 
 <img src="./images/document-to-podcast-diagram.png" width="1200" />
@@ -91,12 +92,33 @@ Once the Codespaces environment launches, inside the terminal, start the Streaml
   -	Each speaker is assigned a distinct voice.
 	- The final output is saved as an audio file in formats like MP3 or WAV.
 
+## Models
+
+The architecture of this codebase focuses on modularity and adaptability, meaning it shouldn't be too difficult to swap frameworks to use your own suite of models. We have selected fully open source models that are very memory efficient and can run on a laptop CPU with less than 10 GB of RAM.
+
+### text-to-text
+
+We are using the [llama.cpp](https://github.com/ggerganov/llama.cpp) library, which supports open source models optimized for local inference and minimal hardware requirements. Our default text-to-text model is the open source [OLMoE-1B-7B-0924-Instruct](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct) from [AllenAI](https://allenai.org/).
+
+For the complete list of models supported out-of-the-box, visit this [link](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#text-only).
+
+### text-to-speech
+
+We support models from the [OuteAI](https://github.com/edwko/OuteTTS) and [Parler_tts](https://github.com/huggingface/parler-tts) packages. For a complete list of models visit [Oute HF](https://huggingface.co/collections/OuteAI/outetts-6728aa71a53a076e4ba4817c) (only the GGUF versions) and [Parler HF](https://huggingface.co/collections/parler-tts/parler-tts-fully-open-source-high-quality-tts-66164ad285ba03e8ffde214c).
+
+**Important note:** In order to keep the package dependencies as lightweight as possible, only the Oute interface is installed by default. If you want to use the parler models, please also run:
+
+```bash
+pip install -e '.[parler]'
+```
+
+
 ## Pre-requisites
 
 - **System requirements**:
   - OS: Windows, macOS, or Linux
-  - Python 3.10 or higher
-  - Minimum RAM: 16 GB
+  - Python >=3.10, <3.12
+  - Minimum RAM: 10 GB
   - Disk space: 32 GB minimum
 
 - **Dependencies**:

diff --git a/demo/app.py b/demo/app.py
@@ -8,7 +8,7 @@
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_parler_tts_model_and_tokenizer,
+    load_outetts_model,
 )
 from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
 from document_to_podcast.inference.text_to_speech import text_to_speech
@@ -23,8 +23,8 @@ def load_text_to_text_model():
 
 
 @st.cache_resource
-def load_text_to_speech_model_and_tokenizer():
-    return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")
+def load_text_to_speech_model():
+    return load_outetts_model("OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf")
 
 
 script = "script"
@@ -89,15 +89,15 @@ def gen_button_clicked():
     st.markdown(
         "For this demo, we are using the following models: \n"
         "- [OLMoE-1B-7B-0924-Instruct-GGUF](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
-        "- [parler-tts-mini-v1](https://huggingface.co/parler-tts/parler-tts-mini-v1)"
+        "- [OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.1-350M-GGUF)"
     )
     st.markdown(
         "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
         " for more information on how to use different models."
     )
 
     text_model = load_text_to_text_model()
-    speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer()
+    speech_model = load_text_to_speech_model()
 
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
@@ -147,10 +147,10 @@ def gen_button_clicked():
                         speech = text_to_speech(
                             text.split(f'"Speaker {speaker_id}":')[-1],
                             speech_model,
-                            speech_tokenizer,
                             voice_profile,
                         )
-                    st.audio(speech, sample_rate=44100)
+                    st.audio(speech, sample_rate=speech_model.audio_codec.sr)
+
                     st.session_state.audio.append(speech)
                     text = ""
 
@@ -160,7 +160,7 @@ def gen_button_clicked():
             sf.write(
                 "podcast.wav",
                 st.session_state.audio,
-                samplerate=44100,
+                samplerate=speech_model.audio_codec.sr,
             )
             st.markdown("Podcast saved to disk!")
 

diff --git a/docs/customization.md b/docs/customization.md
@@ -54,8 +54,7 @@ Customizing the app:
 Example:
 
 ```python
-PODCAST_PROMPT = """
-SPEAKER_DESCRIPTIONS = {
+SPEAKER_DESCRIPTIONS_OUTE = {
     "1": "A cheerful and animated voice with a fast-paced delivery.",
     "2": "A calm and deep voice, speaking with authority and warmth."
 }
@@ -78,6 +77,7 @@ Example:
 def load_text_to_speech_model_and_tokenizer():
     return load_parler_tts_model_and_tokenizer(
         "parler-tts/parler-tts-mini-expresso", "cpu")
+```
 
 ## 💡 Other Customization Ideas
 

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -15,28 +15,32 @@ python -m streamlit run demo/app.py
 
 
 ### 💻  **Option 2: Local Installation**
+1. **Clone the Repository**
 
-1. **Clone the Repository**
-
-   Inside your terminal, run:
-
+Inside your terminal, run:
 ```bash
-git clone https://github.com/mozilla-ai/document-to-podcast.git
-cd document-to-podcast
+   git clone https://github.com/mozilla-ai/document-to-podcast.git
+   cd document-to-podcast
 ```
-
 2. **Install Dependencies**
 
    Inside your terminal, run:
 
 ```bash
 pip install -e .
 ```
-
 3. **Run the Demo**
 
    Inside your terminal, start the Streamlit demo by running:
 
 ```bash
 python -m streamlit run demo/app.py
 ```
+
+
+### [Optional]: Use Parler models for text-to-speech
+
+If you want to use the [parler tts](https://github.com/huggingface/parler-tts) models, you will need to **additionally** install an optional dependency by running:
+```bash
+pip install -e '.[parler]'
+```
diff --git a/docs/index.md b/docs/index.md
@@ -11,7 +11,7 @@ These docs are your companion to mastering the **Document-to-Podcast Blueprint**
 ### Built with
 - Python 3.10+
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 

diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md
@@ -56,11 +56,9 @@ In this step, the pre-processed text is transformed into a conversational podcas
 
  **1 - Model Loading**
 
-   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) script is responsible for loading GGUF-type models using the `llama_cpp` library.
+   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-text` models using the `llama_cpp` library.
 
-   - The function `load_llama_cpp_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model.
-
-   - This approach of using the `llama_cpp` library supports efficient CPU-based inference, making language models accessible even on machines without GPUs.
+   - The function `load_llama_cpp_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model. This approach of using the `llama_cpp` library supports efficient CPU-based inference, making language models accessible even on machines without GPUs.
 
  **2 - Text-to-Text Generation**
 
@@ -79,21 +77,29 @@ In this final step, the generated podcast transcript is brought to life as an au
 
 ### ⚙️ **Key Components in this Step**
 
-**1 - Text-to-Speech Audio Generation**
+ **1 - Model Loading**
+
+   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-speech` models using the `outetts` and `parler_tts` libraries.
+
+   - The function `load_outetts_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model, either on CPU or GPU, based on the `device` parameter. The `language` parameter lets you switch between the languages that the Oute package supports (as of Dec 2024: `en, zh, ja, ko`).
+
+   - The function `load_parler_tts_model_and_tokenizer` takes a model ID in the format `{org}/{repo}` (e.g. `parler-tts/parler-tts-mini-v1`) and loads the specified model and tokenizer, either on CPU or GPU, based on the `device` parameter.
+
+**2 - Text-to-Speech Audio Generation**
 
-   - The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model and tokenizer.
+   - The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model.
 
-   - A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker.
+   - A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Oute models require one of the IDs specified [here](https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers). Parler requires a natural language description of the speaker's voice, and you have to use a pre-defined name (see [here](https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md#speaker-consistency)).
 
-   - The function `text_to_speech` takes the input text (e.g podcast script) and speaker profile, generating a waveform (audio data) that represents the spoken version of the text.
+   - The function `text_to_speech` takes the input text (e.g. podcast script) and speaker profile, generating a waveform (audio data in a numpy array) that represents the spoken version of the text.
 
-**2 - Parsing and Combining Voices**
+**3 - Parsing and Combining Voices**
 
 - The [`script_to_audio.py`](api.md/#document_to_podcast.podcast_maker.script_to_audio) script ensures each speaker’s dialogue is spoken in their unique voice.
 
 - The function `parse_script_to_waveform` splits the dialogue script by speakers and uses `text_to_speech` to generate audio for each speaker, stitching them together into a full podcast.
 
-- Once the podcast waveform is ready, the save_waveform_as_file function saves it as an audio file (e.g., MP3 or WAV), making it ready for distribution.
+- Once the podcast waveform is ready, the `save_waveform_as_file` function saves it as an audio file (e.g., MP3 or WAV), making it ready for distribution.
 
 
 ## **Bringing It All Together in `app.py`**
@@ -123,7 +129,7 @@ This demo uses [Streamlit](https://streamlit.io/), an open-source Python framewo
 
 - The script uses `load_llama_cpp_model` from `model_loader.py` to load the LLM for generating the podcast script.
 
-- Similarly, `load_parler_tts_model_and_tokenizer` is used to prepare the TTS model and tokenizer for audio generation.
+- Similarly, `load_outetts_model` is used to prepare the TTS model for audio generation.
 
 - These models are cached using `@st.cache_resource` to ensure fast and efficient reuse during app interactions.
 

diff --git a/example_data/config.yaml b/example_data/config.yaml
@@ -1,7 +1,7 @@
 input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
 output_folder: "example_data/"
 text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
-text_to_speech_model: "parler-tts/parler-tts-mini-v1"
+text_to_speech_model: "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
 text_to_text_prompt: |
   You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. 
   The script features the following speakers:
@@ -22,8 +22,9 @@ speakers:
   - id: 1
     name: Laura
     description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
-    voice_profile: Laura's voice is exciting and fast in delivery with very clear audio and no background noise.
+    voice_profile: female_1
   - id: 2
     name: Jon
     description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
-    voice_profile: Jon's voice is calm with very clear audio and no background noise.
+    voice_profile: male_1
+device: "cpu"
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
   "huggingface-hub",
   "llama-cpp-python",
   "loguru",
-  "parler_tts",
+  "outetts",
   "pydantic",
   "PyPDF2[crypto]",
   "python-docx",
@@ -33,6 +33,10 @@ tests = [
   "pytest-mock>=3.14.0"
 ]
 
+parler = [
+  "parler_tts @ git+https://github.com/huggingface/parler-tts.git",
+]
+
 [project.urls]
 Documentation = "https://mozilla-ai.github.io/document-to-podcast/"
 Issues = "https://github.com/mozilla-ai/document-to-podcast/issues"

diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py
@@ -7,16 +7,10 @@
 from fire import Fire
 from loguru import logger
 
-
-from document_to_podcast.config import (
-    Config,
-    Speaker,
-    DEFAULT_PROMPT,
-    DEFAULT_SPEAKERS,
-    SUPPORTED_TTS_MODELS,
-)
+from document_to_podcast.config import Config, Speaker, DEFAULT_PROMPT, DEFAULT_SPEAKERS
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
+    load_outetts_model,
     load_parler_tts_model_and_tokenizer,
 )
 from document_to_podcast.inference.text_to_text import text_to_text_stream
@@ -30,7 +24,7 @@ def document_to_podcast(
     output_folder: str | None = None,
     text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
     text_to_text_prompt: str = DEFAULT_PROMPT,
-    text_to_speech_model: SUPPORTED_TTS_MODELS = "parler-tts/parler-tts-mini-v1",
+    text_to_speech_model: str = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
     speakers: list[Speaker] | None = None,
     from_config: str | None = None,
 ):
@@ -65,7 +59,7 @@ def document_to_podcast(
             Defaults to DEFAULT_PROMPT.
 
         text_to_speech_model (str, optional): The path to the text-to-speech model.
-            Defaults to `parler-tts/parler-tts-mini-v1`.
+            Defaults to `OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf`.
 
         speakers (list[Speaker] | None, optional): The speakers for the podcast.
             Defaults to DEFAULT_SPEAKERS.
@@ -103,10 +97,17 @@ def document_to_podcast(
 
     logger.info(f"Loading {config.text_to_text_model}")
     text_model = load_llama_cpp_model(model_id=config.text_to_text_model)
-    logger.info(f"Loading {config.text_to_speech_model}")
-    speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
-        model_id=config.text_to_speech_model
-    )
+
+    logger.info(f"Loading {config.text_to_speech_model} on {config.device}")
+    if "oute" in config.text_to_speech_model.lower():
+        speech_model = load_outetts_model(model_id=config.text_to_speech_model)
+        speech_tokenizer = None
+        sample_rate = speech_model.audio_codec.sr
+    else:
+        speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
+            model_id=config.text_to_speech_model
+        )
+        sample_rate = speech_model.config.sampling_rate
 
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
@@ -134,15 +135,13 @@ def document_to_podcast(
             logger.debug(text)
             speaker_id = re.search(r"Speaker (\d+)", text).group(1)
             voice_profile = next(
-                speaker.voice_profile
-                for speaker in config.speakers
-                if speaker.id == int(speaker_id)
-            )
+                speaker for speaker in config.speakers if speaker.id == int(speaker_id)
+            ).voice_profile
             speech = text_to_speech(
                 text.split(f'"Speaker {speaker_id}":')[-1],
                 speech_model,
-                speech_tokenizer,
                 voice_profile,
+                tokenizer=speech_tokenizer,  # Applicable only for parler models
             )
             podcast_audio.append(speech)
             text = ""
@@ -151,7 +150,7 @@ def document_to_podcast(
     sf.write(
         str(output_folder / "podcast.wav"),
         np.concatenate(podcast_audio),
-        samplerate=44100,
+        samplerate=sample_rate,
     )
     (output_folder / "podcast.txt").write_text(podcast_script)
     logger.success("Done!")