Merge from 17-implement-cli-based-interaction
Kostis-S-Z committed Dec 12, 2024
2 parents 483d44d + 615dfbc · commit 5099bbd
Showing 25 changed files with 618 additions and 473 deletions.
1 change: 1 addition & 0 deletions .github/setup.sh
@@ -3,3 +3,4 @@ git clone https://github.com/descriptinc/audiotools
python -m pip install audiotools
python -m pip install -e .
rm -rf audiotools
python -m pip install --upgrade streamlit
13 changes: 10 additions & 3 deletions .github/workflows/tests.yaml
@@ -27,11 +27,18 @@ jobs:
python-version: '3.10'
cache: "pip"

- name: Install
- name: Install test dependencies
run: pip install -e '.[tests]'

- name: Install parler dependency
run: pip install -e '.[parler]'

- name: Run tests
run: pytest -v tests
- name: Run Unit Tests
run: pytest -v tests/unit

- name: Run Integration Tests
run: pytest -v tests/integration

- name: Run E2E tests
if: ${{ github.event_name == 'workflow_dispatch' }}
run: pytest -v tests/e2e
45 changes: 43 additions & 2 deletions CONTRIBUTING.md
@@ -1,7 +1,48 @@
# Contributing to mozilla.ai Blueprints

We welcome contributions of all kinds! Whether you're a seasoned developer or just starting out, your help is greatly appreciated.
Thank you for your interest in contributing to this repository! This project is part of the Blueprints initiative, which empowers developers to integrate AI capabilities into their projects using open-source tools and models.

# How to Contribute
We welcome all kinds of contributions, from improving customization to extending capabilities to fixing bugs. Your efforts help make Blueprints better and more impactful! Whether you’re an experienced developer or just starting out, your support is highly appreciated.

---

## **How to Contribute**

### **Browse Existing Issues** 🔍
- Check the [Issues](https://github.com/mozilla-ai/document-to-podcast/issues) page to see if there are any tasks you'd like to tackle.
- Look for issues labeled **`good first issue`** if you're new to the project—they're a great place to start.

### **Report Issues** 🐛

- Found a bug? Open a [Bug Report](https://github.com/mozilla-ai/document-to-podcast/issues/new?assignees=&labels=bug&projects=document-to-podcast&template=bug_report.yaml&title=%5BBUG%5D%3A+).
- Provide as much detail as possible, including the steps to reproduce the issue and expected vs. actual behavior.

### **Suggest Features** 🚀
- Have an idea for improving the Blueprint? Open a [Feature Request](https://github.com/mozilla-ai/document-to-podcast/issues/new?assignees=&labels=enhancement&projects=Document-to-podcast&template=feature_request.yaml&title=%5BFEATURE%5D%3A+).
- Share why the feature is important and any alternative solutions you’ve considered.

### **Submit Pull Requests** 💻
- Fork the repository and create a new branch for your changes.
- Ensure your branch is up-to-date with the main branch before submitting the PR.
- Please follow the PR template, adding as much detail as possible, including how to test the changes.

---

## **Contribution Ideas**

Looking for inspiration? Check out the [Future Features & Contributions page](https://mozilla-ai.github.io/document-to-podcast/future-features-contributions/) to explore meaningful ways you can enhance and extend this Blueprint.
Please also feel free to get involved in this repo's [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions) to share ideas, get feedback, or ask questions.

---

### **Guidelines for Contributions**

**Coding Standards**
- Follow PEP 8 for Python formatting.
- Use clear variable and function names and add comments to improve readability.

**Testing**
- Test changes locally and in GitHub Codespaces to ensure functionality.

**Documentation**
- Update docs for changes to functionality and maintain consistency with existing docs.
2 changes: 1 addition & 1 deletion README.md
@@ -13,7 +13,7 @@ It is designed to work on most local setups or with [GitHub Codespaces](https://
### 👉 📖 For more detailed guidance on using this project, please visit our [Docs here](https://mozilla-ai.github.io/document-to-podcast/).

### Built with
- Python 3.10+
- Python 3.10+ (use Python 3.12 for Apple M1/2/3 chips)
- [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e. script generation)
- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e. audio generation)
- [Streamlit](https://streamlit.io/) (UI demo)
225 changes: 102 additions & 123 deletions demo/app.py
@@ -1,58 +1,21 @@
import re
from pathlib import Path

import numpy as np
import soundfile as sf
import streamlit as st
from llama_cpp import Llama

from document_to_podcast.podcast_maker.config import SpeakerConfig, PodcastConfig
from outetts import InterfaceGGUF
from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
from document_to_podcast.inference.model_loaders import (
load_llama_cpp_model,
load_outetts_model,
load_parler_tts_model_and_tokenizer,
)
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
from document_to_podcast.inference.text_to_speech import text_to_speech
from document_to_podcast.inference.text_to_text import text_to_text_stream


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Speaker 1: Laura, the main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
Speaker 2: Jon, the co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like “hmm” or “umm.”
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

# For a list of speakers supported: https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers
SPEAKER_DESCRIPTIONS_OUTE = {
"1": "female_1",
"2": "male_1",
}
# For a list of speakers supported: https://github.com/huggingface/parler-tts?tab=readme-ov-file#-using-a-specific-speaker
SPEAKER_DESCRIPTIONS_PARLER = {
"1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
"2": "Jon's voice is calm with very clear audio and no background noise.",
}

TTS_MODELS = [
"OuteTTS-0.1-350M",
"OuteTTS-0.2-500M",
"parler-tts-large-v1",
"parler-tts-mini-v1",
"parler-tts-mini-expresso",
]


@st.cache_resource
def load_text_to_text_model() -> Llama:
return load_llama_cpp_model(
@@ -61,36 +61,23 @@ def load_text_to_text_model() -> Llama:


@st.cache_resource
def load_text_to_speech_model(model_id: str) -> PodcastConfig:
if "oute" in model_id.lower():
model = load_outetts_model(f"OuteAI/{model_id}-GGUF/{model_id}-FP16.gguf")
tokenizer = None
speaker_descriptions = SPEAKER_DESCRIPTIONS_OUTE
sampling_rate = model.audio_codec.sr
else:
model, tokenizer = load_parler_tts_model_and_tokenizer(
f"parler-tts/{model_id}", "cpu"
)
speaker_descriptions = SPEAKER_DESCRIPTIONS_PARLER
sampling_rate = model.config.sampling_rate

speaker_1 = SpeakerConfig(
model=model,
speaker_id="1",
tokenizer=tokenizer,
speaker_profile=speaker_descriptions["1"],
)
speaker_2 = SpeakerConfig(
model=model,
speaker_id="2",
tokenizer=tokenizer,
speaker_profile=speaker_descriptions["2"],
)
def load_text_to_speech_model() -> InterfaceGGUF:
return load_outetts_model("OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf")

return PodcastConfig(
speakers={s.speaker_id: s for s in [speaker_1, speaker_2]},
sampling_rate=sampling_rate,
)

script = "script"
audio = "audio"
gen_button = "generate podcast button"
if script not in st.session_state:
st.session_state[script] = ""
if audio not in st.session_state:
st.session_state.audio = []
if gen_button not in st.session_state:
st.session_state[gen_button] = False


def gen_button_clicked():
st.session_state[gen_button] = True


st.title("Document To Podcast")
@@ -131,64 +131,93 @@ def load_text_to_speech_model(model_id: str) -> PodcastConfig:
)

st.divider()
text_model = load_text_to_text_model()
st.header("Downloading and Loading models")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
)
st.divider()

model_name = st.selectbox(
label="Select Text-to-Speech Model", options=TTS_MODELS, index=None
st.markdown(
"For this demo, we are using the following models: \n"
"- [OLMoE-1B-7B-0924-Instruct-GGUF](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
"- [OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.1-350M-GGUF)"
)
st.markdown(
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
" for more information on how to use different models."
)

if model_name:
st.header("Downloading and Loading models")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
text_model = load_text_to_text_model()
speech_model = load_text_to_speech_model()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
st.divider()
clean_text = clean_text[:max_characters]

st.markdown(
"For this demo, we are using [OLMoE-1B-7B-0924-Instruct-GGUF](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF) for text-to-text.\n"
)
st.markdown(
"You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
" for more information on how to use different models."
st.divider()
st.header("Podcast generation")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
)
st.divider()

st.subheader("Speaker configuration")
for s in DEFAULT_SPEAKERS:
s.pop("id", None)
speakers = st.data_editor(DEFAULT_SPEAKERS, num_rows="dynamic")

if st.button("Generate Podcast", on_click=gen_button_clicked):
for n, speaker in enumerate(speakers):
speaker["id"] = n + 1
system_prompt = DEFAULT_PROMPT.replace(
"{SPEAKERS}",
"\n".join(str(Speaker.model_validate(speaker)) for speaker in speakers),
)
tts_model = load_text_to_speech_model(model_name)

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.session_state.script += text
st.write(st.session_state.script)

speaker_id = re.search(r"Speaker (\d+)", text).group(1)
voice_profile = next(
speaker["voice_profile"]
for speaker in speakers
if speaker["id"] == int(speaker_id)
)
with st.spinner("Generating Audio..."):
speech = text_to_speech(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
voice_profile,
)
st.audio(speech, sample_rate=speech_model.audio_codec.sr)

st.session_state.audio.append(speech)
text = ""

if st.session_state[gen_button]:
if st.button("Save Podcast to audio file"):
st.session_state.audio = np.concatenate(st.session_state.audio)
sf.write(
"podcast.wav",
st.session_state.audio,
samplerate=speech_model.audio_codec.sr,
)
clean_text = clean_text[:max_characters]
st.markdown("Podcast saved to disk!")

st.divider()
st.header("Podcast generation")
st.markdown(
"[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
)
st.divider()

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.write(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
with st.spinner("Generating Audio..."):
speech = text_to_speech(
input_text=text.split(f'"Speaker {speaker_id}":')[-1],
model=tts_model.speakers[speaker_id].model,
tokenizer=tts_model.speakers[speaker_id].tokenizer,
speaker_profile=tts_model.speakers[
speaker_id
].speaker_profile,
)
st.audio(speech, sample_rate=tts_model.sampling_rate)
text = ""
if st.button("Save Podcast script to text file"):
with open("script.txt", "w") as f:
st.session_state.script += "}"
f.write(st.session_state.script)

st.markdown("Script saved to disk!")
4 changes: 0 additions & 4 deletions docs/api.md
@@ -7,7 +7,3 @@
::: document_to_podcast.inference.text_to_text

::: document_to_podcast.inference.text_to_speech

::: document_to_podcast.podcast_maker.script_to_audio

::: document_to_podcast.podcast_maker.config
30 changes: 30 additions & 0 deletions docs/cli.md
@@ -0,0 +1,30 @@
# Command Line Interface

Once you have [installed the blueprint](./getting-started.md), you can use it from the CLI.

You can either provide the path to a configuration file:

```bash
document-to-podcast --from_config "example_data/config.yaml"
```

Or provide values to the arguments directly:


```bash
document-to-podcast \
--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
--output_folder "example_data" \
--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
```
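
For reference, a configuration file mirrors these arguments. Here is a minimal sketch of what such a file could look like; the keys are inferred from the CLI flags above, not copied from `example_data/config.yaml`, so check `document_to_podcast.config.Config` (documented below) for the actual schema:

```yaml
# Hypothetical config.yaml; keys are assumed to mirror the CLI flags above.
input_file: "example_data/Mozilla-Trustworthy_AI.pdf"
output_folder: "example_data"
text_to_text_model: "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
```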

---

::: document_to_podcast.cli.document_to_podcast

---

::: document_to_podcast.config.Config
::: document_to_podcast.config.Speaker
::: document_to_podcast.config.DEFAULT_PROMPT
::: document_to_podcast.config.DEFAULT_SPEAKERS
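
As a quick orientation to the `Speaker` model, the demo app builds speakers from plain dicts and renders them into the `{SPEAKERS}` slot of `DEFAULT_PROMPT`. A minimal sketch of that pattern follows; the `name` field is an assumption, while `id`, `voice_profile`, and the prompt-assembly code appear in `demo/app.py`:

```python
# Sketch of how demo/app.py assembles the system prompt from speakers.
# The "name" field is an assumption; "id" and "voice_profile" come from the demo.
from document_to_podcast.config import DEFAULT_PROMPT, Speaker

speakers = [
    {"id": 1, "name": "Laura", "voice_profile": "female_1"},
    {"id": 2, "name": "Jon", "voice_profile": "male_1"},
]

# Each validated Speaker is stringified into the {SPEAKERS} placeholder.
system_prompt = DEFAULT_PROMPT.replace(
    "{SPEAKERS}",
    "\n".join(str(Speaker.model_validate(s)) for s in speakers),
)
```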
