diff --git a/.github/setup.sh b/.github/setup.sh
index 97aacba..b51d025 100644
--- a/.github/setup.sh
+++ b/.github/setup.sh
@@ -3,3 +3,4 @@ git clone https://github.com/descriptinc/audiotools
 python -m pip install audiotools
 python -m pip install -e .
 rm -rf audiotools
+python -m pip install --upgrade streamlit
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 069977c..4714424 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -27,11 +27,18 @@ jobs:
           python-version: '3.10'
           cache: "pip"

-      - name: Install
+      - name: Install test dependencies
         run: pip install -e '.[tests]'

       - name: Install parler dependency
         run: pip install -e '.[parler]'

-      - name: Run tests
-        run: pytest -v tests
+      - name: Run Unit Tests
+        run: pytest -v tests/unit
+
+      - name: Run Integration Tests
+        run: pytest -v tests/integration
+
+      - name: Run E2E Tests
+        if: ${{ github.event_name == 'workflow_dispatch' }}
+        run: pytest -v tests/e2e
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2260e05..3bfdbc1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,7 +1,48 @@
 # Contributing to mozilla.ai Blueprints

-We welcome contributions of all kinds! Whether you're a seasoned developer or just starting out, your help is greatly appreciated.
+Thank you for your interest in contributing to this repository! This project is part of the Blueprints initiative, which empowers developers to integrate AI capabilities into their projects using open-source tools and models.

-# How to Contribute
+We welcome all kinds of contributions, from improving customization, to extending capabilities, to fixing bugs. Your efforts help make Blueprints better and more impactful! Whether you're an experienced developer or just starting out, your support is highly appreciated.

 ---
+
+## **How to Contribute**
+
+### **Browse Existing Issues** 🔍
+- Check the [Issues](https://github.com/mozilla-ai/document-to-podcast/issues) page to see if there are any tasks you'd like to tackle.
+- Look for issues labeled **`good first issue`** if you're new to the project—they're a great place to start.
+
+### **Report Issues** 🐛
+
+- Found a bug? Open a [Bug Report](https://github.com/mozilla-ai/document-to-podcast/issues/new?assignees=&labels=bug&projects=document-to-podcast&template=bug_report.yaml&title=%5BBUG%5D%3A+).
+- Provide as much detail as possible, including the steps to reproduce the issue and the expected vs. actual behavior.
+
+### **Suggest Features** 🚀
+- Have an idea for improving the Blueprint? Open a [Feature Request](https://github.com/mozilla-ai/document-to-podcast/issues/new?assignees=&labels=enhancement&projects=Document-to-podcast&template=feature_request.yaml&title=%5BFEATURE%5D%3A+).
+- Share why the feature is important and any alternative solutions you've considered.
+
+### **Submit Pull Requests** 💻
+- Fork the repository and create a new branch for your changes.
+- Ensure your branch is up to date with the main branch before submitting the PR.
+- Please follow the PR template, adding as much detail as possible, including how to test the changes.
+
+---
+
+## **Contribution Ideas**
+
+Looking for inspiration? Check out the [Future Features & Contributions page](https://mozilla-ai.github.io/document-to-podcast/future-features-contributions/) to explore meaningful ways you can enhance and extend this Blueprint.
+Please also feel free to get involved in this repo's [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions) to engage in ideas, get feedback, or ask questions.
+
+---
+
+### **Guidelines for Contributions**
+
+**Coding Standards**
+- Follow PEP 8 for Python formatting.
+- Use clear variable and function names and add comments to improve readability.
+
+**Testing**
+- Test changes locally and in GitHub Codespaces to ensure functionality.
+
+**Documentation**
+- Update docs for changes to functionality and maintain consistency with existing docs.
diff --git a/README.md b/README.md
index 68d355a..27134ce 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ It is designed to work on most local setups or with [GitHub Codespaces](https://
 ### 👉 📖 For more detailed guidance on using this project, please visit our [Docs here](https://mozilla-ai.github.io/document-to-podcast/).

 ### Built with
-- Python 3.10+
+- Python 3.10+ (use Python 3.12 for Apple M1/2/3 chips)
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
 - [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
diff --git a/demo/app.py b/demo/app.py
index c1a9ab7..68ae51f 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -1,58 +1,21 @@
 import re
 from pathlib import Path

+import numpy as np
+import soundfile as sf
 import streamlit as st
 from llama_cpp import Llama
-
-from document_to_podcast.podcast_maker.config import SpeakerConfig, PodcastConfig
+from outetts import InterfaceGGUF

 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
     load_outetts_model,
-    load_parler_tts_model_and_tokenizer,
 )
+from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
 from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.inference.text_to_text import text_to_text_stream

-PODCAST_PROMPT = """
-You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
-Speaker 1: Laura, the main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
-Speaker 2: Jon, the co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like "hmm" or "umm."
-Instructions:
-- Write dynamic, easy-to-follow dialogue.
-- Include natural interruptions and interjections.
-- Avoid repetitive phrasing between speakers.
-- Format output as a JSON conversation.
-Example:
-{
-    "Speaker 1": "Welcome to our podcast! Today, we're exploring...",
-    "Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
-    "Speaker 1": "Sure! Imagine it like this...",
-    "Speaker 2": "Oh, that's cool! But how does..."
-}
-"""
-
-# For a list of speakers supported: https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers
-SPEAKER_DESCRIPTIONS_OUTE = {
-    "1": "female_1",
-    "2": "male_1",
-}
-# For a list of speakers supported: https://github.com/huggingface/parler-tts?tab=readme-ov-file#-using-a-specific-speaker
-SPEAKER_DESCRIPTIONS_PARLER = {
-    "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
-    "2": "Jon's voice is calm with very clear audio and no background noise.",
-}
-
-TTS_MODELS = [
-    "OuteTTS-0.1-350M",
-    "OuteTTS-0.2-500M",
-    "parler-tts-large-v1",
-    "parler-tts-mini-v1",
-    "parler-tts-mini-expresso",
-]
-
-
 @st.cache_resource
 def load_text_to_text_model() -> Llama:
     return load_llama_cpp_model(
@@ -61,36 +24,23 @@ def load_text_to_text_model() -> Llama:


 @st.cache_resource
-def load_text_to_speech_model(model_id: str) -> PodcastConfig:
-    if "oute" in model_id.lower():
-        model = load_outetts_model(f"OuteAI/{model_id}-GGUF/{model_id}-FP16.gguf")
-        tokenizer = None
-        speaker_descriptions = SPEAKER_DESCRIPTIONS_OUTE
-        sampling_rate = model.audio_codec.sr
-    else:
-        model, tokenizer = load_parler_tts_model_and_tokenizer(
-            f"parler-tts/{model_id}", "cpu"
-        )
-        speaker_descriptions = SPEAKER_DESCRIPTIONS_PARLER
-        sampling_rate = model.config.sampling_rate
-
-    speaker_1 = SpeakerConfig(
-        model=model,
-        speaker_id="1",
-        tokenizer=tokenizer,
-        speaker_profile=speaker_descriptions["1"],
-    )
-    speaker_2 = SpeakerConfig(
-        model=model,
-        speaker_id="2",
-        tokenizer=tokenizer,
-        speaker_profile=speaker_descriptions["2"],
-    )
+def load_text_to_speech_model() -> InterfaceGGUF:
+    return load_outetts_model("OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf")

-    return PodcastConfig(
-        speakers={s.speaker_id: s for s in [speaker_1, speaker_2]},
-        sampling_rate=sampling_rate,
-    )
+
+script = "script"
+audio = "audio"
+gen_button = "generate podcast button"
+if script not in st.session_state:
+    st.session_state[script] = ""
+if audio not in st.session_state:
+    st.session_state.audio = []
+if gen_button not in st.session_state:
+    st.session_state[gen_button] = False
+
+
+def gen_button_clicked():
+    st.session_state[gen_button] = True


 st.title("Document To Podcast")

@@ -131,64 +81,93 @@ def load_text_to_speech_model(model_id: str) -> PodcastConfig:
     )
     st.divider()

-    text_model = load_text_to_text_model()
+    st.header("Downloading and Loading models")
+    st.markdown(
+        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
+    )
+    st.divider()

-    model_name = st.selectbox(
-        label="Select Text-to-Speech Model", options=TTS_MODELS, index=None
+    st.markdown(
+        "For this demo, we are using the following models: \n"
+        "- [OLMoE-1B-7B-0924-Instruct-GGUF](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF)\n"
+        "- [OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf](https://huggingface.co/OuteAI/OuteTTS-0.1-350M-GGUF)"
+    )
+    st.markdown(
+        "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
+        " for more information on how to use different models."
     )

-    if model_name:
-        st.header("Downloading and Loading models")
-        st.markdown(
-            "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-2-podcast-script-generation)"
+    text_model = load_text_to_text_model()
+    speech_model = load_text_to_speech_model()
+
+    # ~4 characters per token is considered a reasonable default.
+    max_characters = text_model.n_ctx() * 4
+    if len(clean_text) > max_characters:
+        st.warning(
+            f"Input text is too big ({len(clean_text)})."
+            f" Using only a subset of it ({max_characters})."
         )
-        st.divider()
+        clean_text = clean_text[:max_characters]

-        st.markdown(
-            "For this demo, we are using [OLMoE-1B-7B-0924-Instruct-GGUF](https://huggingface.co/allenai/OLMoE-1B-7B-0924-Instruct-GGUF) for text-to-text.\n"
-        )
-        st.markdown(
-            "You can check the [Customization Guide](https://mozilla-ai.github.io/document-to-podcast/customization/)"
-            " for more information on how to use different models."
+    st.divider()
+    st.header("Podcast generation")
+    st.markdown(
+        "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
+    )
+    st.divider()
+
+    st.subheader("Speaker configuration")
+    for s in DEFAULT_SPEAKERS:
+        s.pop("id", None)
+    speakers = st.data_editor(DEFAULT_SPEAKERS, num_rows="dynamic")
+
+    if st.button("Generate Podcast", on_click=gen_button_clicked):
+        for n, speaker in enumerate(speakers):
+            speaker["id"] = n + 1
+        system_prompt = DEFAULT_PROMPT.replace(
+            "{SPEAKERS}",
+            "\n".join(str(Speaker.model_validate(speaker)) for speaker in speakers),
         )
-        tts_model = load_text_to_speech_model(model_name)
-
-        # ~4 characters per token is considered a reasonable default.
-        max_characters = text_model.n_ctx() * 4
-        if len(clean_text) > max_characters:
-            st.warning(
-                f"Input text is too big ({len(clean_text)})."
-                f" Using only a subset of it ({max_characters})."
+        with st.spinner("Generating Podcast..."):
+            text = ""
+            for chunk in text_to_text_stream(
+                clean_text, text_model, system_prompt=system_prompt.strip()
+            ):
+                text += chunk
+                if text.endswith("\n") and "Speaker" in text:
+                    st.session_state.script += text
+                    st.write(st.session_state.script)
+
+                    speaker_id = re.search(r"Speaker (\d+)", text).group(1)
+                    voice_profile = next(
+                        speaker["voice_profile"]
+                        for speaker in speakers
+                        if speaker["id"] == int(speaker_id)
+                    )
+                    with st.spinner("Generating Audio..."):
+                        speech = text_to_speech(
+                            text.split(f'"Speaker {speaker_id}":')[-1],
+                            speech_model,
+                            voice_profile,
+                        )
+                    st.audio(speech, sample_rate=speech_model.audio_codec.sr)
+
+                    st.session_state.audio.append(speech)
+                    text = ""
+
+    if st.session_state[gen_button]:
+        if st.button("Save Podcast to audio file"):
+            st.session_state.audio = np.concatenate(st.session_state.audio)
+            sf.write(
+                "podcast.wav",
+                st.session_state.audio,
+                samplerate=speech_model.audio_codec.sr,
             )
-            clean_text = clean_text[:max_characters]
+            st.markdown("Podcast saved to disk!")

-        st.divider()
-        st.header("Podcast generation")
-        st.markdown(
-            "[Docs for this Step](https://mozilla-ai.github.io/document-to-podcast/step-by-step-guide/#step-3-audio-podcast-generation)"
-        )
-        st.divider()
-
-        system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)
-
-        if st.button("Generate Podcast"):
-            with st.spinner("Generating Podcast..."):
-                text = ""
-                for chunk in text_to_text_stream(
-                    clean_text, text_model, system_prompt=system_prompt.strip()
-                ):
-                    text += chunk
-                    if text.endswith("\n") and "Speaker" in text:
-                        st.write(text)
-                        speaker_id = re.search(r"Speaker (\d+)", text).group(1)
-                        with st.spinner("Generating Audio..."):
-                            speech = text_to_speech(
-                                input_text=text.split(f'"Speaker {speaker_id}":')[-1],
-                                model=tts_model.speakers[speaker_id].model,
-                                tokenizer=tts_model.speakers[speaker_id].tokenizer,
-                                speaker_profile=tts_model.speakers[
-                                    speaker_id
-                                ].speaker_profile,
-                            )
-                        st.audio(speech, sample_rate=tts_model.sampling_rate)
-                        text = ""
+    if st.button("Save Podcast script to text file"):
+        with open("script.txt", "w") as f:
+            st.session_state.script += "}"
+            f.write(st.session_state.script)
+
+        st.markdown("Script saved to disk!")
diff --git a/docs/api.md b/docs/api.md
index c781ee2..2995f23 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -7,7 +7,3 @@
 ::: document_to_podcast.inference.text_to_text

 ::: document_to_podcast.inference.text_to_speech
-
-::: document_to_podcast.podcast_maker.script_to_audio
-
-::: document_to_podcast.podcast_maker.config
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 0000000..cc2eef0
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,30 @@
+# Command Line Interface
+
+Once you have [installed the blueprint](./getting-started.md), you can use it from the CLI.
+
+You can either provide the path to a configuration file:
+
+```bash
+document-to-podcast --from_config "example_data/config.yaml"
+```
+
+Or provide values to the arguments directly:
+
+
+```bash
+document-to-podcast \
+--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
+--output_folder "example_data" \
+--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
+```
+
+---
+
+::: document_to_podcast.cli.document_to_podcast
+
+---
+
+::: document_to_podcast.config.Config
+::: document_to_podcast.config.Speaker
+::: document_to_podcast.config.DEFAULT_PROMPT
+::: document_to_podcast.config.DEFAULT_SPEAKERS
diff --git a/docs/future-features-contributions.md b/docs/future-features-contributions.md
index e91d951..db5d446 100644
--- a/docs/future-features-contributions.md
+++ b/docs/future-features-contributions.md
@@ -3,28 +3,31 @@
 The Document-to-Podcast Blueprint is an evolving project designed to grow with the help of the open-source community. Whether you're an experienced developer or just starting, there are many ways you can contribute and help shape the future of this tool.

 ---
-## 🛠️ **This Page is Evolving**
-As the community grows, we'll use this space to highlight contributions, showcase new ideas, and share guidance on expanding the Blueprint ecosystem.
-We have some ideas of how this Blueprint can be extended and improved, will be sharing these ideas and request for contributions shortly.
+## 🌟 **How You Can Contribute**

 ---
+
+### 🛠️ **Enhance the Blueprint**
+- Check the [Issues](https://github.com/mozilla-ai/document-to-podcast/issues) page to see if there are feature requests you'd like to implement.
+- Refer to our [Contribution Guide](https://github.com/mozilla-ai/document-to-podcast/blob/main/CONTRIBUTING.md) for more details on contributing.

-## 🌟 **How You Can Contribute**
+### 🎨 **Extensibility Ideas**
+
+This Blueprint is designed to be a foundation you can build upon. By extending its capabilities, you can open the door to new applications, improve user experience, and adapt the Blueprint to address other use cases. Here are a few ideas for how you can expand its potential:

-### 💡 **Share Your Ideas**
-Got a vision for how this Blueprint could be improved? Share your suggestions through [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions). Your insights can help inspire new directions for the project.
-### 🛠️ **Enhance the Code**
-Dive into the codebase and contribute enhancements, optimizations, or bug fixes. Whether it's a small tweak or a big feature, every contribution helps! Start by checking our Contribution Guide (coming soon).
+- **Multi-language podcast generation:** Add support for multi-language podcast generation to expand the reach of this Blueprint.
+- **New input modalities:** Add support for handling different input modalities, such as audio or images, enabling more flexibility in podcast generation.
+- **Improved audio quality:** Explore and integrate more advanced open-source TTS frameworks to enhance the quality of generated audio, making podcasts sound more natural.
+We'd love to see how you can enhance this Blueprint! If you create improvements or extend its capabilities, consider contributing them back to the project so others in the community can benefit from your work. Check out our [Contribution Guide](https://github.com/mozilla-ai/document-to-podcast/blob/main/CONTRIBUTING.md) to get started!
+
+### 💡 **Share Your Ideas**
+Got an idea for how this Blueprint could be improved? You can share your suggestions through [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions).
+
 ### 🌍 **Build New Blueprints**
-This project is part of a larger initiative to create a collection of reusable starter code solutions that use open-source AI tools. If you're inspired to create your own Blueprint, we'd love to see it!
+This project is part of a larger initiative to create a collection of reusable starter code solutions that use open-source AI tools. If you're inspired to create your own Blueprint, you can use the [Blueprint-template](https://github.com/new?template_name=Blueprint-template&template_owner=mozilla-ai) to get started.

 ---
-## 🤝 **Get Involved**
-- Visit our [GitHub Discussions](https://github.com/mozilla-ai/document-to-podcast/discussions) to explore ongoing conversations and share your thoughts.
-
-Your contributions help make this Blueprint better for everyone. Thank you for being part of the journey! 🎉
+Your contributions help make this Blueprint better for everyone. 🎉
diff --git a/example_data/config.yaml b/example_data/config.yaml
new file mode 100644
index 0000000..8c62996
--- /dev/null
+++ b/example_data/config.yaml
@@ -0,0 +1,29 @@
+input_file: "example_data/introducing-mozilla-ai-investing-in-trustworthy-ai.html"
+output_folder: "example_data/"
+text_to_text_model: "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
+text_to_speech_model: "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf"
+text_to_text_prompt: |
+  You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
+  The script features the following speakers:
+  {SPEAKERS}
+  Instructions:
+  - Write dynamic, easy-to-follow dialogue.
+  - Include natural interruptions and interjections.
+  - Avoid repetitive phrasing between speakers.
+  - Format output as a JSON conversation.
+  Example:
+  {
+    "Speaker 1": "Welcome to our podcast! Today, we're exploring...",
+    "Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
+    "Speaker 1": "Sure! Imagine it like this...",
+    "Speaker 2": "Oh, that's cool! But how does..."
+  }
+speakers:
+  - id: 1
+    name: Laura
+    description: The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.
+    voice_profile: female_1
+  - id: 2
+    name: Jon
+    description: The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.
+    voice_profile: male_1
diff --git a/mkdocs.yml b/mkdocs.yml
index 006d1b0..f1d5068 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -7,6 +7,7 @@ nav:
   - Getting Started: getting-started.md
   - Step-by-Step Guide: step-by-step-guide.md
   - Customization Guide: customization.md
+  - Command Line Interface: cli.md
  - API Reference: api.md
   - Future Features & Contributions: future-features-contributions.md
diff --git a/pyproject.toml b/pyproject.toml
index cce6a39..91c8ba9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "document-to-podcast"
 readme = "README.md"
 license = {text = "Apache-2.0"}
-requires-python = ">=3.10"
+requires-python = ">=3.10,<3.13"
 dynamic = ["version"]
 dependencies = [
     "beautifulsoup4",
@@ -30,10 +30,11 @@ docs = [
 tests = [
     "pytest>=8,<9",
     "pytest-sugar>=0.9.6",
+    "pytest-mock>=3.14.0"
 ]

 parler = [
-    "parler_tts @ git+https://github.com/daavoo/parler-tts.git",
+    "parler_tts @ git+https://github.com/huggingface/parler-tts.git",
 ]

 [project.urls]
@@ -47,3 +48,6 @@ where = ["src"]
 namespaces = false

 [tool.setuptools_scm]
+
+[project.scripts]
+document-to-podcast = "document_to_podcast.cli:main"
diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py
new file mode 100644
index 0000000..bcc1dee
--- /dev/null
+++ b/src/document_to_podcast/cli.py
@@ -0,0 +1,150 @@
+import re
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import yaml
+from fire import Fire
+from loguru import logger
+
+
+from document_to_podcast.config import Config, Speaker, DEFAULT_PROMPT, DEFAULT_SPEAKERS
+from document_to_podcast.inference.model_loaders import (
+    load_llama_cpp_model,
+    load_outetts_model,
+)
+from document_to_podcast.inference.text_to_text import text_to_text_stream
+from document_to_podcast.inference.text_to_speech import text_to_speech
+from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS
+
+
+@logger.catch(reraise=True)
+def document_to_podcast(
+    input_file: str | None = None,
+    output_folder: str | None = None,
+    text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
+    text_to_text_prompt: str = DEFAULT_PROMPT,
+    text_to_speech_model: str = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
+    speakers: list[Speaker] | None = None,
+    from_config: str | None = None,
+):
+    """
+    Generate a podcast from a document.
+
+    Args:
+        input_file (str): The path to the input file.
+            Supported extensions:
+
+            - .pdf
+            - .html
+            - .txt
+            - .docx
+            - .md
+
+        output_folder (str): The path to the output folder.
+            Two files will be created:
+
+            - {output_folder}/podcast.txt
+            - {output_folder}/podcast.wav
+
+        text_to_text_model (str, optional): The path to the text-to-text model.
+
+            Must be formatted as `owner/repo/file`.
+
+            Must be a gguf file.
+
+            Defaults to `allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf`.
+
+        text_to_text_prompt (str, optional): The prompt for the text-to-text model.
+            Defaults to DEFAULT_PROMPT.
+
+        text_to_speech_model (str, optional): The path to the text-to-speech model.
+            Defaults to `OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf`.
+
+        speakers (list[Speaker] | None, optional): The speakers for the podcast.
+            Defaults to DEFAULT_SPEAKERS.
+
+        from_config (str, optional): The path to the config file. Defaults to None.
+
+            If provided, all other arguments will be ignored.
+ """ + if from_config: + config = Config.model_validate(yaml.safe_load(Path(from_config).read_text())) + else: + speakers = speakers or DEFAULT_SPEAKERS + config = Config( + input_file=input_file, + output_folder=output_folder, + text_to_text_model=text_to_text_model, + text_to_text_prompt=text_to_text_prompt, + text_to_speech_model=text_to_speech_model, + speakers=[Speaker.model_validate(speaker) for speaker in speakers], + ) + + output_folder = Path(config.output_folder) + output_folder.mkdir(parents=True, exist_ok=True) + + data_loader = DATA_LOADERS[Path(config.input_file).suffix] + logger.info(f"Loading {config.input_file}") + raw_text = data_loader(config.input_file) + logger.debug(f"Loaded {len(raw_text)} characters") + + data_cleaner = DATA_CLEANERS[Path(config.input_file).suffix] + logger.info(f"Cleaning {config.input_file}") + clean_text = data_cleaner(raw_text) + logger.debug(f"Cleaned {len(raw_text) - len(clean_text)} characters") + logger.debug(f"Length of cleaned text: {len(clean_text)}") + + logger.info(f"Loading {config.text_to_text_model}") + text_model = load_llama_cpp_model(model_id=config.text_to_text_model) + logger.info(f"Loading {config.text_to_speech_model}") + speech_model = load_outetts_model(model_id=config.text_to_speech_model) + + # ~4 characters per token is considered a reasonable default. + max_characters = text_model.n_ctx() * 4 + if len(clean_text) > max_characters: + logger.warning( + f"Input text is too big ({len(clean_text)})." + f" Using only a subset of it ({max_characters})." + ) + clean_text = clean_text[:max_characters] + + logger.info("Generating Podcast...") + podcast_script = "" + text = "" + podcast_audio = [] + system_prompt = config.text_to_text_prompt.strip() + system_prompt = system_prompt.replace( + "{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers) + ) + for chunk in text_to_text_stream( + clean_text, text_model, system_prompt=system_prompt + ): + text += chunk + podcast_script += chunk + if text.endswith("\n") and "Speaker" in text: + logger.debug(text) + speaker_id = re.search(r"Speaker (\d+)", text).group(1) + voice_profile = next( + speaker for speaker in config.speakers if speaker.id == int(speaker_id) + ).voice_profile + speech = text_to_speech( + text.split(f'"Speaker {speaker_id}":')[-1], + speech_model, + voice_profile, + ) + podcast_audio.append(speech) + text = "" + + logger.info("Saving Podcast...") + sf.write( + str(output_folder / "podcast.wav"), + np.concatenate(podcast_audio), + samplerate=speech_model.audio_codec.sr, + ) + (output_folder / "podcast.txt").write_text(podcast_script) + logger.success("Done!") + + +def main(): + Fire(document_to_podcast) diff --git a/src/document_to_podcast/config.py b/src/document_to_podcast/config.py new file mode 100644 index 0000000..e43ca92 --- /dev/null +++ b/src/document_to_podcast/config.py @@ -0,0 +1,90 @@ +from pathlib import Path +from typing import Literal +from typing_extensions import Annotated + +from pydantic import BaseModel, FilePath +from pydantic.functional_validators import AfterValidator + +from document_to_podcast.preprocessing import DATA_LOADERS + + +DEFAULT_PROMPT = """ +You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. +The script features the following speakers: +{SPEAKERS} +Instructions: +- Write dynamic, easy-to-follow dialogue. +- Include natural interruptions and interjections. +- Avoid repetitive phrasing between speakers. +- Format output as a JSON conversation. 
+Example:
+{
+    "Speaker 1": "Welcome to our podcast! Today, we're exploring...",
+    "Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
+    "Speaker 1": "Sure! Imagine it like this...",
+    "Speaker 2": "Oh, that's cool! But how does..."
+}
+"""
+
+DEFAULT_SPEAKERS = [
+    {
+        "id": 1,
+        "name": "Laura",
+        "description": "The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.",
+        "voice_profile": "female_1",
+    },
+    {
+        "id": 2,
+        "name": "Jon",
+        "description": "The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.",
+        "voice_profile": "male_1",
+    },
+]
+
+
+def validate_input_file(value):
+    if Path(value).suffix not in DATA_LOADERS:
+        raise ValueError(
+            f"input_file extension must be one of {list(DATA_LOADERS.keys())}"
+        )
+    return value
+
+
+def validate_text_to_text_model(value):
+    parts = value.split("/")
+    if len(parts) != 3:
+        raise ValueError("text_to_text_model must be formatted as `owner/repo/file`")
+    if not value.endswith(".gguf"):
+        raise ValueError("text_to_text_model must be a gguf file")
+    return value
+
+
+def validate_text_to_text_prompt(value):
+    if "{SPEAKERS}" not in value:
+        raise ValueError("text_to_text_prompt must contain `{SPEAKERS}` placeholder")
+    return value
+
+
+class Speaker(BaseModel):
+    id: int
+    name: str
+    description: str
+    voice_profile: str
+
+    def __str__(self):
+        return f"Speaker {self.id}. Named {self.name}. {self.description}"
+
+
+class Config(BaseModel):
+    input_file: Annotated[FilePath, AfterValidator(validate_input_file)]
+    output_folder: str
+    text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
+    text_to_text_prompt: str
+    text_to_speech_model: Literal[
+        "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
+        "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
+        "parler-tts/parler-tts-large-v1",
+        "parler-tts/parler-tts-mini-v1",
+        "parler-tts/parler-tts-mini-v1.1",
+    ]
+    speakers: list[Speaker]
diff --git a/src/document_to_podcast/inference/model_loaders.py b/src/document_to_podcast/inference/model_loaders.py
index 4079168..ade90e0 100644
--- a/src/document_to_podcast/inference/model_loaders.py
+++ b/src/document_to_podcast/inference/model_loaders.py
@@ -3,7 +3,6 @@
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from outetts import GGUFModelConfig_v1, InterfaceGGUF
-from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerBase


@@ -30,6 +29,7 @@ def load_llama_cpp_model(
         filename=filename,
         # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
         n_ctx=0,
+        verbose=False,
     )
     return model


@@ -81,6 +81,8 @@ def load_parler_tts_model_and_tokenizer(

     Returns:
         PreTrainedModel: The loaded model.
""" + from parler_tts import ParlerTTSForConditionalGeneration + model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device) tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/src/document_to_podcast/inference/text_to_speech.py b/src/document_to_podcast/inference/text_to_speech.py index 609d12b..cffe47a 100644 --- a/src/document_to_podcast/inference/text_to_speech.py +++ b/src/document_to_podcast/inference/text_to_speech.py @@ -42,27 +42,31 @@ def _speech_generation_parler( def text_to_speech( input_text: str, - model: Union[PreTrainedModel, InterfaceGGUFClass], + model: Union[InterfaceGGUFClass, PreTrainedModel], + voice_profile: str, tokenizer: PreTrainedTokenizerBase = None, - speaker_profile: str = "", ) -> np.ndarray: """ - Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern. + Generates a speech waveform from a text input using a pre-trained text-to-speech (TTS) model. Examples: - >>> waveform = text_to_speech(input_text="Welcome to our amazing podcast", model=model, tokenizer=tokenizer, speaker_profile="Laura's voice is exciting and fast in delivery with very clear audio and no background noise.") + >>> waveform = text_to_speech(input_text="Welcome to our amazing podcast", model=model, voice_profile="male_1") Args: input_text (str): The text to convert to speech. model (PreTrainedModel): The model used for generating the waveform. - tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model. - speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile. + voice_profile (str): Depending on the selected TTS model it should either be + - a pre-defined ID for the Oute models (e.g. "female_1") + more info here https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers + - a natural description of the voice profile using a pre-defined name for the Parler model (e.g. Laura's voice is calm) + more info here https://github.com/huggingface/parler-tts?tab=readme-ov-file#-using-a-specific-speaker + tokenizer (PreTrainedTokenizerBase): [Only used for the Parler models!] The tokenizer used for tokenizing the text in order to send to the model. 
     Returns:
         numpy array: The waveform of the speech as a 2D numpy array
     """
     if isinstance(model, InterfaceGGUFClass):
-        return _speech_generation_oute(input_text, model, speaker_profile)
+        return _speech_generation_oute(input_text, model, voice_profile)
     elif isinstance(model, PreTrainedModel):
-        return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)
+        return _speech_generation_parler(input_text, model, tokenizer, voice_profile)
     else:
         raise NotImplementedError("Model not yet implemented for TTS")
diff --git a/src/document_to_podcast/podcast_maker/__init__.py b/src/document_to_podcast/podcast_maker/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/document_to_podcast/podcast_maker/config.py b/src/document_to_podcast/podcast_maker/config.py
index 8b6fdf0..e69de29 100644
--- a/src/document_to_podcast/podcast_maker/config.py
+++ b/src/document_to_podcast/podcast_maker/config.py
@@ -1,33 +0,0 @@
-from typing import Dict, Optional, Union
-
-from outetts.version.v1.interface import InterfaceGGUF as InterfaceGGUFClass
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
-from pydantic import BaseModel, ConfigDict
-
-
-class SpeakerConfig(BaseModel):
-    """
-    Pydantic model that stores configuration of an individual speaker for the TTS model.
-
-    model: The actual model instance to be used for generation.
-    speaker_id: A string defining the speaker in order to have a consistent voice during podcast generation.
-    speaker_profile: This profile is defined based on the specific model family used (e.g. Parler uses natural language descriptions whereas Oute models use IDs)
-    tokenizer: Parler models also need a tokenizer. This is None in the case of Oute models.
-    """
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    model: Union[PreTrainedModel, InterfaceGGUFClass]
-    speaker_id: str
-    speaker_profile: Optional[str] = None
-    tokenizer: Optional[PreTrainedTokenizerBase] = None
-
-
-class PodcastConfig(BaseModel):
-    """
-    Pydantic model that stores configuration of all the speakers for the TTS model. This allows different speakers to
-    use different models and configurations.
-    """
-
-    speakers: Dict[str, SpeakerConfig]
-    sampling_rate: int = 44_100
diff --git a/src/document_to_podcast/podcast_maker/script_to_audio.py b/src/document_to_podcast/podcast_maker/script_to_audio.py
deleted file mode 100644
index 19190b0..0000000
--- a/src/document_to_podcast/podcast_maker/script_to_audio.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import numpy as np
-import soundfile as sf
-
-from document_to_podcast.inference.model_loaders import load_outetts_model
-from document_to_podcast.inference.text_to_speech import text_to_speech
-from document_to_podcast.podcast_maker.config import PodcastConfig, SpeakerConfig
-
-
-def parse_script_to_waveform(script: str, podcast_config: PodcastConfig):
-    """
-    Given a script with speaker identifiers (such as "Speaker 1") parse it so that each speaker has its own unique
-    voice and concatenate all the voices in a sequence to form the complete podcast.
-    Args:
-        script:
-        podcast_config:
-
-    Returns: A 2D numpy array containing the whole podcast in waveform format.
-
-    """
-    parts = script.split("Speaker ")
-    podcast_waveform = []
-    for part in parts:
-        if ":" in part:
-            speaker_id, speaker_text = part.replace('"', "").split(":")
-            speaker_model = podcast_config.speakers[speaker_id].model
-            speaker_tokenizer = podcast_config.speakers[speaker_id].tokenizer
-            speaker_description = podcast_config.speakers[speaker_id].speaker_profile
-            speaker_waveform = text_to_speech(
-                speaker_text, speaker_model, speaker_tokenizer, speaker_description
-            )
-            podcast_waveform.append(speaker_waveform)
-
-    return np.concatenate(podcast_waveform)
-
-
-def save_waveform_as_file(
-    waveform: np.ndarray, sampling_rate: int, filename: str
-) -> None:
-    """
-    Save the output of the TTS (a numpy waveform) to a .wav file using the soundfile library.
-
-    Args:
-        waveform: 2D numpy array of a waveform
-        sampling_rate: Usually 44.100, but check the specifications of the TTS model you are using.
-        filename: the destination filename to save the audio
-
-    """
-    sf.write(filename, waveform, sampling_rate)
-
-
-if __name__ == "__main__":
-    test_filename = "test_podcast.wav"
-    test_podcast_script = '{"Speaker 1": "Welcome to our podcast.", "Speaker 2": "It\'s great to be here!", "Speaker 1": "What do you want to talk about today?", "Speaker 2": "Wish I knew!"}'
-
-    model = load_outetts_model(
-        "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf", "en", "cpu"
-    )
-    speaker_1 = SpeakerConfig(
-        model=model,
-        speaker_id="1",
-        speaker_profile="female_1",
-    )
-    speaker_2 = SpeakerConfig(
-        model=model,
-        speaker_id="2",
-        speaker_profile="male_1",
-    )
-    speakers = {s.speaker_id: s for s in [speaker_1, speaker_2]}
-    demo_podcast_config = PodcastConfig(speakers=speakers)
-
-    test_podcast_waveform = parse_script_to_waveform(
-        test_podcast_script, demo_podcast_config
-    )
-
-    save_waveform_as_file(
-        test_podcast_waveform,
-        sampling_rate=demo_podcast_config.sampling_rate,
-        filename=test_filename,
-    )
diff --git a/tests/conftest.py b/tests/conftest.py
index 33b08e1..f9f326d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,12 +2,6 @@

 import pytest

-from document_to_podcast.inference.model_loaders import load_outetts_model
-from document_to_podcast.podcast_maker.config import (
-    PodcastConfig,
-    SpeakerConfig,
-)
-

 @pytest.fixture(scope="session")
 def example_data():
@@ -22,22 +16,3 @@ def tts_prompt():

 @pytest.fixture()
 def podcast_script():
     return '{"Speaker 1": "Welcome to our podcast.", "Speaker 2": "It\'s great to be here!"}'
-
-
-@pytest.fixture()
-def podcast_config():
-    model = load_outetts_model(
-        "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf", "en", "cpu"
-    )
-    speaker_1 = SpeakerConfig(
-        model=model,
-        speaker_id="1",
-        speaker_profile="female_1",
-    )
-    speaker_2 = SpeakerConfig(
-        model=model,
-        speaker_id="2",
-        speaker_profile="male_1",
-    )
-    speakers = {s.speaker_id: s for s in [speaker_1, speaker_2]}
-    return PodcastConfig(speakers=speakers)
diff --git a/tests/e2e/test_document_to_podcast.py b/tests/e2e/test_document_to_podcast.py
new file mode 100644
index 0000000..9e20717
--- /dev/null
+++ b/tests/e2e/test_document_to_podcast.py
@@ -0,0 +1,35 @@
+from document_to_podcast.cli import document_to_podcast
+
+EXAMPLE_INPUT = """
+### Mozilla's approach to trustworthy AI
+
+Mozilla has a rich history of reimagining computing norms to favor openness and innovation. We first did this in the early 2000s by championing an openness in an era where the web was on the brink of being monopolized by a single company, Microsoft, which had gone from being a minor player in the browser market to near-dominance. The market dominance of Internet Explorer threatened to lock in users, stamp out competitors, and stifle innovation online.
+
+In the face of Microsoft's monopolization of the browser market, a loose coalition of open source activists, software developers, and web enthusiasts came together to build standards-based browsers and web servers that would eventually wrest power away from the tech giant. Mozilla was an early and active member of this movement. We focused resources, coordinated code, and ultimately released Firefox as part of this movement. Around the same time, the US Department of Justice's antitrust case against Microsoft demonstrated how regulators can help keep the technology industry competitive and healthy.
+
+The result was a fundamental shift in the computing environment of the time. A renewed interest in web standards like HTML and JavaScript made true cross-platform applications the norm, replacing the dominant paradigm of end user apps that only worked on Windows. This fostered an open environment that allowed new cross platform products and services — including Facebook and Gmail — to enter the field. The internet we know now would not exist if the constrained environment of Windows and Internet Explorer 6 had become the status quo.
+
+Today, we are at a similar inflection point. As in the early 2000s, many of our current problems are caused by a limited playing field. There are bright spots: A growing number of software developers, activists, academics, designers, and technologists are asking critical questions about how current norms around AI and data are centralizing power, stifling innovation, and eliminating user agency. But these efforts desperately need more fuel.
+
+In this paper, we provide Mozilla's perspective on how we might do just this. Our work began in earnest in 2019, when members of the Mozilla community began asking questions like: What can Mozilla do to shift norms around AI? Who else is tackling this problem? And, how can we help them? We emerged from the exploration process with big-picture learnings. For instance, while many of the challenges with AI are individual, large scale AI also presents major collective risks. We also emerged with granular learnings. For instance, there is progress being made in creating privacy-preserving ways to handle data for machine learning. In addition, governments are hungry to figure out how to fairly and effectively regulate AI, but they lack the internal expertise and independent research needed to do so.
+
+All of these learnings culminated in Mozilla's theory of change — a rough road map for what levers we need to pull in order to achieve trustworthy AI at scale and in a lasting way. Some of these levers exist in the realm of industry: Mozilla can support better education for computer science students or push for greater algorithmic accountability. Some of these levers exist in policy: We can steer more like-minded technologists toward government or advocate for stricter enforcement of privacy laws. Other levers exist in civil society, in the realms of academia, activism, art, and journalism.
+
+All these levers are interconnected, and over the coming years, Mozilla will focus our effort and resources on pulling these levers. However, we know that our own contribution to this work exists within a much larger constellation of actors. Just like we did in the early Firefox era, Mozilla will function as one part of a broader movement: focusing resources, coordinating work, and nurturing a more equitable computing environment.
+"""
+
+
+def test_document_to_podcast(tmp_path):
+    input_file = tmp_path / "input_file.md"
+    input_file.write_text(EXAMPLE_INPUT)
+    document_to_podcast(
+        input_file=str(input_file), output_folder=str(tmp_path / "output")
+    )
+    assert (tmp_path / "output" / "podcast.txt").exists()
+    assert (tmp_path / "output" / "podcast.wav").exists()
diff --git a/tests/integration/test_model_load_and_inference.py b/tests/integration/test_model_load_and_inference.py
deleted file mode 100644
index 85a92e5..0000000
--- a/tests/integration/test_model_load_and_inference.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import json
-from typing import Iterator
-
-import numpy as np
-import pytest
-
-from document_to_podcast.inference.model_loaders import (
-    load_llama_cpp_model,
-    load_outetts_model,
-    load_parler_tts_model_and_tokenizer,
-)
-from document_to_podcast.inference.text_to_speech import text_to_speech
-from document_to_podcast.inference.text_to_text import text_to_text, text_to_text_stream
-
-
-def test_model_load_and_inference_text_to_text():
-    model = load_llama_cpp_model(
-        "HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/smollm-135m-instruct-add-basics-q8_0.gguf"
-    )
-    result = text_to_text(
-        "Answer to: What is the capital of France?",
-        model=model,
-        system_prompt="",
-    )
-    assert isinstance(result, str)
-    assert json.loads(result)
-
-
-def test_model_load_and_inference_text_to_text_no_json():
-    model = load_llama_cpp_model(
-        "HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/smollm-135m-instruct-add-basics-q8_0.gguf"
-    )
-    result = text_to_text(
-        "What is the capital of France?",
-        model=model,
-        system_prompt="",
-        return_json=False,
-        stop=".",
-    )
-    assert isinstance(result, str)
-    with pytest.raises(json.JSONDecodeError):
-        json.loads(result)
-    assert result.startswith("The capital of France is Paris")
-
-
-def test_model_load_and_inference_text_to_text_stream_no_json():
-    model = load_llama_cpp_model(
-        "HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/smollm-135m-instruct-add-basics-q8_0.gguf"
-    )
-    result = text_to_text_stream(
-        "What is the capital of France?",
-        model=model,
-        system_prompt="",
-        return_json=False,
-        stop=".",
-    )
-    assert isinstance(result, Iterator)
-    result = "".join(result)
-    with pytest.raises(json.JSONDecodeError):
-        json.loads(result)
-    assert result.startswith("The capital of France is Paris")
-
-
-def test_model_load_and_inference_text_to_speech_oute():
-    model = load_outetts_model(
-        "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf", "en", "cpu"
-    )
-
-    speech = text_to_speech(
-        input_text="What a pretty unit test this is!",
-        model=model,
-        speaker_profile="female_1",
-    )
-
-    assert isinstance(speech, np.ndarray)
-    assert speech.size > 1
-
-
-def test_model_load_and_inference_text_to_speech_parler():
-    model, tokenizer = load_parler_tts_model_and_tokenizer(
-        "parler-tts/parler-tts-mini-v1", "cpu"
-    )
-
-    speech = text_to_speech(
-        input_text="What a pretty unit test this is!",
-        model=model,
-        tokenizer=tokenizer,
-        speaker_profile="Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
-    )
-
-    assert isinstance(speech, np.ndarray)
-    assert speech.size > 1
diff --git a/tests/integration/test_text_to_text_to_speech.py b/tests/integration/test_text_to_text_to_speech.py
deleted file mode 100644
index 25d1331..0000000
--- a/tests/integration/test_text_to_text_to_speech.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-from pathlib import Path
-
-from document_to_podcast.inference.model_loaders import load_llama_cpp_model
-from document_to_podcast.inference.text_to_speech import text_to_speech
-from document_to_podcast.inference.text_to_text import text_to_text
-from document_to_podcast.podcast_maker.config import PodcastConfig
-from document_to_podcast.podcast_maker.script_to_audio import save_waveform_as_file
-
-
-def test_text_to_text_to_speech(tmp_path: Path, podcast_config: PodcastConfig):
-    model = load_llama_cpp_model(
-        "HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/smollm-135m-instruct-add-basics-q8_0.gguf"
-    )
-    result = text_to_text(
-        "What is the capital of France?",
-        model=model,
-        system_prompt="",
-        return_json=False,
-        stop=".",
-    )
-
-    speaker_cfg = list(podcast_config.speakers.values())[0]
-    waveform = text_to_speech(
-        input_text=result,
-        model=speaker_cfg.model,
-        tokenizer=speaker_cfg.tokenizer,
-        speaker_profile=speaker_cfg.speaker_profile,
-    )
-
-    filename = str(tmp_path / "test_text_to_text_to_speech_parler.wav")
-    save_waveform_as_file(
-        waveform=waveform, sampling_rate=podcast_config.sampling_rate, filename=filename
-    )
-
-    assert os.path.isfile(filename)
diff --git a/tests/unit/inference/test_text_to_speech.py b/tests/unit/inference/test_text_to_speech.py
index f268690..5e7a4fb 100644
--- a/tests/unit/inference/test_text_to_speech.py
+++ b/tests/unit/inference/test_text_to_speech.py
@@ -1,25 +1,11 @@
-from pathlib import Path
-
 from document_to_podcast.inference.text_to_speech import text_to_speech
-from document_to_podcast.podcast_maker.config import PodcastConfig
-from document_to_podcast.podcast_maker.script_to_audio import save_waveform_as_file
-
-
-def test_text_to_speech_oute(
-    tmp_path: Path, tts_prompt: str, podcast_config: PodcastConfig
-):
-    speaker_cfg = list(podcast_config.speakers.values())[0]
-
-    waveform = text_to_speech(
-        tts_prompt,
-        speaker_cfg.model,
-        speaker_cfg.tokenizer,
-        speaker_cfg.speaker_profile,
-    )
-    save_waveform_as_file(
-        waveform=waveform,
-        sampling_rate=podcast_config.sampling_rate,
-        filename=str(tmp_path / "test_oute_tts.wav"),
+def test_text_to_speech(mocker):
+    model = mocker.MagicMock()
+    text_to_speech(
+        "Hello?",
+        model=model,
+        voice_profile="male_1",
     )
+    model.generate.assert_called_with(input_ids=mocker.ANY, prompt_input_ids=mocker.ANY)
diff --git a/tests/unit/inference/test_text_to_text.py b/tests/unit/inference/test_text_to_text.py
new file mode 100644
index 0000000..e851d16
--- /dev/null
+++ b/tests/unit/inference/test_text_to_text.py
@@ -0,0 +1,81 @@
+import pytest
+
+from document_to_podcast.inference.text_to_text import text_to_text, text_to_text_stream
+
+
+def test_text_to_text(mocker):
+    model = mocker.MagicMock()
+    text_to_text(
+        "Hello?",
+        model=model,
+        system_prompt="You are a helpful assistant.",
+    )
+    model.create_chat_completion.assert_called_with(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello?"},
+        ],
+        response_format={"type": "json_object"},
+        stop=[],
+        stream=False,
+    )
+
+
+def test_text_to_text_no_return_json(mocker):
+    model = mocker.MagicMock()
+    text_to_text(
+        "Hello?",
+        model=model,
+        system_prompt="You are a helpful assistant.",
+        return_json=False,
+    )
+    model.create_chat_completion.assert_called_with(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello?"},
+        ],
+        response_format=None,
+        stop=[],
+        stream=False,
+    )
+
+
+def test_text_to_text_stream_no_return_json(mocker):
+    model = mocker.MagicMock()
+    iterator = text_to_text_stream(
+        "Hello?",
+        model=model,
+        system_prompt="You are a helpful assistant.",
+        return_json=False,
+    )
+    with pytest.raises(StopIteration):
+        next(iterator)
+    model.create_chat_completion.assert_called_with(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello?"},
+        ],
+        response_format=None,
+        stop=[],
+        stream=True,
+    )
+
+
+def test_text_to_text_stream(mocker):
+    model = mocker.MagicMock()
+    iterator = text_to_text_stream(
+        "Hello?",
+        model=model,
+        system_prompt="You are a helpful assistant.",
+    )
+    with pytest.raises(StopIteration):
+        next(iterator)
+    model.create_chat_completion.assert_called_with(
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello?"},
+        ],
+        response_format={"type": "json_object"},
+        stop=[],
+        stream=True,
+    )
diff --git a/tests/unit/podcast_maker/test_script_to_audio.py b/tests/unit/podcast_maker/test_script_to_audio.py
deleted file mode 100644
index 42ef2ff..0000000
--- a/tests/unit/podcast_maker/test_script_to_audio.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-from pathlib import Path
-
-import numpy as np
-
-from document_to_podcast.podcast_maker.config import PodcastConfig
-from document_to_podcast.podcast_maker.script_to_audio import (
-    parse_script_to_waveform,
-    save_waveform_as_file,
-)
-
-
-def test_parse_script_waveform(podcast_script: str, podcast_config: PodcastConfig):
-    podcast_waveform = parse_script_to_waveform(podcast_script, podcast_config)
-
-    assert isinstance(podcast_waveform, np.ndarray)
-    assert podcast_waveform.size > 1
-
-
-def test_script_to_podcast(
-    tmp_path: Path, podcast_script: str, podcast_config: PodcastConfig
-):
-    filename = str(tmp_path / "test_podcast.wav")
-    podcast_waveform = parse_script_to_waveform(podcast_script, podcast_config)
-
-    save_waveform_as_file(
-        podcast_waveform, sampling_rate=podcast_config.sampling_rate, filename=filename
-    )
-    assert os.path.isfile(filename)
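
Both `demo/app.py` and `src/document_to_podcast/cli.py` in this diff share the same streaming speaker-turn logic: chunks from `text_to_text_stream` are buffered, and once the buffer ends in a newline and names a speaker, that turn is handed to TTS and the buffer is reset. Below is a minimal, self-contained sketch of just that parsing step; `fake_stream` is a hypothetical stand-in for the model output, not the real model:

```python
import re

# Hypothetical stand-in for text_to_text_stream output; illustrative only.
fake_stream = [
    '{\n',
    '"Speaker 1": "Welcome to our podcast.",\n',
    '"Speaker 2": "Great to be here!"\n',
    '}',
]

text = ""
for chunk in fake_stream:
    text += chunk
    # Same condition as in demo/app.py and cli.py: a completed line that names a speaker.
    if text.endswith("\n") and "Speaker" in text:
        speaker_id = re.search(r"Speaker (\d+)", text).group(1)
        # Everything after the '"Speaker N":' key is the dialogue for that turn.
        dialogue = text.split(f'"Speaker {speaker_id}":')[-1]
        print(speaker_id, dialogue.strip())  # cli.py calls text_to_speech(...) here
        text = ""
```

Running this prints one line per completed speaker turn, which is exactly why the UI can play audio segment by segment while the script is still streaming.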
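
The `--from_config` path in `cli.py` reduces to a single `Config.model_validate` call over the parsed YAML, with the `AfterValidator`s in `config.py` doing the actual checking. A sketch of exercising that validation directly, assuming the package from this diff is installed and the script is run from the repository root (so `example_data/config.yaml` exists):

```python
from pathlib import Path

import yaml
from pydantic import ValidationError

from document_to_podcast.config import Config

# Valid config: parse and validate in one call, as cli.py does.
config = Config.model_validate(
    yaml.safe_load(Path("example_data/config.yaml").read_text())
)
# Speaker.__str__ is what fills the {SPEAKERS} placeholder in the prompt.
print(config.speakers[0])  # Speaker 1. Named Laura. The main host. ...

# Invalid config: both the input_file suffix check and the
# `owner/repo/file` + .gguf check on text_to_text_model fail here,
# so pydantic raises a ValidationError listing every problem.
try:
    Config.model_validate(
        {
            "input_file": "example_data/config.yaml",
            "output_folder": "out",
            "text_to_text_model": "not-a-gguf",
            "text_to_text_prompt": "{SPEAKERS}",
            "text_to_speech_model": "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
            "speakers": [],
        }
    )
except ValidationError as err:
    print(err)
```

Because the CLI builds the same `Config` object when flags are passed individually, the two invocation styles shown in `docs/cli.md` go through identical validation.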