diff --git a/.github/setup.sh b/.github/setup.sh new file mode 100644 index 0000000..97aacba --- /dev/null +++ b/.github/setup.sh @@ -0,0 +1,5 @@ +python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu +git clone https://github.com/descriptinc/audiotools +python -m pip install audiotools +python -m pip install -e . +rm -rf audiotools diff --git a/demo/app.py b/demo/app.py index e336a41..ef0550f 100644 --- a/demo/app.py +++ b/demo/app.py @@ -1,19 +1,16 @@ +import re from pathlib import Path import streamlit as st -from huggingface_hub import list_repo_files -from opennotebookllm.podcast_maker.config import PodcastConfig, SpeakerConfig from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS from opennotebookllm.inference.model_loaders import ( load_llama_cpp_model, load_parler_tts_model_and_tokenizer, ) +from opennotebookllm.inference.text_to_speech import _speech_generation_parler from opennotebookllm.inference.text_to_text import text_to_text_stream -from opennotebookllm.podcast_maker.script_to_audio import ( - parse_script_to_waveform, - save_waveform_as_file, -) + PODCAST_PROMPT = """ You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers: @@ -26,113 +23,100 @@ - Format output as a JSON conversation. Example: { - "Speaker 1": "Welcome to our podcast! Today, we’re exploring...", - "Speaker 2": "Hi Laura! I’m excited to hear about this. Can you explain...", + "Speaker 1": "Welcome to our podcast! Today, we're exploring...", + "Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...", "Speaker 1": "Sure! Imagine it like this...", - "Speaker 2": "Oh, that’s cool! But how does..." + "Speaker 2": "Oh, that's cool! But how does..." } """ -SPEAKER_1_DESC = "Laura's voice is exciting and fast in delivery with very clear audio and no background noise." -SPEAKER_2_DESC = "Jon's voice is calm with very clear audio and no background noise." +SPEAKER_DESCRIPTIONS = { + "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.", + "2": "Jon's voice is calm with very clear audio and no background noise.", +} + + +@st.cache_resource +def load_text_to_text_model(): + return load_llama_cpp_model( + model_id="allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf" + ) + -CURATED_REPOS = [ - "allenai/OLMoE-1B-7B-0924-Instruct-GGUF", - "MaziyarPanahi/SmolLM2-1.7B-Instruct-GGUF", - # system prompt seems to be ignored for this model. - # "microsoft/Phi-3-mini-4k-instruct-gguf", - "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF", - "Qwen/Qwen2.5-1.5B-Instruct-GGUF", - "Qwen/Qwen2.5-3B-Instruct-GGUF", -] +@st.cache_resource +def load_text_to_speech_model_and_tokenizer(): + return load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu") + + +st.title("Document To Podcast") + +st.header("Uploading Data") uploaded_file = st.file_uploader( "Choose a file", type=["pdf", "html", "txt", "docx", "md"] ) + if uploaded_file is not None: + st.divider() + st.header("Loading and Cleaning Data") + st.markdown( + "[API Reference for data_cleaners](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.preprocessing.data_cleaners)" + ) + extension = Path(uploaded_file.name).suffix col1, col2 = st.columns(2) raw_text = DATA_LOADERS[extension](uploaded_file) with col1: - st.title("Raw Text") + st.subheader("Raw Text") st.text_area(f"Total Length: {len(raw_text)}", f"{raw_text[:500]} . . .") clean_text = DATA_CLEANERS[extension](raw_text) with col2: - st.title("Cleaned Text") + st.subheader("Cleaned Text") st.text_area(f"Total Length: {len(clean_text)}", f"{clean_text[:500]} . . .") - repo_name = st.selectbox("Select Repo", CURATED_REPOS) - model_name = st.selectbox( - "Select Model", - [ - x - for x in list_repo_files(repo_name) - if ".gguf" in x.lower() and ("q8" in x.lower() or "fp16" in x.lower()) - ], - index=None, + st.divider() + st.header("Downloading and Loading models") + st.markdown( + "[API Reference for model_loaders](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.inference.model_loaders)" ) - if model_name: - with st.spinner("Downloading and Loading Model..."): - model = load_llama_cpp_model(model_id=f"{repo_name}/{model_name}") - - # ~4 characters per token is considered a reasonable default. - max_characters = model.n_ctx() * 4 - if len(clean_text) > max_characters: - st.warning( - f"Input text is too big ({len(clean_text)})." - f" Using only a subset of it ({max_characters})." - ) - clean_text = clean_text[:max_characters] - - system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT) - - if st.button("Generate Podcast"): - final_script = "" - with st.spinner("Generating Podcast Script..."): - text = "" - for chunk in text_to_text_stream( - clean_text, model, system_prompt=system_prompt.strip() - ): - text += chunk - final_script += chunk - if text.endswith("\n"): - st.write(text) - text = "" - - if final_script: - model.close() # Free up memory in order to load the TTS model - - filename = "demo_podcast.wav" - - with st.spinner("Downloading and Loading TTS Model..."): - tts_model, tokenizer = load_parler_tts_model_and_tokenizer( - "parler-tts/parler-tts-mini-v1", "cpu" - ) - speaker_1 = SpeakerConfig( - model=tts_model, - speaker_id="1", - tokenizer=tokenizer, - speaker_description=SPEAKER_1_DESC, - ) - speaker_2 = SpeakerConfig( - model=tts_model, - speaker_id="2", - tokenizer=tokenizer, - speaker_description=SPEAKER_2_DESC, - ) - demo_podcast_config = PodcastConfig( - speakers={s.speaker_id: s for s in [speaker_1, speaker_2]} - ) - - with st.spinner("Generating Audio..."): - waveform = parse_script_to_waveform( - final_script, demo_podcast_config - ) - save_waveform_as_file( - waveform, demo_podcast_config.sampling_rate, filename - ) - st.audio(filename) + + text_model = load_text_to_text_model() + speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer() + + # ~4 characters per token is considered a reasonable default. + max_characters = text_model.n_ctx() * 4 + if len(clean_text) > max_characters: + st.warning( + f"Input text is too big ({len(clean_text)})." + f" Using only a subset of it ({max_characters})." + ) + clean_text = clean_text[:max_characters] + + st.divider() + st.header("Podcast generation") + + system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT) + + if st.button("Generate Podcast"): + with st.spinner("Generating Podcast..."): + text = "" + for chunk in text_to_text_stream( + clean_text, text_model, system_prompt=system_prompt.strip() + ): + text += chunk + if text.endswith("\n") and "Speaker" in text: + st.write(text) + speaker_id = re.search(r"Speaker (\d+)", text).group(1) + with st.spinner("Generating Audio..."): + speech = _speech_generation_parler( + text.split(f'"Speaker {speaker_id}":')[-1], + speech_model, + speech_tokenizer, + SPEAKER_DESCRIPTIONS[speaker_id], + ) + st.audio(speech, sample_rate=44_100) + text = "" diff --git a/pyproject.toml b/pyproject.toml index 2bcf840..75432ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "huggingface-hub", "llama-cpp-python", "loguru", - "parler_tts @ git+https://github.com/huggingface/parler-tts.git", + "parler_tts @ git+https://github.com/daavoo/parler-tts.git", "pydantic", "PyPDF2[crypto]", "python-docx", diff --git a/src/opennotebookllm/podcast_maker/script_to_audio.py b/src/opennotebookllm/podcast_maker/script_to_audio.py index 4abba32..911266f 100644 --- a/src/opennotebookllm/podcast_maker/script_to_audio.py +++ b/src/opennotebookllm/podcast_maker/script_to_audio.py @@ -21,7 +21,7 @@ def parse_script_to_waveform(script: str, podcast_config: PodcastConfig): podcast_waveform = [] for part in parts: if ":" in part: - speaker_id, speaker_text = part.replace("\"", "").split(":") + speaker_id, speaker_text = part.replace('"', "").split(":") speaker_model = podcast_config.speakers[speaker_id].model speaker_tokenizer = podcast_config.speakers[speaker_id].tokenizer speaker_description = podcast_config.speakers[