Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to demo to include audio part #26

Merged
merged 10 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/setup.sh
Kostis-S-Z marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CPU-only torch/torchaudio keeps the CI/demo environment small (no CUDA wheels).
python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
git clone https://github.com/descriptinc/audiotools
# Install from the freshly cloned directory. The leading "./" is required:
# a bare "audiotools" is resolved against PyPI (an unrelated package), not
# the local checkout — pip only treats arguments containing a path separator
# (or starting with ".") as local paths.
python -m pip install ./audiotools
# Install this repository itself in editable mode.
python -m pip install -e .
# The clone is only needed for installation; remove it to keep the workspace clean.
rm -rf audiotools
168 changes: 76 additions & 92 deletions demo/app.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
import re
from pathlib import Path

import streamlit as st
from huggingface_hub import list_repo_files

from opennotebookllm.podcast_maker.config import PodcastConfig, SpeakerConfig
from opennotebookllm.preprocessing import DATA_LOADERS, DATA_CLEANERS
from opennotebookllm.inference.model_loaders import (
load_llama_cpp_model,
load_parler_tts_model_and_tokenizer,
)
from opennotebookllm.inference.text_to_speech import _speech_generation_parler
from opennotebookllm.inference.text_to_text import text_to_text_stream
from opennotebookllm.podcast_maker.script_to_audio import (
parse_script_to_waveform,
save_waveform_as_file,
)


PODCAST_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format. The script features two speakers:
Expand All @@ -26,113 +23,100 @@
- Format output as a JSON conversation.
Example:
{
"Speaker 1": "Welcome to our podcast! Today, were exploring...",
"Speaker 2": "Hi Laura! Im excited to hear about this. Can you explain...",
"Speaker 1": "Welcome to our podcast! Today, we're exploring...",
"Speaker 2": "Hi Laura! I'm excited to hear about this. Can you explain...",
Kostis-S-Z marked this conversation as resolved.
Show resolved Hide resolved
"Speaker 1": "Sure! Imagine it like this...",
"Speaker 2": "Oh, thats cool! But how does..."
"Speaker 2": "Oh, that's cool! But how does..."
}
"""

SPEAKER_1_DESC = "Laura's voice is exciting and fast in delivery with very clear audio and no background noise."
SPEAKER_2_DESC = "Jon's voice is calm with very clear audio and no background noise."
# Natural-language voice descriptions consumed by the Parler-TTS model,
# keyed by the speaker id parsed from the generated script ("Speaker 1" / "Speaker 2").
SPEAKER_DESCRIPTIONS = {
    "1": "Laura's voice is exciting and fast in delivery with very clear audio and no background noise.",
    "2": "Jon's voice is calm with very clear audio and no background noise.",
}


@st.cache_resource
def load_text_to_text_model():
    """Load (and cache across Streamlit reruns) the LLM used for script generation."""
    model_id = (
        "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
    )
    return load_llama_cpp_model(model_id=model_id)


# Hand-picked Hugging Face GGUF repositories offered in the model selector.
CURATED_REPOS = [
    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF",
    "MaziyarPanahi/SmolLM2-1.7B-Instruct-GGUF",
    # system prompt seems to be ignored for this model.
    # "microsoft/Phi-3-mini-4k-instruct-gguf",
    "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    "Qwen/Qwen2.5-3B-Instruct-GGUF",
]
@st.cache_resource
def load_text_to_speech_model_and_tokenizer():
    """Load (and cache across Streamlit reruns) the Parler-TTS model + tokenizer on CPU."""
    tts_model_id = "parler-tts/parler-tts-mini-v1"
    device = "cpu"
    return load_parler_tts_model_and_tokenizer(tts_model_id, device)
Kostis-S-Z marked this conversation as resolved.
Show resolved Hide resolved


st.title("Document To Podcast")

st.header("Uploading Data")

uploaded_file = st.file_uploader(
"Choose a file", type=["pdf", "html", "txt", "docx", "md"]
)


if uploaded_file is not None:
st.divider()
st.header("Loading and Cleaning Data")
st.markdown(
"[API Reference for data_cleaners](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.preprocessing.data_cleaners)"
)

extension = Path(uploaded_file.name).suffix

col1, col2 = st.columns(2)

raw_text = DATA_LOADERS[extension](uploaded_file)
with col1:
st.title("Raw Text")
st.subheader("Raw Text")
st.text_area(f"Total Length: {len(raw_text)}", f"{raw_text[:500]} . . .")

clean_text = DATA_CLEANERS[extension](raw_text)
with col2:
st.title("Cleaned Text")
st.subheader("Cleaned Text")
st.text_area(f"Total Length: {len(clean_text)}", f"{clean_text[:500]} . . .")

repo_name = st.selectbox("Select Repo", CURATED_REPOS)
model_name = st.selectbox(
"Select Model",
[
x
for x in list_repo_files(repo_name)
if ".gguf" in x.lower() and ("q8" in x.lower() or "fp16" in x.lower())
],
index=None,
st.divider()
st.header("Downloading and Loading models")
st.markdown(
"[API Reference for model_loaders](https://mozilla-ai.github.io/document-to-podcast/api/#opennotebookllm.inference.model_loaders)"
)
if model_name:
with st.spinner("Downloading and Loading Model..."):
model = load_llama_cpp_model(model_id=f"{repo_name}/{model_name}")

# ~4 characters per token is considered a reasonable default.
max_characters = model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
final_script = ""
with st.spinner("Generating Podcast Script..."):
text = ""
for chunk in text_to_text_stream(
clean_text, model, system_prompt=system_prompt.strip()
):
text += chunk
final_script += chunk
if text.endswith("\n"):
st.write(text)
text = ""

if final_script:
model.close() # Free up memory in order to load the TTS model

filename = "demo_podcast.wav"

with st.spinner("Downloading and Loading TTS Model..."):
tts_model, tokenizer = load_parler_tts_model_and_tokenizer(
"parler-tts/parler-tts-mini-v1", "cpu"
)
speaker_1 = SpeakerConfig(
model=tts_model,
speaker_id="1",
tokenizer=tokenizer,
speaker_description=SPEAKER_1_DESC,
)
speaker_2 = SpeakerConfig(
model=tts_model,
speaker_id="2",
tokenizer=tokenizer,
speaker_description=SPEAKER_2_DESC,
)
demo_podcast_config = PodcastConfig(
speakers={s.speaker_id: s for s in [speaker_1, speaker_2]}
)

with st.spinner("Generating Audio..."):
waveform = parse_script_to_waveform(
final_script, demo_podcast_config
)
save_waveform_as_file(
waveform, demo_podcast_config.sampling_rate, filename
)
st.audio(filename)

text_model = load_text_to_text_model()
speech_model, speech_tokenizer = load_text_to_speech_model_and_tokenizer()

# ~4 characters per token is considered a reasonable default.
max_characters = text_model.n_ctx() * 4
if len(clean_text) > max_characters:
st.warning(
f"Input text is too big ({len(clean_text)})."
f" Using only a subset of it ({max_characters})."
)
clean_text = clean_text[:max_characters]

st.divider()
st.header("Podcast generation")

system_prompt = st.text_area("Podcast generation prompt", value=PODCAST_PROMPT)

if st.button("Generate Podcast"):
with st.spinner("Generating Podcast..."):
text = ""
for chunk in text_to_text_stream(
clean_text, text_model, system_prompt=system_prompt.strip()
):
text += chunk
if text.endswith("\n") and "Speaker" in text:
st.write(text)
speaker_id = re.search(r"Speaker (\d+)", text).group(1)
with st.spinner("Generating Audio..."):
speech = _speech_generation_parler(
text.split(f'"Speaker {speaker_id}":')[-1],
speech_model,
speech_tokenizer,
SPEAKER_DESCRIPTIONS[speaker_id],
)
st.audio(speech, sample_rate=44_100)
text = ""
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ dependencies = [
"huggingface-hub",
"llama-cpp-python",
"loguru",
"parler_tts @ git+https://github.com/huggingface/parler-tts.git",
"parler_tts @ git+https://github.com/daavoo/parler-tts.git",
"pydantic",
"PyPDF2[crypto]",
"python-docx",
Expand Down
2 changes: 1 addition & 1 deletion src/opennotebookllm/podcast_maker/script_to_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def parse_script_to_waveform(script: str, podcast_config: PodcastConfig):
podcast_waveform = []
for part in parts:
if ":" in part:
speaker_id, speaker_text = part.replace("\"", "").split(":")
speaker_id, speaker_text = part.replace('"', "").split(":")
speaker_model = podcast_config.speakers[speaker_id].model
speaker_tokenizer = podcast_config.speakers[speaker_id].tokenizer
speaker_description = podcast_config.speakers[
Expand Down