From f0798dfb6e61eb98342040d607181cc57fd89b59 Mon Sep 17 00:00:00 2001 From: daavoo Date: Fri, 3 Jan 2025 12:38:53 +0100 Subject: [PATCH] Add random-length silences. Insert silences of length between 0.1 and 1.1 seconds between each speaker segment. --- demo/app.py | 6 ++++-- src/document_to_podcast/cli.py | 4 ++-- src/document_to_podcast/utils.py | 9 +++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 src/document_to_podcast/utils.py diff --git a/demo/app.py b/demo/app.py index f3f904c..a360a13 100644 --- a/demo/app.py +++ b/demo/app.py @@ -1,7 +1,6 @@ import re from pathlib import Path -import numpy as np import soundfile as sf import streamlit as st @@ -13,6 +12,7 @@ from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker from document_to_podcast.inference.text_to_speech import text_to_speech from document_to_podcast.inference.text_to_text import text_to_text_stream +from document_to_podcast.utils import stack_audio_segments @st.cache_resource @@ -160,7 +160,9 @@ def gen_button_clicked(): if st.session_state[gen_button]: if st.button("Save Podcast to audio file"): - st.session_state.audio = np.concatenate(st.session_state.audio) + st.session_state.audio = stack_audio_segments( + st.session_state.audio, speech_model.audio_codec.sr + ) sf.write( "podcast.wav", st.session_state.audio, diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py index f71e1cb..c056570 100644 --- a/src/document_to_podcast/cli.py +++ b/src/document_to_podcast/cli.py @@ -1,7 +1,6 @@ import re from pathlib import Path -import numpy as np import soundfile as sf import yaml from fire import Fire @@ -22,6 +21,7 @@ from document_to_podcast.inference.text_to_text import text_to_text_stream from document_to_podcast.inference.text_to_speech import text_to_speech from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS +from document_to_podcast.utils import stack_audio_segments @logger.catch(reraise=True) @@ 
-158,7 +158,7 @@ def document_to_podcast(
     logger.info("Saving Podcast...")
     sf.write(
         str(output_folder / "podcast.wav"),
-        np.concatenate(podcast_audio),
+        stack_audio_segments(podcast_audio, sample_rate),
         samplerate=sample_rate,
     )
     (output_folder / "podcast.txt").write_text(podcast_script)
diff --git a/src/document_to_podcast/utils.py b/src/document_to_podcast/utils.py
new file mode 100644
index 0000000..2695c7b
--- /dev/null
+++ b/src/document_to_podcast/utils.py
@@ -0,0 +1,24 @@
+import numpy as np
+
+
+def stack_audio_segments(audio_segments, sample_rate):
+    """Concatenate audio segments, separated by random-length silences.
+
+    A silence lasting between 0.1 and 1.1 seconds is inserted between
+    consecutive segments; none is added after the last one.
+
+    Args:
+        audio_segments: Iterable of 1-D audio arrays, in playback order.
+        sample_rate: Samples per second, used to size each silence.
+
+    Returns:
+        A single array with all segments and silences concatenated.
+    """
+    stacked = []
+    for index, segment in enumerate(audio_segments):
+        if index > 0:
+            # Silences go between segments only, not after the last one.
+            silence_seconds = 0.1 + np.random.rand()
+            stacked.append(np.zeros(int(silence_seconds * sample_rate)))
+        stacked.append(segment)
+    return np.concatenate(stacked)