You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Hi, I am working with io.BytesIO as the input, and I will not be able to change that. My problem is that the generated output_audio file is almost always smaller than it should be relative to the input. For example, when the input is 10 s and 1000 kB and remove_silence cuts, let's say, 5 s, the output is only 200 kB, so the bitrate has been reduced. Can someone help me?
import io
import numpy as np
import soundfile as sf
from silero_vad import load_silero_vad, get_speech_timestamps, read_audio
# Load the Silero VAD model once at import time; remove_silence() below
# passes this module-level instance to get_speech_timestamps() on every call.
model = load_silero_vad()
def remove_silence(input_audio: io.BytesIO) -> io.BytesIO:
    """Return a WAV buffer holding only the speech portions of *input_audio*.

    The input is decoded twice: once with ``soundfile`` at its native sample
    rate (these samples are what gets written out) and once with Silero's
    ``read_audio`` at the VAD rate (used only for speech detection).  The
    detected segments are rescaled from the VAD rate to the native rate before
    slicing, and the output is written with the input's sample format
    (subtype) whenever WAV supports it, so the per-second size of the audio is
    not reduced by a silent change of encoding.

    Args:
        input_audio: In-memory audio file readable by ``soundfile``.

    Returns:
        A new ``io.BytesIO`` positioned at 0 containing the speech-only WAV
        data, or an empty ``io.BytesIO`` when no speech is detected.

    Side effects:
        Writes ``input.wav`` and ``output.wav`` to the working directory and
        prints diagnostics; both are debugging aids kept from the original.
    """
    # read_audio() resamples to this rate, so the timestamps returned by
    # get_speech_timestamps() are sample indices at this rate, not at the
    # native rate of the input file.
    vad_rate = 16000

    # Save and inspect input file properties.
    save_audio(input_audio, "input.wav")
    input_audio.seek(0)
    with sf.SoundFile(input_audio) as source:
        samplerate = source.samplerate
        subtype = source.subtype
        data = source.read()
    print_audio_properties("Input", data, samplerate, input_audio.getbuffer().nbytes)

    # Process audio for silence removal; rewind first because the decode
    # above has moved the buffer position.
    input_audio.seek(0)
    audio = read_audio(input_audio, sampling_rate=vad_rate)
    speech_timestamps = get_speech_timestamps(
        audio,
        model,
        sampling_rate=vad_rate,
        min_silence_duration_ms=2000,
        threshold=0.25,
        speech_pad_ms=500,
    )
    if not speech_timestamps:
        return io.BytesIO()

    total_frames = len(data)
    scale = samplerate / vad_rate  # VAD sample index -> native sample index
    pieces = []
    for segment in speech_timestamps:
        print(f"Speech found from: {segment['start'] / vad_rate:.2f}s to {segment['end'] / vad_rate:.2f}s")
        # Clamp to the decoded length: rounding after rescaling may overshoot
        # the last frame by a sample or two.
        start = min(int(round(segment["start"] * scale)), total_frames)
        end = min(int(round(segment["end"] * scale)), total_frames)
        if end > start:
            pieces.append(data[start:end])
    if not pieces:
        # np.concatenate() raises on an empty list; treat as "no speech".
        return io.BytesIO()
    new_audio = np.concatenate(pieces)

    # Keep the input's sample format when WAV can carry it; otherwise let
    # soundfile pick its default subtype for WAV.
    out_subtype = subtype if sf.check_format("WAV", subtype) else None

    output_audio = io.BytesIO()
    output_audio.name = "vad_temp.wav"
    sf.write(output_audio, new_audio, samplerate, format="WAV", subtype=out_subtype)
    output_audio.seek(0)

    save_audio(output_audio, "output.wav")
    output_file_data, output_samplerate = sf.read(output_audio)
    output_audio.seek(0)  # hand the caller a buffer positioned at the start
    print_audio_properties("Output", output_file_data, output_samplerate, output_audio.getbuffer().nbytes)
    return output_audio
def save_audio(buffer: io.BytesIO, filename: str):
    """Write the entire contents of *buffer* to the file *filename*.

    The bytes are taken from ``buffer.getbuffer()``, so the whole buffer is
    written regardless of its current read/write position, and that position
    is left unchanged.

    Args:
        buffer: In-memory bytes to dump.
        filename: Destination path; the file is created or overwritten.
    """
    # Using the memoryview as a context manager releases the export as soon
    # as the write is done, so the buffer can be resized again afterwards.
    with open(filename, "wb") as sink, buffer.getbuffer() as raw_bytes:
        sink.write(raw_bytes)
def print_audio_properties(label: str, data: np.ndarray, samplerate: int, size: int):
    """Print a short summary of an audio array and its encoded size.

    Args:
        label: Heading used in the first printed line (e.g. "Input").
        data: Decoded samples; a 2-D array is treated as (frames, channels),
            a 1-D array as a single channel.
        samplerate: Frames per second of *data*.
        size: Size of the encoded file in bytes.

    The reported bitrate is the encoded size in bits divided by the duration,
    so it includes any container/header overhead.  A zero duration (empty
    data or a non-positive samplerate) is reported with a bitrate of 0 instead
    of raising.
    """
    channels = data.shape[1] if data.ndim > 1 else 1
    # Guard the division the same way the bitrate below is guarded.
    duration = len(data) / samplerate if samplerate > 0 else 0.0
    bitrate = (size * 8) / duration if duration > 0 else 0  # File size in bits / duration
    print(f"{label} audio properties:")
    print(f" Format: {data.dtype}")
    print(f" Samplerate: {samplerate}")
    print(f" Channels: {channels}")
    print(f" Length: {duration:.2f}s")
    print(f" Size: {size} bytes")
    print(f" Bitrate: {bitrate / 1000:.2f} kbps")
    print("-" * 30)
This discussion was converted from issue #557 on October 15, 2024 11:57.
Heading
Bold
Italic
Quote
Code
Link
Numbered list
Unordered list
Task list
Attach files
Mention
Reference
Menu
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
Removing silence from io.BytesIO
Hi, I am working with io.BytesIO as the input, and I will not be able to change that. My problem is that the generated output_audio file is almost always smaller than it should be relative to the input. For example, when the input is 10 s and 1000 kB and remove_silence cuts, let's say, 5 s, the output is only 200 kB, so the bitrate has been reduced. Can someone help me?
so my example printouts are like:
Beta Was this translation helpful? Give feedback.
All reactions