remove_silence with get_speech_timestamps on io.BytesIO #558

BlaiseCz · 2024-10-15T11:06:55Z

BlaiseCz
Oct 15, 2024

Removin silence from io.BytesIO

Hi, i am working with io.BytesIO as an input, and i will not be able to change it. My problem is that generated output_audio file is almost always smaller than input one. Of course i mean when input is 10s and 1000kb remove_silence cuts lets say 5s and its 200kb and it reduces bitrate, can someone help me?


import io

import numpy as np
import soundfile as sf
from silero_vad import load_silero_vad, get_speech_timestamps, read_audio

# Load Silero VAD model
model = load_silero_vad()


def remove_silence(input_audio: io.BytesIO) -> io.BytesIO:
    # Save and inspect input file properties
    save_audio(input_audio, "input.wav")
    data, samplerate = sf.read("input.wav")
    print_audio_properties("Input", data, samplerate, input_audio.getbuffer().nbytes)

    # Process audio for silence removal
    audio = read_audio(input_audio)
    speech_timestamps = get_speech_timestamps(audio.numpy().flatten(), model,
                                              min_silence_duration_ms=2000,
                                              threshold=0.25,
                                              speech_pad_ms=500)
    if not speech_timestamps:
        return io.BytesIO()

    base_audio_length = len(audio)
    new_audio = []

    for segment in speech_timestamps:
        print(f"Speech found from: {segment['start'] / samplerate:.2f}s to {segment['end'] / samplerate:.2f}s")
        start, end = segment["start"], segment["end"]
        if start < base_audio_length and end <= base_audio_length:
            new_audio.append(data[start:end])

    new_audio = np.concatenate(new_audio)

    output_audio = io.BytesIO()
    output_audio.name = "vad_temp.wav"
    sf.write(output_audio, new_audio, samplerate, format="WAV")
    output_audio.seek(0)

    save_audio(output_audio, "output.wav")
    output_file_data, samplerate = sf.read("output.wav")
    print_audio_properties("Output", output_file_data, samplerate, output_audio.getbuffer().nbytes)

    return output_audio


def save_audio(buffer: io.BytesIO, filename: str):
    with open(filename, "wb") as f:
        f.write(buffer.getbuffer())


def print_audio_properties(label: str, data: np.ndarray, samplerate: int, size: int):
    channels = data.shape[1] if data.ndim > 1 else 1
    duration = len(data) / samplerate
    bitrate = (size * 8) / duration if duration > 0 else 0  # File size in bits / duration

    print(f"{label} audio properties:")
    print(f"  Format: {data.dtype}")
    print(f"  Samplerate: {samplerate}")
    print(f"  Channels: {channels}")
    print(f"  Length: {duration:.2f}s")
    print(f"  Size: {size} bytes")
    print(f"  Bitrate: {bitrate / 1000:.2f} kbps")
    print("-" * 30)

so my example printouts are like:


Input audio properties:
  Format: float64
  Samplerate: 16000
  Channels: 1
  Length: 7.54s
  Size: 482618 bytes
  Bitrate: 512.06 kbps
------------------------------
Speech found from: 0.33s to 3.00s
Speech found from: 5.32s to 7.54s

Output audio properties:
  Format: float64
  Samplerate: 16000
  Channels: 1
  Length: 4.88s
  Size: 156204 bytes
  Bitrate: 256.07 kbps

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

remove_silence with get_speech_timestamps on io.BytesIO #558

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 0 comments

Select a reply

remove_silence with get_speech_timestamps on io.BytesIO #558

BlaiseCz Oct 15, 2024

Removin silence from io.BytesIO

Replies: 0 comments

BlaiseCz
Oct 15, 2024