Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update audio.py #1112

Closed
wants to merge 6 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 72 additions & 42 deletions faster_whisper/audio.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,86 @@
"""We use the PyAV library to decode the audio: https://github.com/PyAV-Org/PyAV

The advantage of PyAV is that it bundles the FFmpeg libraries so there is no additional
system dependencies. FFmpeg does not need to be installed on the system.

However, the API is quite low-level so we need to manipulate audio frames directly.
"""We use system FFmpeg when available for faster audio decoding. As a fallback, we use
the PyAV library: https://github.com/PyAV-Org/PyAV
If FFmpeg is installed on the system, we use it directly for optimal performance.
If FFmpeg is not available, PyAV provides the advantage of bundled FFmpeg libraries,
though its low-level API requires direct frame manipulation.
"""

import gc
import io
import itertools

import subprocess
import tempfile
from typing import BinaryIO, Union

import av
import numpy as np
import torch


def is_ffmpeg_available() -> bool:
    """Return True when a runnable ``ffmpeg`` binary is on the PATH.

    Actually executes ``ffmpeg -version`` (rather than just probing the PATH)
    so a present-but-broken binary is reported as unavailable.
    """
    try:
        # Discard stderr: ffmpeg prints its build banner there.
        subprocess.check_output(["ffmpeg", "-version"], stderr=subprocess.DEVNULL)
        return True
    except (subprocess.SubprocessError, OSError):
        # OSError covers FileNotFoundError but also PermissionError and
        # similar spawn failures that the narrower tuple let escape.
        return False


def decode_audio_ffmpeg(
    input_file: Union[str, BinaryIO], sampling_rate: int = 16000, split_stereo: bool = False
):
    """Decode audio by invoking the system ``ffmpeg`` binary.

    Args:
      input_file: Path to the input file or a file-like object.
      sampling_rate: Resample the audio to this sample rate.
      split_stereo: Return separate left and right channels.

    Returns:
      A float32 torch.Tensor of samples in [-1, 1). If `split_stereo` is
      enabled, a 2-tuple with the separated left and right channels.

    Raises:
      RuntimeError: If ffmpeg fails to decode the input.
    """
    channels = 2 if split_stereo else 1

    with tempfile.TemporaryDirectory() as tmpdir:
        # A file-like object cannot be passed on the command line: spill its
        # bytes to a temporary file and hand ffmpeg the path instead.
        if isinstance(input_file, str):
            input_path = input_file
        else:
            input_path = f"{tmpdir}/input"
            with open(input_path, "wb") as spill:
                spill.write(input_file.read())

        # Emit headerless little-endian s16 PCM (-f s16le): np.fromfile reads
        # raw samples, so a WAV container header would be misread as audio.
        output_file = f"{tmpdir}/temp.raw"
        cmd = [
            "ffmpeg",
            "-y",  # overwrite; must precede the output file to take effect
            "-i",
            input_path,
            "-ar",
            str(sampling_rate),
            "-ac",
            str(channels),
            "-f",
            "s16le",
            "-acodec",
            "pcm_s16le",
            output_file,
        ]

        try:
            subprocess.check_call(
                cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
            )
        except subprocess.SubprocessError:
            raise RuntimeError("FFmpeg failed to decode the audio file")

        audio = np.fromfile(output_file, dtype=np.int16)

    # Convert s16 to f32 in [-1, 1).
    audio = audio.astype(np.float32) / 32768.0

    if split_stereo:
        # Interleaved stereo: even samples are left, odd samples are right.
        return torch.from_numpy(audio[0::2]), torch.from_numpy(audio[1::2])

    return torch.from_numpy(audio)


def decode_audio(
input_file: Union[str, BinaryIO],
sampling_rate: int = 16000,
split_stereo: bool = False,
input_file: Union[str, BinaryIO], sampling_rate: int = 16000, split_stereo: bool = False
):
"""Decodes the audio.

Args:
input_file: Path to the input file or a file-like object.
sampling_rate: Resample the audio to this sample rate.
split_stereo: Return separate left and right channels.
input_file: Path to the input file or a file-like object.
sampling_rate: Resample the audio to this sample rate.
split_stereo: Return separate left and right channels.

Returns:
A float32 Numpy array.

If `split_stereo` is enabled, the function returns a 2-tuple with the
separated left and right channels.
A float32 Numpy array.
If `split_stereo` is enabled, the function returns a 2-tuple with the
separated left and right channels.
"""
if is_ffmpeg_available():
try:
return decode_audio_ffmpeg(input_file, sampling_rate, split_stereo)
except (subprocess.SubprocessError, RuntimeError):
pass

resampler = av.audio.resampler.AudioResampler(
format="s16",
layout="mono" if not split_stereo else "stereo",
Expand All @@ -46,10 +92,9 @@ def decode_audio(

with av.open(input_file, mode="r", metadata_errors="ignore") as container:
frames = container.decode(audio=0)
frames = _ignore_invalid_frames(frames)
frames = _group_frames(frames, 500000)
frames = _resample_frames(frames, resampler)

frames = ignore_invalid_frames(frames)
frames = group_frames(frames, 500000)
frames = resample_frames(frames, resampler)
for frame in frames:
array = frame.to_ndarray()
dtype = array.dtype
Expand All @@ -65,21 +110,18 @@ def decode_audio(
gc.collect()

audio = np.frombuffer(raw_buffer.getbuffer(), dtype=dtype)

# Convert s16 back to f32.
audio = audio.astype(np.float32) / 32768.0

if split_stereo:
left_channel = audio[0::2]
right_channel = audio[1::2]
return torch.from_numpy(left_channel), torch.from_numpy(right_channel)

return torch.from_numpy(audio)


def _ignore_invalid_frames(frames):
def ignore_invalid_frames(frames):
iterator = iter(frames)

while True:
try:
yield next(iterator)
Expand All @@ -89,44 +131,32 @@ def _ignore_invalid_frames(frames):
continue


def group_frames(frames, num_samples=None):
    """Regroup decoded frames into chunks of at least *num_samples* samples.

    With ``num_samples=None`` everything is merged into a single frame.
    """
    buffer = av.audio.fifo.AudioFifo()
    threshold = num_samples

    for frame in frames:
        # Drop the timestamp so the FIFO accepts non-contiguous frames.
        frame.pts = None
        buffer.write(frame)
        if threshold is not None and buffer.samples >= threshold:
            yield buffer.read()

    # Flush whatever remains buffered.
    if buffer.samples:
        yield buffer.read()


def resample_frames(frames, resampler):
    """Yield resampled frames, draining the resampler once the input ends."""
    for frame in frames:
        yield from resampler.resample(frame)
    # A trailing None flushes any samples still buffered inside the resampler.
    yield from resampler.resample(None)


def pad_or_trim(array, length: int = 3000, *, axis: int = -1):
    """Pad or trim the Mel features array to *length* (default 3000), as
    expected by the encoder.

    Args:
      array: torch.Tensor to adjust.
      length: Target size along *axis*.
      axis: Axis to pad or trim (may be negative).

    Returns:
      A tensor whose size along *axis* is exactly *length*; other axes are
      unchanged. When padding is needed, zeros are appended at the end.
    """
    axis = axis % array.ndim

    if array.shape[axis] > length:
        # Build a per-dimension slice with slice(None) placeholders. The
        # previous Ellipsis-based index raised IndexError whenever it produced
        # more than one Ellipsis (e.g. 3-D input with axis=0), since an index
        # may contain at most one; a tuple of slices is valid for any ndim.
        idx = [slice(None)] * array.ndim
        idx[axis] = slice(length)
        return array[tuple(idx)]

    if array.shape[axis] < length:
        # F.pad takes (before, after) pairs starting from the LAST axis,
        # hence the [::-1] reversal before passing the widths.
        pad_widths = [0] * (array.ndim * 2)
        pad_widths[2 * axis] = length - array.shape[axis]
        array = torch.nn.functional.pad(array, tuple(pad_widths[::-1]))

    return array
Loading