From 211241ea4a3c812da576d0abdbb3890472722b8f Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Mon, 25 Nov 2024 12:47:16 -0500 Subject: [PATCH] Check media and log attributes This commit uses ffprobe, which comes with ffmpeg (a Whisper requirement), to check that the file being transcribed is an audio or video file that ffmpeg understands. The format, duration and size of the file are logged to make it a bit easier to analyze the speech-to-text logs. If an invalid file is supplied to the service it will raise an exception indicating that the format was invalid. This ultimately gets caught and logged. Closes #48 Closes #31 --- .gitignore | 1 + requirements.txt | 1 + speech_to_text.py | 37 +++++++++++++++++++++++++++++++++---- tests/test_inspect_media.py | 23 +++++++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 tests/test_inspect_media.py diff --git a/.gitignore b/.gitignore index dc05fae..c016a9f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__/ whisper_models *.log +.python-version diff --git a/requirements.txt b/requirements.txt index 58dba89..5393266 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ boto3 +ffprobe3 openai-whisper python-dotenv pytest diff --git a/speech_to_text.py b/speech_to_text.py index a96be32..b25017f 100755 --- a/speech_to_text.py +++ b/speech_to_text.py @@ -8,6 +8,7 @@ import sys import uuid import shutil +import subprocess import traceback from functools import cache from pathlib import Path @@ -39,7 +40,7 @@ def main(daemon=True) -> None: except KeyboardInterrupt: logging.info("exiting") sys.exit() - except Exception: + except SpeechToTextException: logging.exception("error while processing job") report_error(job, traceback.format_exc()) @@ -79,7 +80,7 @@ def get_job() -> Optional[dict]: return None else: # this should never happen - raise Exception(f"expected 1 job from queue but got {len(jobs)}") + raise SpeechToTextException(f"expected 1 job from queue but 
got {len(jobs)}") def download_media(job: dict) -> dict: @@ -94,6 +95,9 @@ def download_media(job: dict) -> dict: # e.g. pg879tb2706-v2/video_1.mp4 bucket.download_file(media_file, media_file) + media_info = inspect_media(media_file) + logging.info(f"downloaded {media_file}: {json.dumps(media_info)}") + return job @@ -118,7 +122,13 @@ def run_whisper(job: dict) -> dict: whisper_options.pop("model", None) whisper_options.pop("writer", None) - result = whisper.transcribe(audio=media_file, model=model, **whisper_options) + try: + result = whisper.transcribe( + audio=media_file, model=model, **whisper_options + ) + except Exception as e: + raise SpeechToTextException(str(e)) + logging.info(f"whisper result: {result}") logging.info(f"writing output using writer_options: {writer_options}") @@ -319,7 +329,22 @@ def receive_done_job() -> Optional[dict]: elif len(messages) == 0: return None else: - raise Exception("received more than one message from todo queue!") + raise SpeechToTextException("received more than one message from todo queue!") + + +def inspect_media(path) -> dict: + try: + output = subprocess.check_output( + ["ffprobe", "-show_format", "-print_format", "json", "-v", "quiet", path] + ) + result = json.loads(output) + return { + "duration": float(result["format"]["duration"]), + "format": result["format"]["format_name"], + "size": int(result["format"]["size"]), + } + except subprocess.CalledProcessError: + raise SpeechToTextException(f"Invalid media file {path}") @cache @@ -400,3 +425,7 @@ def load_whisper_model(model_name) -> whisper.model.Whisper: else: main(daemon=args.daemon) + + +class SpeechToTextException(Exception): + pass diff --git a/tests/test_inspect_media.py b/tests/test_inspect_media.py new file mode 100644 index 0000000..a4a86f0 --- /dev/null +++ b/tests/test_inspect_media.py @@ -0,0 +1,23 @@ +from speech_to_text import inspect_media, SpeechToTextException + +import pytest + + +def test_duration(): + result = inspect_media("tests/data/en.wav") 
+ assert result["duration"] == 3.220000 + + +def test_format(): + result = inspect_media("tests/data/en.wav") + assert result["format"] == "wav" + + +def test_size(): + result = inspect_media("tests/data/en.wav") + assert result["size"] == 618318 + + +def test_invalid_media(): + with pytest.raises(SpeechToTextException): + inspect_media("README.md")