From 211241ea4a3c812da576d0abdbb3890472722b8f Mon Sep 17 00:00:00 2001
From: Ed Summers <ehs@pobox.com>
Date: Mon, 25 Nov 2024 12:47:16 -0500
Subject: [PATCH] Check media and log attributes

This commit uses ffprobe, which comes with ffmpeg (a Whisper
requirement), to check that the file being transcribed is an audio or
video file that ffmpeg understands. The format, duration and size of the
file are logged to make it a bit easier to analyze the speech-to-text
logs.

If an invalid file is supplied to the service it will raise an exception
indicating that the format was invalid. This ultimately gets caught and
logged.

Closes #48
Closes #31
---
 .gitignore                  |  1 +
 requirements.txt            |  1 +
 speech_to_text.py           | 37 +++++++++++++++++++++++++++++++++----
 tests/test_inspect_media.py | 23 +++++++++++++++++++++++
 4 files changed, 58 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_inspect_media.py

diff --git a/.gitignore b/.gitignore
index dc05fae..c016a9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 __pycache__/
 whisper_models
 *.log
+.python-version
diff --git a/requirements.txt b/requirements.txt
index 58dba89..5393266 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 boto3
+ffprobe3
 openai-whisper
 python-dotenv
 pytest
diff --git a/speech_to_text.py b/speech_to_text.py
index a96be32..b25017f 100755
--- a/speech_to_text.py
+++ b/speech_to_text.py
@@ -8,6 +8,7 @@
 import sys
 import uuid
 import shutil
+import subprocess
 import traceback
 from functools import cache
 from pathlib import Path
@@ -39,7 +40,7 @@ def main(daemon=True) -> None:
         except KeyboardInterrupt:
             logging.info("exiting")
             sys.exit()
-        except Exception:
+        except SpeechToTextException:
             logging.exception("error while processing job")
             report_error(job, traceback.format_exc())
 
@@ -79,7 +80,7 @@ def get_job() -> Optional[dict]:
         return None
     else:
         # this should never happen
-        raise Exception(f"expected 1 job from queue but got {len(jobs)}")
+        raise SpeechToTextException(f"expected 1 job from queue but got {len(jobs)}")
 
 
 def download_media(job: dict) -> dict:
@@ -94,6 +95,9 @@ def download_media(job: dict) -> dict:
         # e.g. pg879tb2706-v2/video_1.mp4
         bucket.download_file(media_file, media_file)
 
+        media_info = inspect_media(media_file)
+        logging.info(f"downloaded {media_file}: {json.dumps(media_info)}")
+
     return job
 
 
@@ -118,7 +122,13 @@ def run_whisper(job: dict) -> dict:
         whisper_options.pop("model", None)
         whisper_options.pop("writer", None)
 
-        result = whisper.transcribe(audio=media_file, model=model, **whisper_options)
+        try:
+            result = whisper.transcribe(
+                audio=media_file, model=model, **whisper_options
+            )
+        except Exception as e:
+            raise SpeechToTextException(str(e))
+
         logging.info(f"whisper result: {result}")
 
         logging.info(f"writing output using writer_options: {writer_options}")
@@ -319,7 +329,26 @@ def receive_done_job() -> Optional[dict]:
     elif len(messages) == 0:
         return None
     else:
-        raise Exception("received more than one message from todo queue!")
+        raise SpeechToTextException("received more than one message from todo queue!")
+
+
+class SpeechToTextException(Exception):
+    """Error that main() logs and reports; define it before main() is called."""
+
+
+def inspect_media(path) -> dict:
+    """Probe path with ffprobe; return its duration, format and size as a dict."""
+    cmd = ["ffprobe", "-show_format", "-print_format", "json", "-v", "quiet", path]
+    try:
+        probed = json.loads(subprocess.check_output(cmd))["format"]
+        return {
+            "duration": float(probed["duration"]),
+            "format": probed["format_name"],
+            "size": int(probed["size"]),
+        }
+    except (subprocess.CalledProcessError, KeyError, ValueError) as e:
+        # ffprobe rejected the file, or accepted it without the fields we log
+        raise SpeechToTextException(f"Invalid media file {path}") from e
 
 
 @cache
diff --git a/tests/test_inspect_media.py b/tests/test_inspect_media.py
new file mode 100644
index 0000000..a4a86f0
--- /dev/null
+++ b/tests/test_inspect_media.py
@@ -0,0 +1,23 @@
+from speech_to_text import inspect_media, SpeechToTextException
+
+import pytest
+
+
+def test_duration():
+    """inspect_media reports the duration of tests/data/en.wav."""
+    assert inspect_media("tests/data/en.wav")["duration"] == 3.220000
+
+
+def test_format():
+    """inspect_media reports tests/data/en.wav as the "wav" format."""
+    assert inspect_media("tests/data/en.wav")["format"] == "wav"
+
+
+def test_size():
+    """inspect_media reports the byte size of tests/data/en.wav."""
+    assert inspect_media("tests/data/en.wav")["size"] == 618318
+
+
+def test_invalid_media():
+    """Inspecting a non-media file (README.md) raises SpeechToTextException."""
+    pytest.raises(SpeechToTextException, inspect_media, "README.md")