Skip to content

Commit

Permalink
Check media and log attributes
Browse files Browse the repository at this point in the history
This commit uses ffprobe, which comes with ffmpeg (a Whisper
requirement), to check that the file being transcribed is an audio or
video file that ffmpeg understands. The format, duration and size of the
file are logged to make it a bit easier to analyze the speech-to-text
logs.

If an invalid file is supplied to the service it will raise an exception
indicating that the format was invalid. This ultimately gets caught and
logged.

Closes #48
Closes #31
  • Loading branch information
edsu committed Nov 25, 2024
1 parent d14f962 commit 6994d9e
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
__pycache__/
whisper_models
*.log
.python-version
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
boto3
ffprobe3
openai-whisper
python-dotenv
pytest
Expand Down
37 changes: 33 additions & 4 deletions speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import uuid
import shutil
import subprocess
import traceback
from functools import cache
from pathlib import Path
Expand Down Expand Up @@ -39,7 +40,7 @@ def main(daemon=True) -> None:
except KeyboardInterrupt:
logging.info("exiting")
sys.exit()
except Exception:
except SpeechToTextException:
logging.exception("error while processing job")
report_error(job, traceback.format_exc())

Expand Down Expand Up @@ -79,7 +80,7 @@ def get_job() -> Optional[dict]:
return None
else:
# this should never happen
raise Exception(f"expected 1 job from queue but got {len(jobs)}")
raise SpeechToTextException(f"expected 1 job from queue but got {len(jobs)}")


def download_media(job: dict) -> dict:
Expand All @@ -94,6 +95,9 @@ def download_media(job: dict) -> dict:
# e.g. pg879tb2706-v2/video_1.mp4
bucket.download_file(media_file, media_file)

media_info = inspect_media(media_file)
logging.info(f"downloaded {media_file}: {json.dumps(media_info)}")

return job


Expand All @@ -118,7 +122,13 @@ def run_whisper(job: dict) -> dict:
whisper_options.pop("model", None)
whisper_options.pop("writer", None)

result = whisper.transcribe(audio=media_file, model=model, **whisper_options)
try:
result = whisper.transcribe(
audio=media_file, model=model, **whisper_options
)
except Exception as e:
raise SpeechToTextException(str(e))

logging.info(f"whisper result: {result}")

logging.info(f"writing output using writer_options: {writer_options}")
Expand Down Expand Up @@ -319,7 +329,22 @@ def receive_done_job() -> Optional[dict]:
elif len(messages) == 0:
return None
else:
raise Exception("received more than one message from todo queue!")
raise SpeechToTextException("received more than one message from todo queue!")


def inspect_media(path) -> dict:
    """Probe a media file with ffprobe and summarize its container format.

    Runs ``ffprobe -show_format -print_format json -v quiet`` on *path* and
    returns a dict with three keys taken from the reported ``format`` section:

    * ``duration`` -- the reported duration, converted to ``float``
    * ``format`` -- the reported ``format_name``
    * ``size`` -- the reported size, converted to ``int``

    Raises:
        SpeechToTextException: if ffprobe exits with a non-zero status for
            *path*, or if its output cannot be parsed into the three fields
            above.
    """
    command = ["ffprobe", "-show_format", "-print_format", "json", "-v", "quiet", path]

    try:
        output = subprocess.check_output(command)
    except subprocess.CalledProcessError as e:
        raise SpeechToTextException(f"Invalid media file {path}") from e

    # Parsing problems (output that is not JSON, a missing "format" section or
    # field, a non-numeric duration/size) are reported with the same exception
    # type as a failed probe, so callers that only handle
    # SpeechToTextException do not see a bare KeyError/ValueError.
    try:
        media_format = json.loads(output)["format"]
        return {
            "duration": float(media_format["duration"]),
            "format": media_format["format_name"],
            "size": int(media_format["size"]),
        }
    except (KeyError, TypeError, ValueError) as e:
        raise SpeechToTextException(
            f"Unable to read media information for {path}"
        ) from e


@cache
Expand Down Expand Up @@ -400,3 +425,7 @@ def load_whisper_model(model_name) -> whisper.model.Whisper:

else:
main(daemon=args.daemon)


# NOTE(review): this definition appears after the module's call to main();
# confirm the name is bound before any code that raises or catches it runs.
class SpeechToTextException(Exception):
    """Error type this module raises for failures while handling a job."""
23 changes: 23 additions & 0 deletions tests/test_inspect_media.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from speech_to_text import inspect_media, SpeechToTextException

import pytest


def test_duration():
    """inspect_media reports the expected duration for the sample WAV file."""
    media_info = inspect_media("tests/data/en.wav")
    expected_duration = 3.220000
    assert media_info["duration"] == expected_duration


def test_format():
    """inspect_media reports the sample WAV file's format name as "wav"."""
    media_info = inspect_media("tests/data/en.wav")
    reported_format = media_info["format"]
    assert reported_format == "wav"


def test_size():
    """inspect_media reports the expected size for the sample WAV file."""
    media_info = inspect_media("tests/data/en.wav")
    reported_size = media_info["size"]
    assert reported_size == 618318


def test_invalid_media():
    """Probing a non-media file (the README) raises SpeechToTextException."""
    with pytest.raises(SpeechToTextException):
        # Only the raised exception matters here, so the return value is not
        # bound to a name.
        inspect_media("README.md")

0 comments on commit 6994d9e

Please sign in to comment.