Add subtitles translation using EasyNMT and OpusMT libraries

Sirozha1337 · Jan 30, 2024 · 8f9d069 · 8f9d069
1 parent 1c0cdb6
commit 8f9d069
Show file tree

Hide file tree

Showing 15 changed files with 324 additions and 32 deletions.
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -18,6 +18,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pylint
+        pip install wheel
         pip install -r requirements.txt
     - name: Analysing the code with pylint
       run: |

diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml
@@ -0,0 +1,23 @@
+name: Setup
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install application
+      run: |
+        pip install wheel
+        pip install -e .
+    - name: Check that package was installed successfully
+      run: |
+        faster_auto_subtitle -h
diff --git a/README.md b/README.md
@@ -6,7 +6,9 @@ This repository uses `ffmpeg` and [OpenAI's Whisper](https://openai.com/blog/whi
 
 ## Installation
 
-To get started, you'll need Python 3.7 or newer. Install the binary by running the following command:
+To get started, you'll need Python 3.9 or newer. Install the binary by running the following command:
+
+    pip install wheel
 
     pip install git+https://github.com/Sirozha1337/faster-auto-subtitle.git@dev
 
@@ -37,6 +39,12 @@ Adding `--task translate` will translate the subtitles into English:
 
     faster_auto_subtitle /path/to/video.mp4 --task translate
 
+Adding `--target_language {2-letter-language-code}` will translate the subtitles into the specified language using [Opus-MT](https://github.com/Helsinki-NLP/Opus-MT):
+
+    faster_auto_subtitle /path/to/video.mp4 --target_language fr
+
+This will require downloading the appropriate model. If direct translation is not available, it will attempt translation from the source language to English and then from English to the target language.
+
 Run the following to view all available options:
 
     faster_auto_subtitle --help
@@ -49,7 +57,7 @@ Higher `beam_size` usually leads to greater accuracy, but slows down the process
 
 Setting higher `no_speech_threshold` could be useful for videos with a lot of background noise to stop Whisper from "hallucinating" subtitles for it.
 
-In my experience settings option `condition_on_previous_text` to `False` dramatically increases accurracy for videos like TV Shows with an intro song at the start. 
+In my experience, setting the `condition_on_previous_text` option to `False` dramatically increases accuracy for videos such as TV shows with an intro song at the start.
 
 You can use `sample_interval` parameter to generate subtitles for a portion of the video to play around with those parameters:
 

diff --git a/auto_subtitle/cli.py b/auto_subtitle/cli.py
@@ -46,11 +46,16 @@ def main():
     parser.add_argument("--task", type=str, default="transcribe",
                         choices=["transcribe", "translate"],
                         help="whether to perform X->X speech recognition ('transcribe') \
-                              or X->English translation ('translate')")
+                              or X->Language translation ('translate')")
     parser.add_argument("--language", type=str, default="auto",
                         choices=LANGUAGE_CODES,
                         help="What is the origin language of the video? \
                               If unset, it is detected automatically.")
+    parser.add_argument("--target_language", type=str, default="en",
+                        choices=LANGUAGE_CODES,
+                        help="Desired language to translate subtitles to. \
+                              If language is not en, Opus-MT will be used. \
+                              See https://github.com/Helsinki-NLP/Opus-MT.")
 
     args = parser.parse_args().__dict__
 

diff --git a/auto_subtitle/main.py b/auto_subtitle/main.py
@@ -1,9 +1,9 @@
 import os
 import warnings
-import tempfile
 from .utils.files import filename, write_srt
 from .utils.ffmpeg import get_audio, overlay_subtitles
 from .utils.whisper import WhisperAI
+from .translation.easynmt_utils import EasyNMTWrapper
 
 
 def process(args: dict):
@@ -12,52 +12,94 @@ def process(args: dict):
     output_srt: bool = args.pop("output_srt")
     srt_only: bool = args.pop("srt_only")
     language: str = args.pop("language")
-    sample_interval: str = args.pop("sample_interval")
+    sample_interval: list = args.pop("sample_interval")
+    target_language: str = args.pop("target_language")
 
     os.makedirs(output_dir, exist_ok=True)
 
     if model_name.endswith(".en"):
         warnings.warn(
             f"{model_name} is an English-only model, forcing English detection.")
         args["language"] = "en"
+        language = "en"
     # if translate task used and language argument is set, then use it
     elif language != "auto":
         args["language"] = language
 
+    if target_language != 'en':
+        warnings.warn(
+            f"{target_language} is not English, Opus-MT will be used to perform translation.")
+        args['task'] = 'transcribe'
+
     audios = get_audio(args.pop("video"), args.pop(
         'audio_channel'), sample_interval)
 
-    model_args = {}
-    model_args["model_size_or_path"] = model_name
-    model_args["device"] = args.pop("device")
-    model_args["compute_type"] = args.pop("compute_type")
+    model_args = {
+        "model_size_or_path": model_name,
+        "device": args.pop("device"),
+        "compute_type": args.pop("compute_type")
+    }
+
+    subtitles = get_subtitles(audios, model_args, args)
+    print('Subtitles generated.')
+
+    if target_language != 'en':
+        print('Translating subtitles... This might take a while.')
+        subtitles = translate_subtitles(
+            subtitles, language, target_language, model_args)
 
-    srt_output_dir = output_dir if output_srt or srt_only else tempfile.gettempdir()
-    subtitles = get_subtitles(audios, srt_output_dir, model_args, args)
+    if output_srt or srt_only:
+        print('Saving subtitle files...')
+        save_subtitles(subtitles, output_dir)
 
     if srt_only:
         return
 
     overlay_subtitles(subtitles, output_dir, sample_interval)
 
 
-def get_subtitles(audio_paths: list, output_dir: str,
-                  model_args: dict, transcribe_args: dict):
+def translate_subtitles(subtitles: dict, source_lang: str, target_lang: str, model_args: dict):
+    """Translate the segments of every subtitle entry into ``target_lang``.
+
+    ``subtitles`` maps a key to a dict holding ``'segments'`` and
+    ``'language'``. Each entry is shallow-copied and its ``'segments'``
+    value is replaced by the translated segments; the input mapping itself
+    is not modified. The translation model is created on
+    ``model_args['device']``.
+
+    When ``source_lang`` is unset (``None``, empty, or the ``'auto'``
+    placeholder used as the CLI default) the entry's own ``'language'``
+    value is used as the source language instead.
+    """
+    model = EasyNMTWrapper(device=model_args['device'])
+
+    translated_subtitles = {}
+    for key, subtitle in subtitles.items():
+        src_lang = source_lang
+        # 'auto' means "not specified" rather than a real language code, so
+        # it must not be forwarded to the translator as a source language.
+        if not src_lang or src_lang == 'auto':
+            src_lang = subtitle['language']
+
+        translated_segments = model.translate(
+            subtitle['segments'], src_lang, target_lang)
+
+        # Copy so the untranslated entry handed in by the caller stays intact.
+        translated_subtitle = subtitle.copy()
+        translated_subtitle['segments'] = translated_segments
+        translated_subtitles[key] = translated_subtitle
+
+    return translated_subtitles
+
+
+def save_subtitles(subtitles: dict, output_dir: str):
+    """Write every subtitle entry to an ``.srt`` file inside ``output_dir``.
+
+    The file is named ``filename(path)`` plus the ``.srt`` suffix, where
+    ``path`` is the entry's key. The destination is also recorded on the
+    entry itself under the ``"output_path"`` key.
+    """
+    for video_path in subtitles:
+        entry = subtitles[video_path]
+        srt_name = f"{filename(video_path)}.srt"
+        destination = os.path.join(output_dir, srt_name)
+        entry["output_path"] = destination
+
+        print(f'Saving to path {destination}')
+        with open(destination, "w", encoding="utf-8") as srt_file:
+            write_srt(entry['segments'], file=srt_file)
+
+
+def get_subtitles(audio_paths: dict, model_args: dict, transcribe_args: dict):
     model = WhisperAI(model_args, transcribe_args)
 
-    subtitles_path = {}
+    subtitles = {}
 
     for path, audio_path in audio_paths.items():
         print(
             f"Generating subtitles for {filename(path)}... This might take a while."
         )
-        srt_path = os.path.join(output_dir, f"{filename(path)}.srt")
-
-        segments = model.transcribe(audio_path)
 
-        with open(srt_path, "w", encoding="utf-8") as srt:
-            write_srt(segments, file=srt)
+        segments, info = model.transcribe(audio_path)
 
-        subtitles_path[path] = srt_path
+        subtitles[path] = {'segments': list(
+            segments), 'language': info.language}
 
-    return subtitles_path
+    return subtitles
diff --git a/auto_subtitle/translation/__init__.py b/auto_subtitle/translation/__init__.py
diff --git a/auto_subtitle/translation/easynmt_utils.py b/auto_subtitle/translation/easynmt_utils.py
@@ -0,0 +1,24 @@
+from easynmt import EasyNMT
+from faster_whisper.transcribe import Segment
+from .opusmt_utils import OpusMT
+
+
+class EasyNMTWrapper:
+    """Translate subtitle segments with an EasyNMT model backed by OpusMT."""
+
+    def __init__(self, device):
+        """Create the OpusMT translator and the EasyNMT model that uses it.
+
+        A ``device`` of ``'auto'`` is passed to EasyNMT as ``None``; any
+        other value is forwarded unchanged.
+        """
+        self.translator = OpusMT()
+        resolved_device = None if device == 'auto' else device
+        self.model = EasyNMT('opus-mt',
+                             translator=self.translator,
+                             device=resolved_device)
+
+    def translate(self, segments: list[Segment], source_lang: str, target_lang: str):
+        """Return copies of ``segments`` whose ``text`` is translated.
+
+        The texts are translated in a single call, and each result is
+        written back onto its segment by position via ``_replace``.
+        """
+        texts = [item.text for item in segments]
+        self.translator.load_available_models()
+
+        translations = self.model.translate(texts, target_lang,
+                                            source_lang, show_progress_bar=True)
+
+        return [
+            item._replace(text=translations[position])
+            for position, item in enumerate(segments)
+        ]
diff --git a/auto_subtitle/translation/languages.py b/auto_subtitle/translation/languages.py
@@ -0,0 +1,20 @@
+import langcodes
+from transformers.models.marian.convert_marian_tatoeba_to_pytorch import GROUP_MEMBERS
+
+
+def to_alpha2_languages(languages):
+    """Return the set of all codes produced for the given ``languages``.
+
+    Each input is converted by ``__to_alpha2_language`` and the results
+    are merged into one set.
+    """
+    codes = set()
+    for language in languages:
+        codes.update(__to_alpha2_language(language))
+    return codes
+
+
+def __to_alpha2_language(language):
+    """Convert one language identifier to a collection of language codes.
+
+    A two-character input is returned as-is in a one-element list. A key of
+    ``GROUP_MEMBERS`` yields the set of codes of that group's members.
+    Anything else is resolved through ``langcodes`` into a one-element list.
+    """
+    if len(language) != 2:
+        if language in GROUP_MEMBERS:
+            members = GROUP_MEMBERS[language][1]
+            return {langcodes.Language.get(member).language for member in members}
+
+        return [langcodes.Language.get(language).language]
+
+    return [language]
+
+
+def to_alpha3_language(language):
+    """Return the alpha-3 code that ``langcodes`` reports for ``language``."""
+    parsed = langcodes.Language.get(language)
+    return parsed.to_alpha3()