Merge pull request #2 from colobas/dev

new video and a couple changes
colobas · Feb 7, 2024 · a711fcb
2 parents 9d735da + 7b04e84
commit a711fcb
Show file tree

Hide file tree

Showing 190 changed files with 2,583 additions and 15 deletions.
diff --git a/debates.yaml b/debates.yaml
@@ -9,3 +9,6 @@
 
 - title: BE vs PSD
   url: https://sicnoticias.pt/especiais/eleicoes-legislativas/2024-02-06-Debate-entre-BE-e-PSD-Quem-e-que-sabe-salvar-o-SNS--a252ab7c
+
+- title: Chega vs IL
+  url: https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias
diff --git a/environment.yml b/environment.yml
@@ -9,7 +9,6 @@ dependencies:
   - ffmpeg=6.1.1
   - pip:
     - beautifulsoup4
-    - webvtt-to-json
     - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
     - torchaudio==2.0.0
-    - webvtt
+    - webvtt-py
diff --git a/process_debates.py b/process_debates.py
@@ -150,7 +150,7 @@ def transcribe_audio(audio_path, output_root):
 
     name = audio_path.stem
 
-    if not (output_root / f"transcriptions/{name}.vtt").exists():
+    if not (output_root / f"transcriptions/{name}.json").exists():
         Path(f"{output_root}/transcriptions").mkdir(exist_ok=True, parents=True)
 
         cmd = [
@@ -177,19 +177,19 @@ def transcribe_audio(audio_path, output_root):
 
         subprocess.run(cmd)
 
-    # keep only the .vtt file
-    for f in Path(f"{output_root}/transcriptions").glob(f"{name}.*"):
-        if f.suffix not in [".vtt"]:
-            f.unlink()
+        # keep only the .vtt file
+        for f in Path(f"{output_root}/transcriptions").glob(f"{name}.*"):
+            if f.suffix not in [".vtt"]:
+                f.unlink()
 
-    # convert the vtt to json
-    webvtt_to_json(f"{output_root}/transcriptions/{name}.vtt", f"{output_root}/transcriptions/{name}.json")
+        # convert the vtt to json
+        webvtt_to_json(f"{output_root}/transcriptions/{name}.vtt", f"{output_root}/transcriptions/{name}.json")
 
-    # remove the vtt
-    (output_root / f"transcriptions/{name}.vtt").unlink()
+        # remove the vtt
+        (output_root / f"transcriptions/{name}.vtt").unlink()
 
 
-def process_debate(*, title, url, output_root):
+def process_debate(*, title, url, output_root, skip_transcription=False):
     """
     Process a debate from the input data
     """
@@ -223,7 +223,9 @@ def process_debate(*, title, url, output_root):
         headers = None
 
     get_audio_and_video(m3u8_url, audio_path, hls_path, headers=headers)
-    transcribe_audio(audio_path, output_root)
+
+    if not skip_transcription:
+        transcribe_audio(audio_path, output_root)
 
     out = {
         "slug": slug,
@@ -252,7 +254,7 @@ def main(args):
         if output_path.exists() and not args.force:
             continue
 
-        summary = process_debate(**debate, output_root=output_root)
+        summary = process_debate(**debate, output_root=output_root, skip_transcription=args.skip_transcription)
         master_json.append(summary)
 
     with open(args.output_master_json, "w") as f:
@@ -265,5 +267,6 @@ def main(args):
     parser.add_argument("--output_root", type=str, default="public/debates")
     parser.add_argument("--output-master-json", type=str, default="src/debates.json")
     parser.add_argument("--force", action="store_true")
+    parser.add_argument("--skip-transcription", action="store_true")
     args = parser.parse_args()
     main(args)
diff --git a/public/debates/audio/chega-vs-il.mp3 b/public/debates/audio/chega-vs-il.mp3
diff --git a/public/debates/chega-vs-il.json b/public/debates/chega-vs-il.json
@@ -0,0 +1,5 @@
+{
+    "slug": "chega-vs-il",
+    "title": "Chega vs IL",
+    "original_url": "https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias"
+}