debate processing barebones

colobas · Feb 6, 2024 · d2ed865 · d2ed865
1 parent eb2f8f9
commit d2ed865
Show file tree

Hide file tree

Showing 5 changed files with 258 additions and 39 deletions.
diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml
@@ -0,0 +1,36 @@
+name: Process Debates
+
+#on:
+#  push:
+#    paths:
+#      - 'debates.yaml'
+#
+jobs:
+  process-debates:
+    defaults:
+      run:
+        shell: bash -leo pipefail {0}
+
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: mamba-org/setup-micromamba@v1
+        name: install whisperx and other deps
+        with:
+          micromamba-version: '1.3.1-0'
+          environment-file: environment.yml
+          init-shell: >-
+            bash
+          cache-environment: true
+          post-cleanup: 'all'
+
+      - name: Process Debates
+        # process_debates.py only defines the --input / --output_root options
+        # (no positional arguments), so the paths have to be passed by flag.
+        run: python process_debates.py --input debates.yaml --output_root src/static/debates
+        env:
+          # transcribe_audio reads the Hugging Face token from HF_TOKEN;
+          # assumes a repository secret with this name has been configured.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+      - name: Commit and Push Changes
+        uses: EndBug/add-and-commit@v7
+        with:
+          message: 'Update debates data'
+          # stage the per-debate outputs and the master listing that
+          # process_debates.py writes to src/debates.json by default
+          # (the file imported by src/App.svelte)
+          add: 'src/static/debates/ src/debates.json'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/debates.yaml b/debates.yaml
@@ -0,0 +1,8 @@
+- title: PS vs IL
+  url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5
+
+- title: Chega vs PAN
+  url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+
+- title: PCP vs PAN
+  url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,13 @@
+name: debates
+channels:
+  - conda-forge
+  - defaults
+  - pytorch
+dependencies:
+  - python=3.10
+  - pytorch=2.0.0
+  - ffmpeg=6.1.1
+  - pip:
+    - beautifulsoup4
+    - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
+    - torchaudio==2.0.0
diff --git a/process_debates.py b/process_debates.py
@@ -0,0 +1,179 @@
+import os
+import argparse
+from pathlib import Path
+import yaml
+import json
+import subprocess
+import bs4
+import requests
+import logging
+import re
+
+
+def _sic_stream_urls(page_html):
+    """
+    Extract (m3u8 url, thumbnail url) from the HTML of a SIC Noticias page.
+    Returns (None, None) when no usable VideoObject description is found.
+    """
+
+    # the m3u8 url and the thumbnail url are in a script tag of type "application/ld+json"
+    bs = bs4.BeautifulSoup(page_html, "html.parser")
+    for script in bs.find_all("script", type="application/ld+json"):
+        if not script.string:
+            # empty tag (or one with nested nodes): nothing to parse
+            continue
+        try:
+            data = json.loads(script.string)
+        except json.JSONDecodeError:
+            continue
+        # a JSON-LD block may be a list or lack "@type"; only a VideoObject
+        # that carries both urls is usable
+        if not isinstance(data, dict) or data.get("@type") != "VideoObject":
+            continue
+        content_url = data.get("contentUrl")
+        thumbnail_url = data.get("thumbnailUrl")
+        if content_url and thumbnail_url:
+            return content_url, thumbnail_url
+
+    return None, None
+
+
+def _rtp_stream_urls(page_text):
+    """
+    Build (m3u8 url, thumbnail url) from the source of an RTP Play page.
+    Returns (None, None) when the page does not contain the expected marker.
+    """
+
+    # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+    # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
+    # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
+    #
+    # we can find the necessary reference from a string like:
+    # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"
+
+    match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", page_text)
+    if match is None:
+        return None, None
+
+    # keep the file name only and drop its trailing "_preview" component
+    parts = match.group(1).split("/")[-1].split("_")[:-1]
+    if not parts:
+        return None, None
+    series = parts[0]
+    ref = "_".join(parts)
+
+    m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
+    thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
+    return m3u8_url, thumbnail_url
+
+
+def find_m3u8_and_thumbnail(url, timeout=30):
+    """
+    Find the m3u8 link and thumbnail from a given debate URL. This will depend on
+    the specific website where the debates are hosted
+
+    url: address of the debate page (sicnoticias.pt and rtp.pt are supported)
+    timeout: seconds to wait for the page before requests gives up, so a
+        stalled server cannot hang the whole run
+
+    Returns a (m3u8_url, thumbnail_url) tuple, or (None, None) when the site is
+    not supported or the page does not contain the expected information.
+    Network errors raised by requests are propagated to the caller.
+    """
+
+    if "sicnoticias.pt" in url:
+        return _sic_stream_urls(requests.get(url, timeout=timeout).text)
+    if "rtp.pt" in url:
+        return _rtp_stream_urls(requests.get(url, timeout=timeout).text)
+
+    return None, None
+
+
+def get_audio(url, output_path):
+    """
+    Use ffmpeg to download the audio from a given m3u8 link
+
+    url: address of the m3u8 playlist, passed to ffmpeg as its input
+    output_path: file to write; its extension tells ffmpeg the output format
+
+    Does nothing when output_path already exists. Raises
+    subprocess.CalledProcessError when ffmpeg exits with an error.
+    """
+
+    output_path = Path(output_path)
+    if output_path.exists():
+        return
+
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+
+    cmd = [
+        "ffmpeg",
+        "-i",
+        url,
+        str(output_path),
+    ]
+
+    try:
+        subprocess.run(cmd, check=True)
+    except (subprocess.CalledProcessError, KeyboardInterrupt):
+        # a partial file would pass the "already exists" check above on the
+        # next run and be mistaken for a finished download
+        output_path.unlink(missing_ok=True)
+        raise
+
+
+def slugify(title):
+    """
+    Build the slug for a title: lower-case it and put a hyphen wherever the
+    title has a space character
+    """
+
+    lowered = title.lower()
+    return "-".join(lowered.split(" "))
+
+
+def transcribe_audio(audio_path, output_root):
+    """
+    use whisperx to transcribe the audio
+
+    audio_path: audio file to transcribe; its stem names the output files
+    output_root: directory under which "transcriptions/" is created
+
+    Does nothing when "transcriptions/<stem>.txt" already exists. Raises
+    KeyError when the HF_TOKEN environment variable is not set, and
+    subprocess.CalledProcessError when whisperx exits with an error.
+    """
+
+    audio_path = Path(audio_path)
+    transcriptions_dir = Path(output_root) / "transcriptions"
+
+    if (transcriptions_dir / f"{audio_path.stem}.txt").exists():
+        return
+
+    # fail before doing any work if the token passed to --hf_token is missing
+    try:
+        hf_token = os.environ["HF_TOKEN"]
+    except KeyError as err:
+        raise KeyError(
+            "HF_TOKEN environment variable is required to run whisperx"
+        ) from err
+
+    transcriptions_dir.mkdir(exist_ok=True, parents=True)
+
+    cmd = [
+        "whisperx",
+        "--hf_token",
+        hf_token,
+        "--model",
+        "large-v2",
+        "--language",
+        "pt",
+        "--diarize",
+        "--min_speakers",
+        "2",
+        "--max_speakers",
+        "4",
+        "--compute_type",
+        "int8",
+        "--output_dir",
+        str(transcriptions_dir),
+        "--print_progress",
+        "True",
+        str(audio_path),
+    ]
+
+    # check=True: a failed transcription must not be silently treated as done
+    subprocess.run(cmd, check=True)
+
+    # keep only the .txt and .srt files
+    for f in transcriptions_dir.glob("*"):
+        if f.is_file() and f.suffix not in [".txt", ".srt"]:
+            f.unlink()
+
+
+def process_debate(*, title, url, output_root):
+    """
+    Run the pipeline for a single debate: locate its stream, fetch the audio,
+    transcribe it and write a "<slug>.json" description under output_root.
+
+    Returns the summary dict (title, thumbnail, slug) for the master listing,
+    or None when the stream or the thumbnail could not be located.
+    """
+
+    stream_url, thumb_url = find_m3u8_and_thumbnail(url)
+    if not (stream_url is not None and thumb_url is not None):
+        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
+        return
+
+    slug = slugify(title)
+    wav_path = output_root / f"audio/{slug}.wav"
+
+    # both steps return early when their output is already on disk
+    get_audio(stream_url, wav_path)
+    transcribe_audio(wav_path, output_root)
+
+    txt_path = output_root / f"transcriptions/{slug}.txt"
+    srt_path = output_root / f"transcriptions/{slug}.srt"
+    record = {
+        "slug": slug,
+        "title": title,
+        "original_url": url,
+        "m3u8_url": stream_url,
+        "audio_path": str(wav_path),
+        "transcription_txt": str(txt_path),
+        "transcription_srt": str(srt_path),
+    }
+
+    with open(output_root / f"{slug}.json", "w") as handle:
+        handle.write(json.dumps(record, indent=4))
+
+    return {"title": title, "thumbnail": thumb_url, "slug": slug}
+
+
+def _load_previous_summaries(master_path):
+    """
+    Read an existing master JSON and return its entries keyed by slug.
+    Returns an empty dict when the file is missing or not in the expected shape.
+    """
+
+    try:
+        with open(master_path, "r") as f:
+            entries = json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError):
+        return {}
+
+    if not isinstance(entries, list):
+        return {}
+    return {
+        entry["slug"]: entry
+        for entry in entries
+        if isinstance(entry, dict) and "slug" in entry
+    }
+
+
+def main(args):
+    """
+    Process every debate listed in the input YAML and write the master JSON.
+
+    args: parsed command-line namespace with the attributes input,
+        output_root, output_master_json and force
+    """
+
+    input_path = Path(args.input)
+    output_root = Path(args.output_root)
+    output_root.mkdir(exist_ok=True, parents=True)
+    master_path = Path(args.output_master_json)
+
+    with open(input_path, "r") as f:
+        # an empty YAML file loads as None
+        data = yaml.safe_load(f) or []
+
+    previous = _load_previous_summaries(master_path)
+
+    master_json = []
+    for debate in data:
+        # the per-debate file is named after the slug, not after the raw entry
+        slug = slugify(debate["title"])
+        output_path = output_root / f"{slug}.json"
+
+        # an already processed debate keeps its previous entry in the listing;
+        # if that entry is unknown it is processed again to rebuild it
+        if output_path.exists() and not args.force and slug in previous:
+            master_json.append(previous[slug])
+            continue
+
+        summary = process_debate(**debate, output_root=output_root)
+        if summary is None:
+            # process_debate already logged why this debate was unusable
+            continue
+        master_json.append(summary)
+
+    master_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(master_path, "w") as f:
+        json.dump(master_json, f, indent=4)
+
+
+if __name__ == "__main__":
+    # command-line entry point; every option has a default, so the script can
+    # be run without arguments
+    parser = argparse.ArgumentParser(
+        description="Download and transcribe the debates listed in a YAML file"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="debates.yaml",
+        help="YAML file listing the debates (title and url)",
+    )
+    # both spellings are accepted for the two output options: the original
+    # flags mixed an underscore (--output_root) with hyphens
+    # (--output-master-json)
+    parser.add_argument(
+        "--output_root",
+        "--output-root",
+        type=str,
+        default="src/static/debates",
+        help="directory receiving the audio, transcriptions and per-debate JSON",
+    )
+    parser.add_argument(
+        "--output-master-json",
+        "--output_master_json",
+        type=str,
+        default="src/debates.json",
+        help="JSON file listing every processed debate",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="process debates again even if their JSON output already exists",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/src/App.svelte b/src/App.svelte
@@ -1,47 +1,30 @@
 <script>
-  import svelteLogo from './assets/svelte.svg'
-  import viteLogo from '/vite.svg'
-  import Counter from './lib/Counter.svelte'
+  // Import the debates data from the JSON file
+  import debates from './debates.json';
 </script>
 
-<main>
-  <div>
-    <a href="https://vitejs.dev" target="_blank" rel="noreferrer">
-      <img src={viteLogo} class="logo" alt="Vite Logo" />
-    </a>
-    <a href="https://svelte.dev" target="_blank" rel="noreferrer">
-      <img src={svelteLogo} class="logo svelte" alt="Svelte Logo" />
-    </a>
-  </div>
-  <h1>Vite + Svelte</h1>
-
-  <div class="card">
-    <Counter />
-  </div>
-
-  <p>
-    Check out <a href="https://github.com/sveltejs/kit#readme" target="_blank" rel="noreferrer">SvelteKit</a>, the official Svelte app framework powered by Vite!
-  </p>
-
-  <p class="read-the-docs">
-    Click on the Vite and Svelte logos to learn more
-  </p>
-</main>
-
 <style>
-  .logo {
-    height: 6em;
-    padding: 1.5em;
-    will-change: filter;
-    transition: filter 300ms;
-  }
-  .logo:hover {
-    filter: drop-shadow(0 0 2em #646cffaa);
+  .grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+    gap: 16px;
+    padding: 20px;
   }
-  .logo.svelte:hover {
-    filter: drop-shadow(0 0 2em #ff3e00aa);
+  .debate {
+    cursor: pointer;
   }
-  .read-the-docs {
-    color: #888;
+  img {
+    width: 100%;
+    height: auto;
+    border-radius: 8px;
   }
 </style>
+
+<div class="grid">
+  {#each debates as { title, thumbnail, slug }}
+    <div class="debate" on:click={() => (window.location.href = `/${slug}`)}>
+      <img src={thumbnail} alt={title} />
+      <p>{title}</p>
+    </div>
+  {/each}
+</div>