From 5f34d1f3caf5741674c94e793a9351340eb2e5ee Mon Sep 17 00:00:00 2001
From: Guilherme Pires <gpires@altius.org>
Date: Tue, 6 Feb 2024 12:49:09 -0700
Subject: [PATCH] debate processing barebones

---
 .github/workflows/process-debates.yaml |  36 +++++
 debates.yaml                           |   8 ++
 environment.yml                        |  13 ++
 process_debates.py                     | 183 +++++++++++++++++++++++++
 src/App.svelte                         |  61 +++------
 src/debates.json                       |  17 +++
 6 files changed, 279 insertions(+), 39 deletions(-)
 create mode 100644 .github/workflows/process-debates.yaml
 create mode 100644 debates.yaml
 create mode 100644 environment.yml
 create mode 100644 process_debates.py
 create mode 100644 src/debates.json

diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml
new file mode 100644
index 0000000..adb3168
--- /dev/null
+++ b/.github/workflows/process-debates.yaml
@@ -0,0 +1,36 @@
+name: Process Debates
+
+# `on` is required for a workflow to be valid. The push trigger (paths:
+# 'debates.yaml') stays disabled for now, so runs are started manually.
+on:
+  workflow_dispatch:
+
+jobs:
+  process-debates:
+    defaults:
+      run:
+        shell: bash -leo pipefail {0}
+
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: mamba-org/setup-micromamba@v1
+        name: install whisperx and other deps
+        with:
+          micromamba-version: '1.3.1-0'
+          environment-file: environment.yml
+          init-shell: >-
+            bash
+          cache-environment: true
+          post-cleanup: 'all'
+
+      - name: Process Debates
+        run: python process_debates.py --input debates.yaml --output_root src/static/debates
+
+      - name: Commit and Push Changes
+        uses: EndBug/add-and-commit@v7
+        with:
+          message: 'Update debates data'
+          add: 'src/static/debates/ src/debates.json'  # also commit the master index
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/debates.yaml b/debates.yaml
new file mode 100644
index 0000000..e23af70
--- /dev/null
+++ b/debates.yaml
@@ -0,0 +1,8 @@
+- title: PS vs IL
+  url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5
+
+- title: Chega vs PAN
+  url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+
+- title: PCP vs PAN
+  url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..7523b70
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,15 @@
+name: debates
+channels:
+  - conda-forge
+  - defaults
+  - pytorch
+dependencies:
+  - python=3.10
+  - pytorch=2.0.0
+  - ffmpeg=6.1.1
+  - pip:
+    - beautifulsoup4
+    - pyyaml  # process_debates.py imports yaml
+    - requests  # process_debates.py imports requests
+    - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
+    - torchaudio==2.0.0
diff --git a/process_debates.py b/process_debates.py
new file mode 100644
index 0000000..88e25c3
--- /dev/null
+++ b/process_debates.py
@@ -0,0 +1,183 @@
+import os
+import argparse
+from pathlib import Path
+import yaml
+import json
+import subprocess
+import bs4
+import requests
+import logging
+import re
+
+
+def find_m3u8_and_thumbnail(url):
+    """
+    Find the m3u8 link and thumbnail from a given debate URL. This will depend on
+    the specific website where the debates are hosted.
+
+    Returns (m3u8_url, thumbnail_url); both are None when the host is not
+    supported or the expected markers are missing from the page.
+    """
+    if "sicnoticias.pt" in url:
+        # sic noticias: the m3u8 and thumbnail urls are in a script tag of type "application/ld+json"
+        bs = bs4.BeautifulSoup(requests.get(url, timeout=30).text, "html.parser")
+        for script in bs.find_all("script", type="application/ld+json"):
+            try:
+                data = json.loads(script.string or "")
+            except json.JSONDecodeError:
+                continue  # empty or malformed tag: try the next one
+            # the payload may also be a list, or an object without "@type"
+            if isinstance(data, dict) and data.get("@type") == "VideoObject":
+                return data.get("contentUrl"), data.get("thumbnailUrl")
+    elif "rtp.pt" in url:
+        # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+        # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
+        # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
+        # both are derived from the file name found in a string like:
+        # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"
+        found = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", requests.get(url, timeout=30).text)
+        # the file name ends in "_preview"; what precedes it is the reference
+        ref = found.group(1).split("/")[-1].rpartition("_")[0] if found else ""
+        if ref:
+            series = ref.split("_")[0]
+            m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
+            thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
+            return m3u8_url, thumbnail_url
+    return None, None
+
+
+def get_audio(url, output_path, headers=None):
+    """
+    Use ffmpeg to download the audio from a given m3u8 link into output_path.
+
+    Does nothing if output_path already exists. `headers` is an optional
+    CRLF-separated HTTP header string passed to ffmpeg's -headers option.
+    """
+    output_path = Path(output_path)
+    if output_path.exists():
+        return
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+    extra = [] if headers is None else ["-headers", headers]
+    if subprocess.run(["ffmpeg", *extra, "-i", url, output_path]).returncode != 0:
+        # drop any partial file so the next run retries instead of skipping it
+        output_path.unlink(missing_ok=True)
+        logging.warning(f"ffmpeg failed to download audio from {url}")
+
+
+def slugify(title):
+    """
+    Build a slug from a title: lowercase it and turn every space into "-"
+    """
+    words = title.lower().split(" ")
+    return "-".join(words)
+
+
+def transcribe_audio(audio_path, output_root):
+    """
+    Transcribe an audio file with the whisperx command line tool.
+
+    audio_path and output_root are used as pathlib.Path objects (`.stem`, `/`).
+    Output goes to <output_root>/transcriptions; nothing is done when
+    <stem>.txt already exists there. The Hugging Face token is read from the
+    HF_TOKEN environment variable (KeyError if it is unset).
+    """
+
+    stem = audio_path.stem
+    if (output_root / f"transcriptions/{stem}.txt").exists():
+        return
+
+    out_dir = f"{output_root}/transcriptions"
+    Path(out_dir).mkdir(exist_ok=True, parents=True)
+
+    # whisperx options as (flag, value...) tuples, flattened into argv below
+    options = [
+        ("--hf_token", os.environ["HF_TOKEN"]),
+        ("--model", "large-v2"),
+        ("--language", "pt"),
+        ("--diarize",),
+        ("--min_speakers", "2"),
+        ("--max_speakers", "4"),
+        ("--compute_type", "int8"),
+        ("--output_dir", out_dir),
+        ("--print_progress", "True"),
+    ]
+
+    argv = ["whisperx"]
+    for option in options:
+        argv.extend(option)
+    argv.append(audio_path)
+    subprocess.run(argv)
+
+    # delete every file in the output directory that is not a .txt or .srt
+    for produced in Path(out_dir).glob("*"):
+        if produced.suffix not in (".txt", ".srt"):
+            produced.unlink()
+
+
+def process_debate(*, title, url, output_root):
+    """
+    Download one debate's audio and write its <slug>.json metadata file.
+
+    Returns the {"title", "thumbnail", "slug"} summary dict, or None (after
+    logging a warning) when no m3u8 or thumbnail URL is found for `url`.
+    """
+
+    m3u8_url, thumbnail_url = find_m3u8_and_thumbnail(url)
+    if thumbnail_url is None or m3u8_url is None:
+        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
+        return None
+
+    slug = slugify(title)
+    audio_path = output_root / f"audio/{slug}.wav"
+
+    # RTP streams are fetched with browser-like headers; other hosts get none
+    rtp_headers = "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0\r\nAccept: */*\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate, br\r\nReferer: https://www.rtp.pt/\r\nOrigin: https://www.rtp.pt\r\nDNT: 1\r\nSec-GPC: 1\r\nConnection: keep-alive\r\nSec-Fetch-Dest: empty\r\nSec-Fetch-Mode: cors\r\nSec-Fetch-Site: same-site\r\nTE: trailers\r\n"
+    get_audio(m3u8_url, audio_path, headers=rtp_headers if "rtp.pt" in url else None)
+    #transcribe_audio(audio_path, output_root)
+
+    # one JSON metadata file per debate, written directly under output_root
+    metadata = dict(
+        slug=slug,
+        title=title,
+        original_url=url,
+        m3u8_url=m3u8_url,
+        audio_path=str(audio_path),
+        transcription_txt=str(output_root / f"transcriptions/{slug}.txt"),
+        transcription_srt=str(output_root / f"transcriptions/{slug}.srt"),
+    )
+    with open(output_root / f"{slug}.json", "w") as handle:
+        json.dump(metadata, handle, indent=4)
+
+    return dict(title=title, thumbnail=thumbnail_url, slug=slug)
+
+
+def main(args):
+    """
+    Process the debates listed in the input YAML and rewrite the master JSON.
+    Debates whose <slug>.json exists reuse their old master entry (unless --force).
+    """
+    output_root = Path(args.output_root)
+    output_root.mkdir(exist_ok=True, parents=True)
+    master_path = Path(args.output_master_json)
+    old = json.loads(master_path.read_text()) if master_path.exists() else []
+    old = {entry["slug"]: entry for entry in old if entry}  # previous summaries, by slug
+    master_json = []
+    for debate in yaml.safe_load(Path(args.input).read_text()) or []:
+        slug = slugify(debate["title"])
+        if not args.force and slug in old and (output_root / f"{slug}.json").exists():
+            master_json.append(old[slug])  # already processed: keep its summary
+        else:
+            master_json.append(process_debate(**debate, output_root=output_root))
+    with open(master_path, "w") as f:
+        # process_debate returns None when scraping fails; keep those out
+        json.dump([summary for summary in master_json if summary], f, indent=4)
+
+
+if __name__ == "__main__":
+    # both hyphenated and underscored spellings of the long options are accepted
+    parser = argparse.ArgumentParser(description="Download and index debates")
+    parser.add_argument("--input", default="debates.yaml", help="YAML list of title/url entries")
+    parser.add_argument("--output_root", "--output-root", default="src/static/debates")
+    parser.add_argument("--output-master-json", "--output_master_json", default="src/debates.json")
+    parser.add_argument("--force", action="store_true", help="reprocess existing debates")
+    main(parser.parse_args())
diff --git a/src/App.svelte b/src/App.svelte
index 1f31354..7aff68c 100644
--- a/src/App.svelte
+++ b/src/App.svelte
@@ -1,47 +1,30 @@
 <script>
-  import svelteLogo from './assets/svelte.svg'
-  import viteLogo from '/vite.svg'
-  import Counter from './lib/Counter.svelte'
+  // Import the debates data from the JSON file
+  import debates from './debates.json';
 </script>
 
-<main>
-  <div>
-    <a href="https://vitejs.dev" target="_blank" rel="noreferrer">
-      <img src={viteLogo} class="logo" alt="Vite Logo" />
-    </a>
-    <a href="https://svelte.dev" target="_blank" rel="noreferrer">
-      <img src={svelteLogo} class="logo svelte" alt="Svelte Logo" />
-    </a>
-  </div>
-  <h1>Vite + Svelte</h1>
-
-  <div class="card">
-    <Counter />
-  </div>
-
-  <p>
-    Check out <a href="https://github.com/sveltejs/kit#readme" target="_blank" rel="noreferrer">SvelteKit</a>, the official Svelte app framework powered by Vite!
-  </p>
-
-  <p class="read-the-docs">
-    Click on the Vite and Svelte logos to learn more
-  </p>
-</main>
-
 <style>
-  .logo {
-    height: 6em;
-    padding: 1.5em;
-    will-change: filter;
-    transition: filter 300ms;
-  }
-  .logo:hover {
-    filter: drop-shadow(0 0 2em #646cffaa);
+  .grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+    gap: 16px;
+    padding: 20px;
   }
-  .logo.svelte:hover {
-    filter: drop-shadow(0 0 2em #ff3e00aa);
+  .debate {
+    cursor: pointer;
   }
-  .read-the-docs {
-    color: #888;
+  img {
+    width: 100%;
+    height: auto;
+    border-radius: 8px;
   }
 </style>
+<!-- Thumbnail grid: each card is a real link, so it is focusable and works from the keyboard -->
+<div class="grid">
+  {#each debates as { title, thumbnail, slug }}
+    <a class="debate" href="/{slug}" style="color: inherit; text-decoration: none;">
+      <img src={thumbnail} alt={title} />
+      <p>{title}</p>
+    </a>
+  {/each}
+</div>
diff --git a/src/debates.json b/src/debates.json
new file mode 100644
index 0000000..1230475
--- /dev/null
+++ b/src/debates.json
@@ -0,0 +1,17 @@
+[
+    {
+        "title": "PS vs IL",
+        "thumbnail": "https://images.impresa.pt/sicnot/2024-02-05-Pedro-Nuno-Santos-Rui-Rocha-e66319ff/16x9/mw-1200&outputFormat=jpeg",
+        "slug": "ps-vs-il"
+    },
+    {
+        "title": "Chega vs PAN",
+        "thumbnail": "https://cdn-images.rtp.pt/multimedia/screenshots/p1525/p1525_1_202104205996.jpg?q=100&format=pjpg&auto=webp&v=3&w=400",
+        "slug": "chega-vs-pan"
+    },
+    {
+        "title": "PCP vs PAN",
+        "thumbnail": "https://cdn-images.rtp.pt/multimedia/screenshots/p1525/p1525_1_202104205996.jpg?q=100&format=pjpg&auto=webp&v=3&w=400",
+        "slug": "pcp-vs-pan"
+    }
+]
\ No newline at end of file