Vite + Svelte
- -- Check out SvelteKit, the official Svelte app framework powered by Vite! -
- -- Click on the Vite and Svelte logos to learn more -
-diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml
new file mode 100644
index 0000000..adb3168
--- /dev/null
+++ b/.github/workflows/process-debates.yaml
@@ -0,0 +1,36 @@
# Downloads and transcribes the debates listed in debates.yaml, then commits
# the generated files back to the repository.
name: Process Debates

# NOTE(review): the trigger is commented out; GitHub Actions requires an `on:`
# key, so this workflow stays inactive until it is restored.
#on:
#  push:
#    paths:
#      - 'debates.yaml'
#
jobs:
  process-debates:
    defaults:
      run:
        # login shell so the micromamba environment is activated in each step
        shell: bash -leo pipefail {0}

    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: mamba-org/setup-micromamba@v1
        name: install whisperx and other deps
        with:
          micromamba-version: '1.3.1-0'
          environment-file: environment.yml
          init-shell: >-
            bash
          cache-environment: true
          post-cleanup: 'all'

      - name: Process Debates
        # process_debates.py only defines --input / --output_root options;
        # bare positional arguments would be rejected by argparse.
        run: python process_debates.py --input debates.yaml --output_root src/static/debates

      - name: Commit and Push Changes
        uses: EndBug/add-and-commit@v7
        with:
          message: 'Update debates data'
          # src/debates.json is the script's default --output-master-json
          # target and lives outside src/static/debates/.
          add: 'src/static/debates/ src/debates.json'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/debates.yaml b/debates.yaml
new file mode 100644
index 0000000..e23af70
--- /dev/null
+++ b/debates.yaml
@@ -0,0 +1,8 @@
+- title: PS vs IL
+ url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5
+
+- title: Chega vs PAN
+ url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+
+- title: PCP vs PAN
+ url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..91b5e47
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,13 @@
# Conda environment used to run process_debates.py.
name: debates
channels:
  - conda-forge
  - defaults
  - pytorch-nightly
dependencies:
  - python=3.10
  - pytorch=2.0.0
  - torchaudio=2.0.0
  - ffmpeg=6.1.1
  - pip:
      - beautifulsoup4
      # imported directly by process_debates.py (`import yaml`,
      # `import requests`), so declare them instead of relying on
      # transitive installs
      - pyyaml
      - requests
      - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
diff --git a/process_debates.py b/process_debates.py
new file mode 100644
index 0000000..44cb0a1
--- /dev/null
+++ b/process_debates.py
@@ -0,0 +1,176 @@
+import argparse
+from pathlib import Path
+import yaml
+import json
+import subprocess
+import bs4
+import requests
+import logging
+import re
+
+
# Seconds to wait for a debate page before giving up; without a timeout
# requests.get can block indefinitely.
_REQUEST_TIMEOUT = 30


def _find_sicnoticias_video(url):
    """Read the m3u8 and thumbnail URLs from a SIC Noticias page's JSON-LD."""
    # the m3u8 url and the thumbnail url are in a script tag of type
    # "application/ld+json"
    page = requests.get(url, timeout=_REQUEST_TIMEOUT).text
    soup = bs4.BeautifulSoup(page, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        # .string is None for empty tags, which json.loads cannot take
        if not script.string:
            continue
        try:
            data = json.loads(script.string)
        except json.JSONDecodeError:
            continue
        # skip JSON-LD payloads that are lists or describe something else
        if not isinstance(data, dict) or data.get("@type") != "VideoObject":
            continue
        if "contentUrl" in data and "thumbnailUrl" in data:
            return data["contentUrl"], data["thumbnailUrl"]
    return None, None


def _find_rtp_video(url):
    """Derive the m3u8 and thumbnail URLs from an RTP Play page."""
    # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
    # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
    # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
    #
    # we can find the necessary reference from a string like:
    # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"
    text = requests.get(url, timeout=_REQUEST_TIMEOUT).text
    match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", text)
    if match is None:
        return None, None

    # the file name ends in "_preview"; drop that last component
    parts = match.group(1).split("/")[-1].split("_")[:-1]
    if not parts:
        return None, None
    series = parts[0]
    ref = "_".join(parts)

    m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
    thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
    return m3u8_url, thumbnail_url


def find_m3u8_and_thumbnail(url):
    """
    Find the m3u8 link and thumbnail from a given debate URL.

    How the links are located depends on the website where the debate is
    hosted; only sicnoticias.pt and rtp.pt pages are recognised.

    Args:
        url: Address of the debate page.

    Returns:
        A ``(m3u8_url, thumbnail_url)`` tuple, or ``(None, None)`` when the
        host is not recognised or the page lacks the expected metadata.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request times out).
    """

    if "sicnoticias.pt" in url:
        return _find_sicnoticias_video(url)
    if "rtp.pt" in url:
        return _find_rtp_video(url)
    return None, None
+
+
def get_audio(url, output_path):
    """
    Use ffmpeg to download the audio from a given m3u8 link.

    Nothing is done when ``output_path`` already exists, so finished
    downloads are reused across runs.

    Args:
        url: The m3u8 stream to read from.
        output_path: File to write; its parent directories are created.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """

    output_path = Path(output_path)
    if output_path.exists():
        return

    output_path.parent.mkdir(exist_ok=True, parents=True)

    cmd = [
        "ffmpeg",
        "-i",
        url,
        str(output_path),
    ]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, KeyboardInterrupt):
        # a partial file would make the next run skip this download
        output_path.unlink(missing_ok=True)
        raise
+
+
def slugify(title):
    """
    Turn a title into a slug that is safe to use as a file name.

    The title is lower-cased and every space becomes a hyphen. Path
    separators ("/" and "\\") are also replaced with hyphens, because the
    slug is used to build output file names and a separator would point
    into a sub-directory that does not exist.

    Args:
        title: The debate title.

    Returns:
        The slug as a string.
    """

    replacements = str.maketrans({" ": "-", "/": "-", "\\": "-"})
    return title.lower().translate(replacements)
+
+
def transcribe_audio(audio_path, output_root):
    """
    Use whisperx to transcribe the audio.

    Output goes to ``<output_root>/transcriptions``. Nothing is done when
    the ``.txt`` transcription for this audio file already exists.

    Args:
        audio_path: Audio file to transcribe; its stem names the outputs.
        output_root: Directory that holds the ``transcriptions`` folder.

    Raises:
        subprocess.CalledProcessError: If whisperx exits with a non-zero
            status.
    """

    audio_path = Path(audio_path)
    transcriptions_dir = Path(output_root) / "transcriptions"

    if (transcriptions_dir / f"{audio_path.stem}.txt").exists():
        return

    transcriptions_dir.mkdir(exist_ok=True, parents=True)

    cmd = [
        "whisperx",
        "--model",
        "large-v2",
        "--language",
        "pt",
        "--diarize",
        "--min_speakers",
        "2",
        "--max_speakers",
        "4",
        "--compute_type",
        "int8",
        "--output_dir",
        str(transcriptions_dir),
        "--print_progress",
        "True",
        str(audio_path),
    ]

    try:
        # check=True so a failed transcription is not mistaken for success
        subprocess.run(cmd, check=True)
    finally:
        # keep only the .txt and .srt files
        for f in transcriptions_dir.glob("*"):
            if f.is_file() and f.suffix not in (".txt", ".srt"):
                f.unlink()
+
+
def process_debate(*, title, url, output_root):
    """
    Download, transcribe and describe a single debate.

    The stream and thumbnail are looked up from ``url``; the audio and its
    transcription are produced under ``output_root``; and a
    ``<slug>.json`` file describing the generated files is written next
    to them.

    Returns:
        A dict with the debate's ``title``, ``thumbnail`` and ``slug``, or
        ``None`` when no stream or thumbnail could be found for ``url``.
    """

    stream_url, thumb_url = find_m3u8_and_thumbnail(url)
    if m3u8_missing := (stream_url is None or thumb_url is None):
        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
        return

    slug = slugify(title)
    wav_file = output_root / f"audio/{slug}.wav"

    # both steps are no-ops when their output already exists
    get_audio(stream_url, wav_file)
    transcribe_audio(wav_file, output_root)

    transcripts_dir = "transcriptions"
    metadata = {
        "slug": slug,
        "title": title,
        "original_url": url,
        "m3u8_url": stream_url,
        "audio_path": str(wav_file),
        "transcription_txt": str(output_root / f"{transcripts_dir}/{slug}.txt"),
        "transcription_srt": str(output_root / f"{transcripts_dir}/{slug}.srt"),
    }

    with open(output_root / f"{slug}.json", "w") as handle:
        handle.write(json.dumps(metadata, indent=4))

    return dict(title=title, thumbnail=thumb_url, slug=slug)
+
+
def _load_previous_summaries(master_path):
    """Return the summaries of an earlier master JSON, keyed by slug."""
    try:
        with open(master_path, "r") as f:
            entries = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    if not isinstance(entries, list):
        return {}
    return {
        entry["slug"]: entry
        for entry in entries
        if isinstance(entry, dict) and "slug" in entry
    }


def main(args):
    """
    Process every debate listed in the input YAML and write the master JSON.

    A debate is skipped when its ``<slug>.json`` already exists and its
    summary is available from the previous master JSON, unless
    ``args.force`` is set. Skipped debates keep their previous summary, so
    the master JSON always lists every debate that has been processed.

    Args:
        args: Parsed command-line arguments providing ``input``,
            ``output_root``, ``output_master_json`` and ``force``.
    """
    input_path = Path(args.input)
    output_root = Path(args.output_root)
    output_root.mkdir(exist_ok=True, parents=True)

    with open(input_path, "r") as f:
        # an empty YAML file loads as None
        data = yaml.safe_load(f) or []

    previous = _load_previous_summaries(args.output_master_json)

    master_json = []
    for debate in data:
        # the per-debate file is named after the slug, not the whole entry
        slug = slugify(debate["title"])
        output_path = output_root / f"{slug}.json"

        if output_path.exists() and not args.force and slug in previous:
            master_json.append(previous[slug])
            continue

        summary = process_debate(**debate, output_root=output_root)
        # process_debate returns None when no stream could be found
        if summary is not None:
            master_json.append(summary)

    with open(args.output_master_json, "w") as f:
        json.dump(master_json, f, indent=4)
+
+
if __name__ == "__main__":
    # Command-line entry point. Every option has a default, so the script
    # can be run without any arguments.
    cli = argparse.ArgumentParser()
    string_options = (
        ("--input", "debates.yaml"),
        ("--output_root", "src/static/debates"),
        ("--output-master-json", "src/debates.json"),
    )
    for flag, fallback in string_options:
        cli.add_argument(flag, type=str, default=fallback)
    cli.add_argument("--force", action="store_true")
    main(cli.parse_args())
diff --git a/src/App.svelte b/src/App.svelte
index 1f31354..7aff68c 100644
--- a/src/App.svelte
+++ b/src/App.svelte
@@ -1,47 +1,30 @@
-
- Check out SvelteKit, the official Svelte app framework powered by Vite!
-
- Click on the Vite and Svelte logos to learn more
- Vite + Svelte
-
-
{title}
+