Vite + Svelte
- -- Check out SvelteKit, the official Svelte app framework powered by Vite! -
- -- Click on the Vite and Svelte logos to learn more -
-diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml
new file mode 100644
index 0000000..adb3168
--- /dev/null
+++ b/.github/workflows/process-debates.yaml
@@ -0,0 +1,36 @@
+# Workflow that runs the debate-processing step and commits whatever it
+# generated under src/static/debates/ back to the repository.
+name: Process Debates
+# Trigger kept for reference but currently commented out: run on pushes that
+# touch debates.yaml.
+# push:
+# paths:
+# - 'debates.yaml'
+# NOTE(review): no `on:` or `jobs:` key is visible around this job -- confirm the
+# workflow header is intact before relying on this file.
+ process-debates:
+ defaults:
+ run:
+# Login shell (-l) with errexit (-e) and pipefail; presumably the login shell is
+# what activates the micromamba environment for later steps -- TODO confirm.
+ shell: bash -leo pipefail {0}
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: mamba-org/setup-micromamba@v1
+ name: install whisperx and other deps
+ with:
+ micromamba-version: '1.3.1-0'
+# The environment is created from environment.yml in the repository root.
+ environment-file: environment.yml
+ init-shell: >-
+ bash
+ cache-environment: true
+ post-cleanup: 'all'
+ - name: Process Debates
+# NOTE(review): no script path follows `python`, so as written debates.yaml
+# itself would be handed to the interpreter as the program -- confirm the
+# intended command line.
+ run: python debates.yaml src/static/debates
+ - name: Commit and Push Changes
+ uses: EndBug/add-and-commit@v7
+ with:
+ message: 'Update debates data'
+# Only the generated debate files are staged for the commit.
+ add: 'src/static/debates/'
+# NOTE(review): `env:` has no entries here -- confirm whether a token or other
+# variable was meant to be set.
+ env:
diff --git a/debates.yaml b/debates.yaml
new file mode 100644
index 0000000..e23af70
--- /dev/null
+++ b/debates.yaml
@@ -0,0 +1,8 @@
+# List of debates to process; each entry carries a `title` and a `url`.
+# NOTE(review): every `url` below is empty -- fill in the page address of each
+# debate before running the processing step.
+- title: PS vs IL
+ url:
+- title: Chega vs PAN
+ url:
+- title: PCP vs PAN
+ url:
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..91b5e47
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,13 @@
+# Conda environment definition named "debates".
+name: debates
+# NOTE(review): the `channels:` and `dependencies:` keys are not visible around
+# the list items below -- confirm the file structure is intact.
+ - conda-forge
+ - defaults
+ - pytorch-nightly
+# Pinned interpreter, PyTorch/torchaudio and ffmpeg versions.
+ - python=3.10
+ - pytorch=2.0.0
+ - torchaudio=2.0.0
+ - ffmpeg=6.1.1
+# Packages installed through pip rather than conda.
+ - pip:
+ - beautifulsoup4
+# NOTE(review): the requirement below stops right after `git+`; the repository
+# URL appears to be missing -- TODO restore.
+ - git+
diff --git a/ b/
new file mode 100644
index 0000000..44cb0a1
--- /dev/null
+++ b/
@@ -0,0 +1,176 @@
+import argparse
+from pathlib import Path
+import yaml
+import json
+import subprocess
+import bs4
+import requests
+import logging
+import re
+def find_m3u8_and_thumbnail(url):
+ """
+ Find the m3u8 link and thumbnail from a given debate URL. This will depend on
+ the specific website where the debates are hosted.
+
+ Two page layouts are handled, selected by a substring test on ``url``: one
+ reads the values from an "application/ld+json" script tag, the other
+ derives them from the "seekBarThumbnailsLoc" entry in the page source.
+
+ Returns a ``(m3u8_url, thumbnail_url)`` tuple, or ``(None, None)`` when no
+ branch returns a result.
+ """
+ # NOTE(review): the substring tested here is empty, and "" is contained in
+ # every string, so this branch is always taken and the elif below can never
+ # run. The host name appears to have been lost -- TODO restore it.
+ if "" in url:
+ # sic noticias
+ # the m3u8 url and the thumbnail url are in a script tag of type "application/ld+json"
+ # NOTE(review): the request has no timeout and the HTTP status is not checked.
+ bs = bs4.BeautifulSoup(requests.get(url).text, "html.parser")
+ scripts = bs.find_all("script", type="application/ld+json")
+ for script in scripts:
+ # read json
+ # NOTE(review): a JSON object without an "@type" key raises KeyError on the
+ # lookup below -- confirm every ld+json block on these pages carries one.
+ data = json.loads(script.string)
+ # The first block describing a VideoObject supplies both values.
+ if data["@type"] == "VideoObject":
+ return data["contentUrl"], data["thumbnailUrl"]
+ # NOTE(review): this test also uses an empty substring -- see the note above.
+ elif "" in url:
+ #
+ # m3u8:,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
+ # thumbnail:
+ #
+ # we can find the necessary reference from a string like:
+ # "seekBarThumbnailsLoc: '//',"
+ text = requests.get(url).text
+ # Capture the text before ".vtt", keep its last path component, split it on
+ # "_" and drop the final piece. The [0] raises IndexError when the page
+ # contains no match.
+ ref = re.findall(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", text)[0].split("/")[-1].split("_")[:-1] # it ends in "_preview.vtt"
+ # The first piece is the series id (e.g. "p12900" in the example above).
+ series = ref[0]
+ ref = "_".join(ref)
+ # NOTE(review): both templates below start without a scheme or host; the URL
+ # prefixes appear to be missing -- TODO restore.
+ m3u8_url = f",/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
+ thumbnail_url = f"{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
+ return m3u8_url, thumbnail_url
+ # Nothing usable was found for this URL.
+ return None, None
+def get_audio(url, output_path):
+    """
+    Use ffmpeg to download the audio from a given m3u8 link.
+
+    Args:
+        url: Location of the stream, passed to ffmpeg as its ``-i`` input.
+        output_path: Destination file (``str`` or ``Path``). Missing parent
+            directories are created.
+
+    Returns:
+        None. When ``output_path`` already exists nothing is downloaded.
+
+    Raises:
+        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
+        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
+    """
+    output_path = Path(output_path)
+    if output_path.exists():
+        return
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+    cmd = [
+        "ffmpeg",
+        "-i",
+        url,
+        str(output_path),
+    ]
+    # The command used to be assembled and then dropped, so no audio file was
+    # ever written. Run it and fail loudly instead of continuing without it.
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError:
+        # A half-written file would make the next run skip the download.
+        output_path.unlink(missing_ok=True)
+        raise
+def slugify(title):
+    """
+    Turn a title into a slug.
+
+    The title is lower-cased and every space becomes a hyphen. Path
+    separators are replaced by hyphens as well, because the slug is used as a
+    file name and a separator would turn it into nested directories.
+
+    Args:
+        title: Human-readable debate title.
+
+    Returns:
+        The slug as a string, e.g. ``"PS vs IL"`` -> ``"ps-vs-il"``.
+    """
+    slug = title.lower().replace(" ", "-")
+    return slug.replace("/", "-").replace("\\", "-")
+def transcribe_audio(audio_path, output_root):
+    """
+    Use whisperx to transcribe the audio.
+
+    The transcription is written to ``<output_root>/transcriptions``. After
+    whisperx finishes, every file in that directory that is not a ``.txt`` or
+    ``.srt`` file is deleted.
+
+    Args:
+        audio_path: Audio file to transcribe (``str`` or ``Path``); its stem
+            names the transcription files.
+        output_root: Directory under which ``transcriptions/`` lives.
+
+    Returns:
+        None. When ``transcriptions/<stem>.txt`` already exists nothing runs.
+
+    Raises:
+        subprocess.CalledProcessError: If whisperx exits with a non-zero status.
+        FileNotFoundError: If the ``whisperx`` executable cannot be found.
+    """
+    audio_path = Path(audio_path)
+    transcriptions_dir = Path(output_root) / "transcriptions"
+    name = audio_path.stem
+    if (transcriptions_dir / f"{name}.txt").exists():
+        return
+    transcriptions_dir.mkdir(exist_ok=True, parents=True)
+    cmd = [
+        "whisperx",
+        "--model",
+        "large-v2",
+        "--language",
+        "pt",
+        "--diarize",
+        "--min_speakers",
+        "2",
+        "--max_speakers",
+        "4",
+        "--compute_type",
+        "int8",
+        "--output_dir",
+        str(transcriptions_dir),
+        "--print_progress",
+        "True",
+        str(audio_path),
+    ]
+    # The command used to be assembled and then dropped, so nothing was ever
+    # transcribed. Run it and stop on failure.
+    subprocess.run(cmd, check=True)
+    # keep only the .txt and .srt files
+    for f in transcriptions_dir.glob("*"):
+        # unlink() raises on directories, so only regular files are removed.
+        if f.is_file() and f.suffix not in (".txt", ".srt"):
+            f.unlink()
+def process_debate(*, title, url, output_root):
+    """
+    Process a debate from the input data.
+
+    Looks up the stream and thumbnail for ``url``, fetches the audio,
+    transcribes it and writes ``<output_root>/<slug>.json`` describing the
+    debate.
+
+    Args:
+        title: Debate title; its slug names every output file.
+        url: Page of the debate.
+        output_root: ``Path`` of the directory receiving the outputs.
+
+    Returns:
+        A dict with ``title``, ``thumbnail`` and ``slug``, or None (after
+        logging a warning) when no stream or thumbnail could be found.
+    """
+    stream_url, thumb_url = find_m3u8_and_thumbnail(url)
+    if stream_url is None or thumb_url is None:
+        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
+        return
+
+    slug = slugify(title)
+    wav_file = output_root / f"audio/{slug}.wav"
+    transcript_dir = output_root / "transcriptions"
+
+    get_audio(stream_url, wav_file)
+    transcribe_audio(wav_file, output_root)
+
+    # Per-debate record; the key order matches the written JSON file.
+    record = {
+        "slug": slug,
+        "title": title,
+        "original_url": url,
+        "m3u8_url": stream_url,
+        "audio_path": str(wav_file),
+        "transcription_txt": str(transcript_dir / f"{slug}.txt"),
+        "transcription_srt": str(transcript_dir / f"{slug}.srt"),
+    }
+    with open(output_root / f"{slug}.json", "w") as handle:
+        json.dump(record, handle, indent=4)
+
+    return {"title": title, "thumbnail": thumb_url, "slug": slug}
+def main(args):
+    """
+    Process every debate in the input YAML and write the master JSON list.
+
+    Args:
+        args: Parsed command-line namespace providing ``input``,
+            ``output_root``, ``output_master_json`` and ``force``.
+
+    A debate is skipped when its ``<slug>.json`` already exists, ``force`` is
+    not set and its summary can be carried over from the previous master
+    JSON. Every other debate is processed again. Debates that could not be
+    processed are left out of the master list.
+    """
+    input_path = Path(args.input)
+    output_root = Path(args.output_root)
+    master_path = Path(args.output_master_json)
+    output_root.mkdir(exist_ok=True, parents=True)
+    with open(input_path, "r") as f:
+        # An empty YAML file loads as None; treat it as "no debates".
+        data = yaml.safe_load(f) or []
+
+    # Summaries from the previous run, keyed by slug, so skipped debates stay
+    # in the master list instead of disappearing from it.
+    previous = {}
+    if master_path.exists():
+        with open(master_path, "r") as f:
+            previous = {
+                entry["slug"]: entry
+                for entry in json.load(f)
+                if isinstance(entry, dict) and "slug" in entry
+            }
+
+    master_json = []
+    for debate in data:
+        # The per-debate file is named after the slug, not the whole entry.
+        slug = slugify(debate["title"])
+        already_done = (output_root / f"{slug}.json").exists()
+        if already_done and slug in previous and not args.force:
+            master_json.append(previous[slug])
+            continue
+        summary = process_debate(**debate, output_root=output_root)
+        # process_debate returns None when the stream could not be found.
+        if summary is not None:
+            master_json.append(summary)
+
+    master_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(master_path, "w") as f:
+        json.dump(master_json, f, indent=4)
+if __name__ == "__main__":
+    # Command-line entry point. Both the underscore and the hyphen spelling of
+    # the multi-word options are accepted; the first spelling listed decides
+    # the attribute name, so ``args.output_root`` and
+    # ``args.output_master_json`` are unchanged.
+    parser = argparse.ArgumentParser(
+        description="Download, transcribe and index the debates listed in a YAML file."
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="debates.yaml",
+        help="YAML file listing the debates (title and url).",
+    )
+    parser.add_argument(
+        "--output_root",
+        "--output-root",
+        type=str,
+        default="src/static/debates",
+        help="Directory receiving audio, transcriptions and per-debate JSON.",
+    )
+    parser.add_argument(
+        "--output-master-json",
+        "--output_master_json",
+        type=str,
+        default="src/debates.json",
+        help="Path of the JSON file listing all processed debates.",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Process debates even if their output JSON already exists.",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/src/App.svelte b/src/App.svelte
index 1f31354..7aff68c 100644
--- a/src/App.svelte
+++ b/src/App.svelte
@@ -1,47 +1,30 @@
- Check out SvelteKit, the official Svelte app framework powered by Vite!
- Click on the Vite and Svelte logos to learn more
- Vite + Svelte