-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
255 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
name: Process Debates | ||
|
||
#on: | ||
# push: | ||
# paths: | ||
# - 'debates.yaml' | ||
# | ||
jobs: | ||
process-debates: | ||
defaults: | ||
run: | ||
shell: bash -leo pipefail {0} | ||
|
||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
- uses: mamba-org/setup-micromamba@v1 | ||
name: install whisperx and other deps | ||
with: | ||
micromamba-version: '1.3.1-0' | ||
environment-file: environment.yml | ||
init-shell: >- | ||
bash | ||
cache-environment: true | ||
post-cleanup: 'all' | ||
|
||
- name: Process Debates | ||
run: python process_debates.py debates.yaml src/static/debates | ||
|
||
- name: Commit and Push Changes | ||
uses: EndBug/add-and-commit@v7 | ||
with: | ||
message: 'Update debates data' | ||
add: 'src/static/debates/' | ||
env: | ||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
- title: PS vs IL | ||
url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5 | ||
|
||
- title: Chega vs PAN | ||
url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024 | ||
|
||
- title: PCP vs PAN | ||
url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: debates | ||
channels: | ||
- conda-forge | ||
- defaults | ||
- pytorch | ||
dependencies: | ||
- python=3.10 | ||
- pytorch=2.0.0 | ||
- torchaudio=2.0.0 | ||
- ffmpeg=6.1.1 | ||
- pip: | ||
- beautifulsoup4 | ||
- git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
import argparse | ||
from pathlib import Path | ||
import yaml | ||
import json | ||
import subprocess | ||
import bs4 | ||
import requests | ||
import logging | ||
import re | ||
|
||
|
||
def find_m3u8_and_thumbnail(url):
    """
    Find the m3u8 link and thumbnail from a given debate URL. This will depend on
    the specific website where the debates are hosted.

    Supported hosts are recognised by substring: "sicnoticias.pt" and "rtp.pt".

    Returns a ``(m3u8_url, thumbnail_url)`` tuple, or ``(None, None)`` when the
    host is not supported or the expected metadata is not present in the page.
    """

    if "sicnoticias.pt" in url:
        # sic noticias
        # the m3u8 url and the thumbnail url are in a script tag of type "application/ld+json"
        page = requests.get(url, timeout=30).text
        soup = bs4.BeautifulSoup(page, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            # script.string is None for empty tags or tags with nested nodes
            if not script.string:
                continue
            try:
                data = json.loads(script.string)
            except json.JSONDecodeError:
                # one malformed ld+json block must not hide a valid one
                continue
            # ld+json payloads may also be lists or objects without "@type"
            if not isinstance(data, dict) or data.get("@type") != "VideoObject":
                continue
            m3u8_url = data.get("contentUrl")
            thumbnail_url = data.get("thumbnailUrl")
            if m3u8_url and thumbnail_url:
                return m3u8_url, thumbnail_url
    elif "rtp.pt" in url:
        # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
        # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
        # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
        #
        # we can find the necessary reference from a string like:
        # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"

        text = requests.get(url, timeout=30).text
        match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", text)
        if match is not None:
            # keep the file name and drop its last "_"-separated piece ("preview")
            parts = match.group(1).split("/")[-1].split("_")[:-1]
            if parts:
                series = parts[0]
                ref = "_".join(parts)

                m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
                thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
                return m3u8_url, thumbnail_url

    return None, None
|
||
|
||
def get_audio(url, output_path):
    """
    Use ffmpeg to download the audio from a given m3u8 link.

    Does nothing when ``output_path`` already exists. Otherwise ffmpeg writes to
    a sibling ".part" file that is renamed to ``output_path`` only after ffmpeg
    exits successfully, so an interrupted or failed download is never mistaken
    for a finished one on the next run.

    Raises ``subprocess.CalledProcessError`` when ffmpeg exits with a non-zero
    status.
    """

    output_path = Path(output_path)
    if output_path.exists():
        return

    output_path.parent.mkdir(exist_ok=True, parents=True)

    # keep the original suffix last: ffmpeg picks the output format from it
    partial_path = output_path.with_name(
        f"{output_path.stem}.part{output_path.suffix}"
    )

    cmd = [
        "ffmpeg",
        "-y",  # overwrite a stale partial file instead of prompting
        "-i",
        url,
        str(partial_path),
    ]

    try:
        subprocess.run(cmd, check=True)
        partial_path.replace(output_path)
    finally:
        # no-op after a successful rename; removes leftovers on failure
        partial_path.unlink(missing_ok=True)
|
||
|
||
def slugify(title):
    """
    Build a slug from a title: lowercase it and put a hyphen in place of
    every space character.
    """

    lowered = title.lower()
    words = lowered.split(" ")
    return "-".join(words)
|
||
|
||
def transcribe_audio(audio_path, output_root):
    """
    Use whisperx to transcribe the audio.

    Transcripts are written to ``<output_root>/transcriptions``. Does nothing
    when ``<audio stem>.txt`` already exists there. After whisperx finishes,
    every file in that directory that is not a ``.txt`` or ``.srt`` is deleted.

    Both arguments may be ``str`` or ``pathlib.Path``.

    Raises ``subprocess.CalledProcessError`` when whisperx exits with a
    non-zero status.
    """

    audio_path = Path(audio_path)
    transcriptions_dir = Path(output_root) / "transcriptions"

    if (transcriptions_dir / f"{audio_path.stem}.txt").exists():
        return

    transcriptions_dir.mkdir(exist_ok=True, parents=True)

    cmd = [
        "whisperx",
        "--model",
        "large-v2",
        "--language",
        "pt",
        "--diarize",
        "--min_speakers",
        "2",
        "--max_speakers",
        "4",
        "--compute_type",
        "int8",
        "--output_dir",
        str(transcriptions_dir),
        "--print_progress",
        "True",
        str(audio_path),
    ]

    subprocess.run(cmd, check=True)

    # keep only the .txt and .srt files
    # (snapshot the listing first: entries are deleted while we walk it)
    for f in list(transcriptions_dir.glob("*")):
        if f.is_file() and f.suffix not in (".txt", ".srt"):
            f.unlink()
|
||
|
||
def process_debate(*, title, url, output_root):
    """
    Process one debate entry: locate its stream, download the audio,
    transcribe it and write a ``<slug>.json`` metadata file under
    ``output_root``.

    Returns a dict with the "title", "thumbnail" and "slug" of the debate, or
    None (after logging a warning) when the stream or thumbnail was not found.
    """

    stream_url, thumb_url = find_m3u8_and_thumbnail(url)
    if stream_url is None or thumb_url is None:
        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
        return

    slug = slugify(title)

    # download, then transcribe, the audio track
    wav_file = output_root / f"audio/{slug}.wav"
    get_audio(stream_url, wav_file)
    transcribe_audio(wav_file, output_root)

    # per-debate metadata stored next to the generated assets
    metadata = {}
    metadata["slug"] = slug
    metadata["title"] = title
    metadata["original_url"] = url
    metadata["m3u8_url"] = stream_url
    metadata["audio_path"] = str(output_root / f"audio/{slug}.wav")
    metadata["transcription_txt"] = str(output_root / f"transcriptions/{slug}.txt")
    metadata["transcription_srt"] = str(output_root / f"transcriptions/{slug}.srt")

    serialized = json.dumps(metadata, indent=4)
    with open(output_root / f"{slug}.json", "w") as handle:
        handle.write(serialized)

    summary = {"title": title, "thumbnail": thumb_url, "slug": slug}
    return summary
|
||
|
||
def main(args):
    """
    Process every debate listed in the input YAML file and write the master
    index JSON.

    ``args`` must provide ``input``, ``output_root``, ``output_master_json``
    and ``force``.

    A debate whose ``<slug>.json`` already exists under the output root is not
    reprocessed unless ``args.force`` is set; its entry from the existing
    master index is carried over so the index stays complete. Debates that
    could not be processed are left out of the index instead of being stored
    as null entries.
    """
    input_path = Path(args.input)
    output_root = Path(args.output_root)
    output_root.mkdir(exist_ok=True, parents=True)
    master_path = Path(args.output_master_json)

    with open(input_path, "r", encoding="utf-8") as f:
        # an empty YAML document loads as None
        data = yaml.safe_load(f) or []

    # entries of the existing index, keyed by slug, so skipped debates are kept
    previous = {}
    if master_path.exists():
        try:
            with open(master_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, json.JSONDecodeError):
            logging.warning(f"Could not read existing master json {master_path}")
            loaded = []
        if isinstance(loaded, list):
            previous = {
                entry["slug"]: entry
                for entry in loaded
                if isinstance(entry, dict) and "slug" in entry
            }

    master_json = []
    for debate in data:
        # process_debate writes "<slug>.json", so that is the file to look for
        slug = slugify(debate["title"])
        output_path = output_root / f"{slug}.json"

        if output_path.exists() and not args.force and slug in previous:
            master_json.append(previous[slug])
            continue

        summary = process_debate(**debate, output_root=output_root)
        if summary is None:
            # processing failed (a warning was logged); keep the old entry if any
            if slug in previous:
                master_json.append(previous[slug])
            continue
        master_json.append(summary)

    master_path.parent.mkdir(exist_ok=True, parents=True)
    with open(master_path, "w") as f:
        json.dump(master_json, f, indent=4)
|
||
|
||
def parse_args(argv=None):
    """
    Parse the command line.

    The input file and output root can be given either through the
    ``--input`` / ``--output_root`` options or as two leading positional
    arguments (``process_debates.py debates.yaml src/static/debates``).
    Positional values, when present, take precedence. Positionals must come
    before any option on the command line.

    ``argv`` defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_positional",
        nargs="?",
        default=None,
        metavar="INPUT",
        help="YAML file listing the debates (same as --input)",
    )
    parser.add_argument(
        "output_root_positional",
        nargs="?",
        default=None,
        metavar="OUTPUT_ROOT",
        help="directory receiving the generated files (same as --output_root)",
    )
    parser.add_argument("--input", type=str, default="debates.yaml")
    parser.add_argument("--output_root", type=str, default="src/static/debates")
    parser.add_argument("--output-master-json", type=str, default="src/debates.json")
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args(argv)

    # fold the positional forms into the attributes main() reads
    if args.input_positional is not None:
        args.input = args.input_positional
    if args.output_root_positional is not None:
        args.output_root = args.output_root_positional
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,47 +1,30 @@ | ||
<script> | ||
import svelteLogo from './assets/svelte.svg' | ||
import viteLogo from '/vite.svg' | ||
import Counter from './lib/Counter.svelte' | ||
// Import the debates data from the JSON file | ||
import debates from './debates.json'; | ||
</script> | ||
|
||
<main> | ||
<div> | ||
<a href="https://vitejs.dev" target="_blank" rel="noreferrer"> | ||
<img src={viteLogo} class="logo" alt="Vite Logo" /> | ||
</a> | ||
<a href="https://svelte.dev" target="_blank" rel="noreferrer"> | ||
<img src={svelteLogo} class="logo svelte" alt="Svelte Logo" /> | ||
</a> | ||
</div> | ||
<h1>Vite + Svelte</h1> | ||
|
||
<div class="card"> | ||
<Counter /> | ||
</div> | ||
|
||
<p> | ||
Check out <a href="https://github.com/sveltejs/kit#readme" target="_blank" rel="noreferrer">SvelteKit</a>, the official Svelte app framework powered by Vite! | ||
</p> | ||
|
||
<p class="read-the-docs"> | ||
Click on the Vite and Svelte logos to learn more | ||
</p> | ||
</main> | ||
|
||
<style> | ||
.logo { | ||
height: 6em; | ||
padding: 1.5em; | ||
will-change: filter; | ||
transition: filter 300ms; | ||
} | ||
.logo:hover { | ||
filter: drop-shadow(0 0 2em #646cffaa); | ||
.grid { | ||
display: grid; | ||
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); | ||
gap: 16px; | ||
padding: 20px; | ||
} | ||
.logo.svelte:hover { | ||
filter: drop-shadow(0 0 2em #ff3e00aa); | ||
.debate { | ||
cursor: pointer; | ||
} | ||
.read-the-docs { | ||
color: #888; | ||
img { | ||
width: 100%; | ||
height: auto; | ||
border-radius: 8px; | ||
} | ||
</style> | ||
|
||
<div class="grid"> | ||
{#each debates as { title, thumbnail, slug }} | ||
<div class="debate" on:click={() => (window.location.href = `/${slug}`)}> | ||
<img src={thumbnail} alt={title} /> | ||
<p>{title}</p> | ||
</div> | ||
{/each} | ||
</div> |