Skip to content

Commit

Permalink
debate processing barebones
Browse files Browse the repository at this point in the history
  • Loading branch information
colobas committed Feb 6, 2024
1 parent eb2f8f9 commit f58c8fc
Show file tree
Hide file tree
Showing 5 changed files with 255 additions and 39 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/process-debates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: Process Debates

# A workflow must declare at least one trigger to be valid. Only manual runs
# are enabled; the automatic push trigger below stays disabled on purpose.
on:
  workflow_dispatch:
#  push:
#    paths:
#      - 'debates.yaml'

jobs:
  process-debates:
    defaults:
      run:
        # login shell so the micromamba environment is activated in every step
        shell: bash -leo pipefail {0}

    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: mamba-org/setup-micromamba@v1
        name: install whisperx and other deps
        with:
          micromamba-version: '1.3.1-0'
          environment-file: environment.yml
          init-shell: >-
            bash
          cache-environment: true
          post-cleanup: 'all'

      - name: Process Debates
        # process_debates.py only defines named options, not positionals
        run: python process_debates.py --input debates.yaml --output_root src/static/debates

      - name: Commit and Push Changes
        uses: EndBug/add-and-commit@v7
        with:
          message: 'Update debates data'
          # src/debates.json is the master index written by process_debates.py
          add: 'src/static/debates/ src/debates.json'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8 changes: 8 additions & 0 deletions debates.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
- title: PS vs IL
url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5

- title: Chega vs PAN
url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024

- title: PCP vs PAN
url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024
13 changes: 13 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: debates
channels:
  - conda-forge
  - defaults
  - pytorch
dependencies:
  - python=3.10
  - pytorch=2.0.0
  - torchaudio=2.0.0
  - ffmpeg=6.1.1
  - pip:
      - beautifulsoup4
      # imported directly by process_debates.py, so declared explicitly
      # instead of relying on them being pulled in transitively
      - requests
      - pyyaml
      - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
176 changes: 176 additions & 0 deletions process_debates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import argparse
from pathlib import Path
import yaml
import json
import subprocess
import bs4
import requests
import logging
import re


def find_m3u8_and_thumbnail(url):
    """
    Find the m3u8 link and thumbnail from a given debate URL. This depends on
    the specific website where the debate is hosted; only sicnoticias.pt and
    rtp.pt pages are handled.

    Args:
        url: address of the debate page.

    Returns:
        A ``(m3u8_url, thumbnail_url)`` tuple, or ``(None, None)`` when the
        site is not handled or the page does not contain the expected markers.
    """

    if "sicnoticias.pt" in url:
        return _find_sic_links(_fetch_page(url))
    if "rtp.pt" in url:
        return _find_rtp_links(_fetch_page(url))

    return None, None


def _fetch_page(url):
    """Download a page and return its text, with a timeout so a stalled server cannot hang the run."""
    return requests.get(url, timeout=30).text


def _find_sic_links(html):
    """Read the video and thumbnail URLs from the "application/ld+json" script tags of a SIC Noticias page."""
    soup = bs4.BeautifulSoup(html, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        # script.string is None for an empty tag (or one with nested nodes)
        if not script.string:
            continue
        try:
            data = json.loads(script.string)
        except json.JSONDecodeError:
            continue

        # the tag may hold a single object or a list of objects
        for item in data if isinstance(data, list) else [data]:
            if not isinstance(item, dict) or item.get("@type") != "VideoObject":
                continue
            content_url = item.get("contentUrl")
            thumbnail_url = item.get("thumbnailUrl")
            if content_url is not None and thumbnail_url is not None:
                return content_url, thumbnail_url

    return None, None


def _find_rtp_links(html):
    """Build the RTP Play stream and thumbnail URLs from the seek-bar preview reference found in the page."""
    # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
    # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
    # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
    #
    # we can find the necessary reference from a string like:
    # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"
    match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", html)
    if match is None:
        return None, None

    # keep the file name only and drop its trailing "_preview" component
    parts = match.group(1).split("/")[-1].split("_")[:-1]
    if not parts:
        return None, None

    series = parts[0]
    ref = "_".join(parts)

    m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
    thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
    return m3u8_url, thumbnail_url


def get_audio(url, output_path):
    """
    Use ffmpeg to download the audio from a given m3u8 link.

    Nothing is done when ``output_path`` already exists. The parent directory
    is created when missing.

    Args:
        url: m3u8 stream address handed to ffmpeg as its input.
        output_path: file to write (str or Path).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """

    output_path = Path(output_path)
    if output_path.exists():
        return

    output_path.parent.mkdir(exist_ok=True, parents=True)

    cmd = [
        "ffmpeg",
        "-i",
        url,
        str(output_path),
    ]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, KeyboardInterrupt):
        # a partial file would make the next run skip the download entirely
        output_path.unlink(missing_ok=True)
        raise


def slugify(title):
    """
    Build a slug from a title: lowercase it and put a dash where each
    space was
    """

    lowered = title.lower()
    return "-".join(lowered.split(" "))


def transcribe_audio(audio_path, output_root):
    """
    Use whisperx to transcribe the audio.

    The transcription is written to ``<output_root>/transcriptions``. Nothing
    is done when ``<audio stem>.txt`` already exists there.

    Args:
        audio_path: audio file to transcribe (str or Path).
        output_root: directory holding the ``transcriptions`` folder (str or Path).

    Raises:
        subprocess.CalledProcessError: if whisperx exits with a non-zero status.
    """

    audio_path = Path(audio_path)
    transcriptions_dir = Path(output_root) / "transcriptions"

    if (transcriptions_dir / f"{audio_path.stem}.txt").exists():
        return

    transcriptions_dir.mkdir(exist_ok=True, parents=True)

    cmd = [
        "whisperx",
        "--model",
        "large-v2",
        "--language",
        "pt",
        "--diarize",
        "--min_speakers",
        "2",
        "--max_speakers",
        "4",
        "--compute_type",
        "int8",
        "--output_dir",
        str(transcriptions_dir),
        "--print_progress",
        "True",
        str(audio_path),
    ]

    # fail loudly instead of continuing without a transcription
    subprocess.run(cmd, check=True)

    # keep only the .txt and .srt files
    for f in transcriptions_dir.glob("*"):
        # unlink() cannot remove directories, so only plain files are considered
        if f.is_file() and f.suffix not in (".txt", ".srt"):
            f.unlink()


def process_debate(*, title, url, output_root):
    """
    Run the whole pipeline for one debate entry: locate the stream, fetch the
    audio, transcribe it and write ``<slug>.json`` under ``output_root``.

    Returns the summary dict (title, thumbnail, slug), or None when the stream
    or the thumbnail could not be located.
    """

    stream_url, thumb_url = find_m3u8_and_thumbnail(url)
    if any(found is None for found in (stream_url, thumb_url)):
        logging.warning(f"Could not find m3u8 or thumbnail for {url}")
        return

    slug = slugify(title)
    wav_file = output_root / f"audio/{slug}.wav"

    get_audio(stream_url, wav_file)
    transcribe_audio(wav_file, output_root)

    record = dict(
        slug=slug,
        title=title,
        original_url=url,
        m3u8_url=stream_url,
        audio_path=str(wav_file),
        transcription_txt=str(output_root / f"transcriptions/{slug}.txt"),
        transcription_srt=str(output_root / f"transcriptions/{slug}.srt"),
    )

    # per-debate metadata file, next to the audio/ and transcriptions/ folders
    with open(output_root / f"{slug}.json", "w") as handle:
        handle.write(json.dumps(record, indent=4))

    return dict(title=title, thumbnail=thumb_url, slug=slug)


def main(args):
    """
    Process every debate listed in the input YAML file and write the master
    JSON (a list of ``{"title", "thumbnail", "slug"}`` summaries).

    A debate whose ``<slug>.json`` already exists under the output root is not
    processed again unless ``args.force`` is set; its summary is carried over
    from the previous master JSON so it stays listed.

    Args:
        args: parsed command-line arguments providing ``input``,
            ``output_root``, ``output_master_json`` and ``force``.
    """
    input_path = Path(args.input)
    output_root = Path(args.output_root)
    output_root.mkdir(exist_ok=True, parents=True)
    master_path = Path(args.output_master_json)

    with open(input_path, "r") as f:
        # an empty YAML file loads as None
        data = yaml.safe_load(f) or []

    # summaries written by a previous run, keyed by slug
    previous = {}
    if master_path.exists():
        try:
            with open(master_path, "r") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            loaded = []
        if isinstance(loaded, list):
            previous = {
                entry["slug"]: entry
                for entry in loaded
                if isinstance(entry, dict) and "slug" in entry
            }

    master_json = []
    for debate in data:
        # process_debate() names its output file after the slug of the title
        slug = slugify(debate["title"])
        output_path = output_root / f"{slug}.json"

        if output_path.exists() and not args.force and slug in previous:
            master_json.append(previous[slug])
            continue

        summary = process_debate(**debate, output_root=output_root)
        if summary is None:
            # processing failed (a warning was logged); keep the old entry if any
            summary = previous.get(slug)
        if summary is not None:
            master_json.append(summary)

    master_path.parent.mkdir(exist_ok=True, parents=True)
    with open(master_path, "w") as f:
        json.dump(master_json, f, indent=4)


if __name__ == "__main__":
    # Command-line entry point. Both the underscore and the hyphen spelling of
    # the multi-word options are accepted; the original spelling comes first
    # so the attribute names read by main() are unchanged.
    parser = argparse.ArgumentParser(
        description="Download and transcribe the debates listed in a YAML file."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="debates.yaml",
        help="YAML file listing the debates (title and url)",
    )
    parser.add_argument(
        "--output_root",
        "--output-root",
        type=str,
        default="src/static/debates",
        help="directory receiving the audio, transcriptions and per-debate JSON",
    )
    parser.add_argument(
        "--output-master-json",
        "--output_master_json",
        type=str,
        default="src/debates.json",
        help="path of the JSON file listing all processed debates",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="process debates again even when their JSON file already exists",
    )
    args = parser.parse_args()
    main(args)
61 changes: 22 additions & 39 deletions src/App.svelte
Original file line number Diff line number Diff line change
@@ -1,47 +1,30 @@
<script>
import svelteLogo from './assets/svelte.svg'
import viteLogo from '/vite.svg'
import Counter from './lib/Counter.svelte'
// Import the debates data from the JSON file
import debates from './debates.json';
</script>

<main>
<div>
<a href="https://vitejs.dev" target="_blank" rel="noreferrer">
<img src={viteLogo} class="logo" alt="Vite Logo" />
</a>
<a href="https://svelte.dev" target="_blank" rel="noreferrer">
<img src={svelteLogo} class="logo svelte" alt="Svelte Logo" />
</a>
</div>
<h1>Vite + Svelte</h1>

<div class="card">
<Counter />
</div>

<p>
Check out <a href="https://github.com/sveltejs/kit#readme" target="_blank" rel="noreferrer">SvelteKit</a>, the official Svelte app framework powered by Vite!
</p>

<p class="read-the-docs">
Click on the Vite and Svelte logos to learn more
</p>
</main>

<style>
.logo {
height: 6em;
padding: 1.5em;
will-change: filter;
transition: filter 300ms;
}
.logo:hover {
filter: drop-shadow(0 0 2em #646cffaa);
.grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 16px;
padding: 20px;
}
.logo.svelte:hover {
filter: drop-shadow(0 0 2em #ff3e00aa);
.debate {
cursor: pointer;
}
.read-the-docs {
color: #888;
img {
width: 100%;
height: auto;
border-radius: 8px;
}
</style>

<div class="grid">
{#each debates as { title, thumbnail, slug }}
<div class="debate" on:click={() => (window.location.href = `/${slug}`)}>
<img src={thumbnail} alt={title} />
<p>{title}</p>
</div>
{/each}
</div>

0 comments on commit f58c8fc

Please sign in to comment.