From 5f34d1f3caf5741674c94e793a9351340eb2e5ee Mon Sep 17 00:00:00 2001 From: Guilherme Pires Date: Tue, 6 Feb 2024 12:49:09 -0700 Subject: [PATCH] debate processing barebones --- .github/workflows/process-debates.yaml | 36 +++++ debates.yaml | 8 ++ environment.yml | 13 ++ process_debates.py | 183 +++++++++++++++++++++++++ src/App.svelte | 61 +++------ src/debates.json | 17 +++ 6 files changed, 279 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/process-debates.yaml create mode 100644 debates.yaml create mode 100644 environment.yml create mode 100644 process_debates.py create mode 100644 src/debates.json diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml new file mode 100644 index 0000000..adb3168 --- /dev/null +++ b/.github/workflows/process-debates.yaml @@ -0,0 +1,36 @@ +name: Process Debates + +#on: +# push: +# paths: +# - 'debates.yaml' +# +jobs: + process-debates: + defaults: + run: + shell: bash -leo pipefail {0} + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: mamba-org/setup-micromamba@v1 + name: install whisperx and other deps + with: + micromamba-version: '1.3.1-0' + environment-file: environment.yml + init-shell: >- + bash + cache-environment: true + post-cleanup: 'all' + + - name: Process Debates + run: python process_debates.py debates.yaml src/static/debates + + - name: Commit and Push Changes + uses: EndBug/add-and-commit@v7 + with: + message: 'Update debates data' + add: 'src/static/debates/' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/debates.yaml b/debates.yaml new file mode 100644 index 0000000..e23af70 --- /dev/null +++ b/debates.yaml @@ -0,0 +1,8 @@ +- title: PS vs IL + url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5 + +- title: Chega vs PAN + url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024 + +- title: PCP vs PAN + url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024 
# ---------------------------------------------------------------------------
# Patch context that shared these lines with the script in the original
# (whitespace-collapsed) patch text.  It is kept here as comments so that no
# content is lost; the YAML indentation is reconstructed.
#
# diff --git a/environment.yml b/environment.yml
# new file mode 100644
# index 0000000..7523b70
# --- /dev/null
# +++ b/environment.yml
# @@ -0,0 +1,13 @@
# +name: debates
# +channels:
# +  - conda-forge
# +  - defaults
# +  - pytorch
# +dependencies:
# +  - python=3.10
# +  - pytorch=2.0.0
# +  - ffmpeg=6.1.1
# +  - pip:
# +    - beautifulsoup4
# +    - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
# +    - torchaudio==2.0.0
#
# NOTE(review): the script below imports `requests` and `yaml`, neither of
# which is listed explicitly above; presumably they arrive transitively via
# whisperx -- confirm, or add `requests` and `pyyaml` to the pip section.
#
# diff --git a/process_debates.py b/process_debates.py
# new file mode 100644
# index 0000000..88e25c3
# --- /dev/null
# +++ b/process_debates.py
# ---------------------------------------------------------------------------
"""Download and index the debates listed in a YAML file.

For every ``{title, url}`` entry the script locates the HLS (m3u8) stream and
a thumbnail on the hosting site, extracts the audio with ffmpeg, writes a
per-debate ``<slug>.json`` file and finally writes a master JSON list with one
``{title, thumbnail, slug}`` summary per debate.

The third-party scraping dependencies (``bs4``, ``requests``, ``yaml``) are
imported inside the functions that use them so that the pure helpers in this
module stay importable without the scraping stack installed.
"""

import argparse
import json
import logging
import os
import re
import subprocess
from pathlib import Path

# Seconds to wait for a debate page before giving up.
_HTTP_TIMEOUT = 30

# Header block handed to ffmpeg's ``-headers`` option for RTP streams.  Each
# header is terminated by CRLF.
_RTP_HEADERS = "".join(
    f"{header}\r\n"
    for header in (
        "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:122.0) Gecko/20100101 Firefox/122.0",
        "Accept: */*",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip, deflate, br",
        "Referer: https://www.rtp.pt/",
        "Origin: https://www.rtp.pt",
        "DNT: 1",
        "Sec-GPC: 1",
        "Connection: keep-alive",
        "Sec-Fetch-Dest: empty",
        "Sec-Fetch-Mode: cors",
        "Sec-Fetch-Site: same-site",
        "TE: trailers",
    )
)


def _fetch_page(url):
    """Return the body of *url* as text (raises an ``OSError`` subclass on network failure)."""
    import requests

    return requests.get(url, timeout=_HTTP_TIMEOUT).text


def _sic_media_urls(html):
    """Extract ``(m3u8_url, thumbnail_url)`` from a SIC Noticias page.

    Both URLs live in a ``<script type="application/ld+json">`` tag whose JSON
    payload has ``"@type": "VideoObject"``.  Returns ``(None, None)`` when no
    such payload is found.
    """
    import bs4

    soup = bs4.BeautifulSoup(html, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        if not script.string:
            continue
        try:
            payload = json.loads(script.string)
        except json.JSONDecodeError:
            continue
        # A JSON-LD script may hold a single object or a list of objects.
        items = payload if isinstance(payload, list) else [payload]
        for item in items:
            if not isinstance(item, dict) or item.get("@type") != "VideoObject":
                continue
            content_url = item.get("contentUrl")
            thumbnail_url = item.get("thumbnailUrl")
            if content_url and thumbnail_url:
                return content_url, thumbnail_url
    return None, None


def _rtp_media_urls(html):
    """Derive ``(m3u8_url, thumbnail_url)`` from an RTP Play page.

    Example page: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
    m3u8:      https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
    thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400

    The media reference is recovered from a line of the page such as
    ``seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',``.
    Returns ``(None, None)`` when that line is missing or malformed.
    """
    match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", html)
    if match is None:
        return None, None

    # File name without directory; it ends in "_preview", which is dropped.
    parts = match.group(1).split("/")[-1].split("_")[:-1]
    if not parts:
        return None, None
    series = parts[0]
    ref = "_".join(parts)

    m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
    thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
    return m3u8_url, thumbnail_url


def find_m3u8_and_thumbnail(url, page_text=None):
    """Find the m3u8 link and the thumbnail for a debate URL.

    The extraction depends on the website hosting the debate; currently
    ``sicnoticias.pt`` and ``rtp.pt`` are recognised.

    Args:
        url: Address of the debate page.
        page_text: Optional, already-downloaded HTML of *url*.  When omitted
            the page is fetched over HTTP.

    Returns:
        ``(m3u8_url, thumbnail_url)``, or ``(None, None)`` when the site is
        not recognised or the page does not contain the expected data.
    """
    if "sicnoticias.pt" in url:
        extract = _sic_media_urls
    elif "rtp.pt" in url:
        extract = _rtp_media_urls
    else:
        return None, None

    if page_text is None:
        page_text = _fetch_page(url)
    return extract(page_text)


def get_audio(url, output_path, headers=None):
    """Use ffmpeg to download the audio of an m3u8 stream to *output_path*.

    Nothing is done when *output_path* already exists.

    Args:
        url: m3u8 stream address.
        output_path: Destination audio file; parent directories are created.
        headers: Optional CRLF-separated HTTP headers passed via ``-headers``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    output_path = Path(output_path)
    if output_path.exists():
        return

    output_path.parent.mkdir(exist_ok=True, parents=True)

    cmd = ["ffmpeg"]
    if headers is not None:
        cmd += ["-headers", headers]
    cmd += ["-i", url, str(output_path)]

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, KeyboardInterrupt):
        # A partial file would make every later run skip the download.
        output_path.unlink(missing_ok=True)
        raise


def slugify(title):
    """Turn a title into a lowercase, hyphen-separated slug.

    Runs of characters that are not letters, digits or underscores (spaces,
    slashes, punctuation) collapse into a single hyphen, so the slug is safe
    to use as a file name and as a URL path segment.
    """
    return re.sub(r"\W+", "-", title.lower()).strip("-")


def transcribe_audio(audio_path, output_root):
    """Use whisperx to transcribe *audio_path* into ``<output_root>/transcriptions``.

    Nothing is done when the ``.txt`` transcription already exists.  Of the
    files found for this audio afterwards only the ``.txt`` and ``.srt`` ones
    are kept.

    Raises:
        KeyError: the ``HF_TOKEN`` environment variable is not set.
        subprocess.CalledProcessError: whisperx exited with a non-zero status.
    """
    audio_path = Path(audio_path)
    transcriptions_dir = Path(output_root) / "transcriptions"
    name = audio_path.stem

    if (transcriptions_dir / f"{name}.txt").exists():
        return

    transcriptions_dir.mkdir(exist_ok=True, parents=True)

    cmd = [
        "whisperx",
        "--hf_token",
        os.environ["HF_TOKEN"],
        "--model",
        "large-v2",
        "--language",
        "pt",
        "--diarize",
        "--min_speakers",
        "2",
        "--max_speakers",
        "4",
        "--compute_type",
        "int8",
        "--output_dir",
        str(transcriptions_dir),
        "--print_progress",
        "True",
        str(audio_path),
    ]

    subprocess.run(cmd, check=True)

    # Keep only the .txt and .srt files of this debate; files belonging to
    # other debates in the shared directory are left alone.
    for produced in transcriptions_dir.iterdir():
        if (
            produced.is_file()
            and produced.stem == name
            and produced.suffix not in (".txt", ".srt")
        ):
            produced.unlink()


def process_debate(*, title, url, output_root):
    """Process one debate: download its audio and write ``<slug>.json``.

    Args:
        title: Human-readable debate title; also the source of the slug.
        url: Page on which the debate is hosted.
        output_root: Directory that receives ``audio/`` and ``<slug>.json``.

    Returns:
        ``{"title", "thumbnail", "slug"}`` for the master JSON, or ``None``
        when no stream/thumbnail could be found for *url*.
    """
    output_root = Path(output_root)

    m3u8_url, thumbnail_url = find_m3u8_and_thumbnail(url)
    if m3u8_url is None or thumbnail_url is None:
        logging.warning("Could not find m3u8 or thumbnail for %s", url)
        return None

    slug = slugify(title)
    audio_path = output_root / f"audio/{slug}.wav"

    # The RTP stream is requested with browser-like headers.
    headers = _RTP_HEADERS if "rtp.pt" in url else None

    get_audio(m3u8_url, audio_path, headers=headers)
    # TODO: transcription is currently disabled; enabling it requires HF_TOKEN.
    # transcribe_audio(audio_path, output_root)

    out = {
        "slug": slug,
        "title": title,
        "original_url": url,
        "m3u8_url": m3u8_url,
        "audio_path": str(audio_path),
        "transcription_txt": str(output_root / f"transcriptions/{slug}.txt"),
        "transcription_srt": str(output_root / f"transcriptions/{slug}.srt"),
    }

    with open(output_root / f"{slug}.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=4)

    return {"title": title, "thumbnail": thumbnail_url, "slug": slug}


def _load_existing_summaries(master_path):
    """Return the summaries already stored in the master JSON, keyed by slug."""
    try:
        with open(master_path, "r", encoding="utf-8") as f:
            entries = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return {}
    if not isinstance(entries, list):
        return {}
    return {
        entry["slug"]: entry
        for entry in entries
        if isinstance(entry, dict) and "slug" in entry
    }


def main(args):
    """Process every debate of the input YAML and write the master JSON.

    A debate is skipped (its previous summary is reused) when its
    ``<slug>.json`` already exists, a summary for it is present in the current
    master JSON and ``args.force`` is not set.  A debate that fails to process
    is logged and does not abort the remaining ones.

    Args:
        args: Namespace with ``input``, ``output_root``,
            ``output_master_json`` and ``force``.
    """
    import yaml

    input_path = Path(args.input)
    output_root = Path(args.output_root)
    master_path = Path(args.output_master_json)
    output_root.mkdir(exist_ok=True, parents=True)

    with open(input_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or []

    known = _load_existing_summaries(master_path)

    master_json = []
    for debate in data:
        title = debate["title"]
        url = debate["url"]
        slug = slugify(title)
        output_path = output_root / f"{slug}.json"

        if output_path.exists() and slug in known and not args.force:
            master_json.append(known[slug])
            continue

        try:
            summary = process_debate(title=title, url=url, output_root=output_root)
        except (subprocess.CalledProcessError, OSError):
            # OSError also covers network errors raised by requests and a
            # missing ffmpeg executable.
            logging.exception("Failed to process debate %r (%s)", title, url)
            summary = None

        if summary is None:
            # Keep whatever was known before rather than writing a null entry.
            summary = known.get(slug)
        if summary is not None:
            master_json.append(summary)

    master_path.parent.mkdir(exist_ok=True, parents=True)
    with open(master_path, "w", encoding="utf-8") as f:
        json.dump(master_json, f, indent=4)


def parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is ``None``).

    Besides the ``--input`` / ``--output_root`` options, the input file and
    the output directory may be given as two positional paths, which is how
    the "Process Debates" CI workflow invokes the script.  Positional values
    take precedence over the corresponding options.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_arg", nargs="?", default=None, metavar="INPUT",
                        help="debates YAML file (same as --input)")
    parser.add_argument("output_root_arg", nargs="?", default=None, metavar="OUTPUT_ROOT",
                        help="output directory (same as --output_root)")
    parser.add_argument("--input", type=str, default="debates.yaml")
    parser.add_argument("--output_root", "--output-root", dest="output_root",
                        type=str, default="src/static/debates")
    parser.add_argument("--output-master-json", "--output_master_json",
                        dest="output_master_json", type=str, default="src/debates.json")
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args(argv)

    if args.input_arg is not None:
        args.input = args.input_arg
    if args.output_root_arg is not None:
        args.output_root = args.output_root_arg
    return args


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main(parse_args())

# ---------------------------------------------------------------------------
# Patch context that followed the script on the same original line (the
# remainder of this diff header continues on the next line of the patch):
#
# diff --git a/src/App.svelte b/src/App.svelte
# index 1f31354..7aff68c
# ---------------------------------------------------------------------------
100644 --- a/src/App.svelte +++ b/src/App.svelte @@ -1,47 +1,30 @@ -
-
- - - - - - -
-

Vite + Svelte

- -
- -
- -

- Check out SvelteKit, the official Svelte app framework powered by Vite! -

- -

- Click on the Vite and Svelte logos to learn more -

-
- + +
+ {#each debates as { title, thumbnail, slug }} +
(window.location.href = `/${slug}`)}> + {title} +

{title}

+
+ {/each} +
diff --git a/src/debates.json b/src/debates.json new file mode 100644 index 0000000..1230475 --- /dev/null +++ b/src/debates.json @@ -0,0 +1,17 @@ +[ + { + "title": "PS vs IL", + "thumbnail": "https://images.impresa.pt/sicnot/2024-02-05-Pedro-Nuno-Santos-Rui-Rocha-e66319ff/16x9/mw-1200&outputFormat=jpeg", + "slug": "ps-vs-il" + }, + { + "title": "Chega vs PAN", + "thumbnail": "https://cdn-images.rtp.pt/multimedia/screenshots/p1525/p1525_1_202104205996.jpg?q=100&format=pjpg&auto=webp&v=3&w=400", + "slug": "chega-vs-pan" + }, + { + "title": "PCP vs PAN", + "thumbnail": "https://cdn-images.rtp.pt/multimedia/screenshots/p1525/p1525_1_202104205996.jpg?q=100&format=pjpg&auto=webp&v=3&w=400", + "slug": "pcp-vs-pan" + } +] \ No newline at end of file