From f58c8fc2128d2dba4190e496642e39d8879a390e Mon Sep 17 00:00:00 2001 From: Guilherme Pires Date: Tue, 6 Feb 2024 12:49:09 -0700 Subject: [PATCH] debate processing barebones --- .github/workflows/process-debates.yaml | 36 +++++ debates.yaml | 8 ++ environment.yml | 13 ++ process_debates.py | 176 +++++++++++++++++++++++++ src/App.svelte | 61 ++++----- 5 files changed, 255 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/process-debates.yaml create mode 100644 debates.yaml create mode 100644 environment.yml create mode 100644 process_debates.py diff --git a/.github/workflows/process-debates.yaml b/.github/workflows/process-debates.yaml new file mode 100644 index 0000000..adb3168 --- /dev/null +++ b/.github/workflows/process-debates.yaml @@ -0,0 +1,36 @@ +name: Process Debates + +#on: +# push: +# paths: +# - 'debates.yaml' +# +jobs: + process-debates: + defaults: + run: + shell: bash -leo pipefail {0} + + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: mamba-org/setup-micromamba@v1 + name: install whisperx and other deps + with: + micromamba-version: '1.3.1-0' + environment-file: environment.yml + init-shell: >- + bash + cache-environment: true + post-cleanup: 'all' + + - name: Process Debates + run: python process_debates.py debates.yaml src/static/debates + + - name: Commit and Push Changes + uses: EndBug/add-and-commit@v7 + with: + message: 'Update debates data' + add: 'src/static/debates/' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/debates.yaml b/debates.yaml new file mode 100644 index 0000000..e23af70 --- /dev/null +++ b/debates.yaml @@ -0,0 +1,8 @@ +- title: PS vs IL + url: https://sicnoticias.pt/pais/2024-02-05-Debate-PS--IL-na-integra-dc65b6a5 + +- title: Chega vs PAN + url: https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024 + +- title: PCP vs PAN + url: https://www.rtp.pt/play/p12900/e746296/debates-legislativas-2024 diff --git a/environment.yml b/environment.yml new file mode 
100644
index 0000000..9fd2a84
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,13 @@
+name: debates
+channels:
+  - conda-forge
+  - defaults
+  - pytorch
+dependencies:
+  - python=3.10
+  - pytorch=2.0.0
+  - torchaudio=2.0.0
+  - ffmpeg=6.1.1
+  - pip:
+    - beautifulsoup4
+    - git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
diff --git a/process_debates.py b/process_debates.py
new file mode 100644
index 0000000..44cb0a1
--- /dev/null
+++ b/process_debates.py
@@ -0,0 +1,284 @@
+"""Download, transcribe and index the debates listed in a YAML file."""
+
+import argparse
+import json
+import logging
+import re
+import subprocess
+from pathlib import Path
+
+import bs4
+import requests
+import yaml
+
+# Seconds to wait for a debate page before giving up.
+REQUEST_TIMEOUT = 30
+
+
+def _fetch_page(url):
+    """Return the HTML of ``url``; raises requests.RequestException on failure."""
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.text
+
+
+def find_m3u8_and_thumbnail(url):
+    """
+    Find the m3u8 link and thumbnail from a given debate URL. This depends on
+    the specific website where the debate is hosted; only sicnoticias.pt and
+    rtp.pt are supported.
+
+    Returns a (m3u8_url, thumbnail_url) tuple. Either value is None when it
+    could not be found, and both are None for unsupported sites. Raises
+    requests.RequestException if the page cannot be downloaded.
+    """
+
+    if "sicnoticias.pt" in url:
+        # sic noticias
+        # the m3u8 url and the thumbnail url are in a script tag of type "application/ld+json"
+        soup = bs4.BeautifulSoup(_fetch_page(url), "html.parser")
+        for script in soup.find_all("script", type="application/ld+json"):
+            if not script.string:
+                continue
+            try:
+                data = json.loads(script.string)
+            except json.JSONDecodeError:
+                continue
+            # ld+json blocks can also be lists or lack "@type": skip those
+            if isinstance(data, dict) and data.get("@type") == "VideoObject":
+                return data.get("contentUrl"), data.get("thumbnailUrl")
+    elif "rtp.pt" in url:
+        # https://www.rtp.pt/play/p12900/e746061/debates-legislativas-2024
+        # m3u8: https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/p12900/p12900_1_2024020515461.mp4,.urlset/master.m3u8
+        # thumbnail: https://cdn-images.rtp.pt/multimedia/screenshots/p12900/p12900_1_2024020515461.jpg?q=100&format=pjpg&auto=webp&v=3&w=400
+        #
+        # we can find the necessary reference from a string like:
+        # "seekBarThumbnailsLoc: '//cdn-images.rtp.pt/multimedia/screenshots/p12900/preview/p12900_1_2024020515461_preview.vtt',"
+
+        match = re.search(r"seekBarThumbnailsLoc: '(.+?)\.vtt',", _fetch_page(url))
+        if match is None:
+            return None, None
+
+        # drop the trailing "_preview" part of the file name
+        parts = match.group(1).split("/")[-1].split("_")[:-1]
+        if not parts:
+            return None, None
+        series = parts[0]
+        ref = "_".join(parts)
+
+        m3u8_url = f"https://streaming-vod.rtp.pt/hls/nas2.share,/h264/512x384/{series}/{ref}.mp4,.urlset/master.m3u8"
+        thumbnail_url = f"https://cdn-images.rtp.pt/multimedia/screenshots/{series}/{ref}.jpg?q=100&format=pjpg&auto=webp&v=3&w=400"
+        return m3u8_url, thumbnail_url
+
+    return None, None
+
+
+def get_audio(url, output_path):
+    """
+    Use ffmpeg to download the audio from a given m3u8 link into output_path.
+    Does nothing if output_path already exists. Raises
+    subprocess.CalledProcessError if ffmpeg fails.
+    """
+
+    output_path = Path(output_path)
+    if output_path.exists():
+        return
+
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+
+    cmd = [
+        "ffmpeg",
+        "-i",
+        url,
+        str(output_path),
+    ]
+
+    try:
+        subprocess.run(cmd, check=True)
+    except (subprocess.CalledProcessError, KeyboardInterrupt):
+        # a partial file would make the next run believe the download is done
+        output_path.unlink(missing_ok=True)
+        raise
+
+
+def slugify(title):
+    """
+    Turn a title into a slug
+    """
+
+    return title.lower().replace(" ", "-")
+
+
+def transcribe_audio(audio_path, output_root):
+    """
+    Use whisperx to transcribe the audio into the "transcriptions" directory
+    under output_root. Does nothing if the .txt transcription already exists.
+    Raises subprocess.CalledProcessError if whisperx fails.
+    """
+
+    audio_path = Path(audio_path)
+    transcriptions_dir = Path(output_root) / "transcriptions"
+
+    if (transcriptions_dir / f"{audio_path.stem}.txt").exists():
+        return
+
+    transcriptions_dir.mkdir(exist_ok=True, parents=True)
+
+    cmd = [
+        "whisperx",
+        "--model",
+        "large-v2",
+        "--language",
+        "pt",
+        "--diarize",
+        "--min_speakers",
+        "2",
+        "--max_speakers",
+        "4",
+        "--compute_type",
+        "int8",
+        "--output_dir",
+        str(transcriptions_dir),
+        "--print_progress",
+        "True",
+        str(audio_path),
+    ]
+
+    subprocess.run(cmd, check=True)
+
+    # keep only the .txt and .srt files
+    for f in transcriptions_dir.glob("*"):
+        if f.is_file() and f.suffix not in (".txt", ".srt"):
+            f.unlink()
+
+
+def process_debate(*, title, url, output_root):
+    """
+    Process a debate from the input data: download its audio, transcribe it
+    and write a "{slug}.json" file under output_root describing the results.
+
+    Returns the entry for the master index (title, thumbnail, slug), or None
+    when the stream or thumbnail could not be located.
+    """
+
+    output_root = Path(output_root)
+
+    m3u8_url, thumbnail_url = find_m3u8_and_thumbnail(url)
+    if m3u8_url is None or thumbnail_url is None:
+        logging.warning("Could not find m3u8 or thumbnail for %s", url)
+        return None
+
+    slug = slugify(title)
+    audio_path = output_root / f"audio/{slug}.wav"
+    get_audio(m3u8_url, audio_path)
+    transcribe_audio(audio_path, output_root)
+
+    out = {
+        "slug": slug,
+        "title": title,
+        "original_url": url,
+        "m3u8_url": m3u8_url,
+        # stored so the master index can be rebuilt without refetching the page
+        "thumbnail_url": thumbnail_url,
+        "audio_path": str(audio_path),
+        "transcription_txt": str(output_root / f"transcriptions/{slug}.txt"),
+        "transcription_srt": str(output_root / f"transcriptions/{slug}.srt"),
+    }
+
+    with open(output_root / f"{slug}.json", "w", encoding="utf-8") as f:
+        json.dump(out, f, indent=4)
+
+    return {"title": title, "thumbnail": thumbnail_url, "slug": slug}
+
+
+def _load_summary(json_path):
+    """
+    Rebuild a master index entry from an existing per-debate JSON file.
+    Returns None if the file is unreadable or lacks the needed keys.
+    """
+
+    try:
+        with open(json_path, encoding="utf-8") as f:
+            stored = json.load(f)
+        return {
+            "title": stored["title"],
+            "thumbnail": stored["thumbnail_url"],
+            "slug": stored["slug"],
+        }
+    except (OSError, ValueError, KeyError, TypeError):
+        return None
+
+
+def main(args):
+    """
+    Process every debate listed in args.input and write the master index
+    to args.output_master_json.
+
+    Debates whose "{slug}.json" already exists are not reprocessed unless
+    args.force is set, but they are still listed in the master index.
+    """
+
+    input_path = Path(args.input)
+    output_root = Path(args.output_root)
+    output_root.mkdir(exist_ok=True, parents=True)
+
+    with open(input_path, "r", encoding="utf-8") as f:
+        data = yaml.safe_load(f) or []  # an empty file loads as None
+
+    master_json = []
+    for debate in data:
+        # process_debate writes "{slug}.json", so that is the file to look for
+        output_path = output_root / f"{slugify(debate['title'])}.json"
+
+        summary = None
+        if output_path.exists() and not args.force:
+            summary = _load_summary(output_path)
+
+        if summary is None:
+            try:
+                summary = process_debate(**debate, output_root=output_root)
+            except (requests.RequestException, subprocess.CalledProcessError) as err:
+                # one broken debate should not prevent the others from being processed
+                logging.error("Failed to process %r: %s", debate["title"], err)
+                continue
+
+        # process_debate returns None when it could not locate the stream
+        if summary is not None:
+            master_json.append(summary)
+
+    master_path = Path(args.output_master_json)
+    master_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(master_path, "w", encoding="utf-8") as f:
+        json.dump(master_json, f, indent=4)
+
+
+def parse_args(argv=None):
+    """
+    Parse the command line. The input file and output root may be given as
+    --input / --output_root or positionally (INPUT [OUTPUT_ROOT]), which is
+    how the GitHub workflow invokes the script.
+    """
+
+    parser = argparse.ArgumentParser(description="Download and transcribe debates")
+    parser.add_argument("paths", nargs="*", metavar="PATH", help="INPUT [OUTPUT_ROOT]")
+    parser.add_argument("--input", type=str, default="debates.yaml")
+    parser.add_argument("--output_root", "--output-root", type=str, default="src/static/debates")
+    parser.add_argument("--output-master-json", "--output_master_json", type=str, default="src/debates.json")
+    parser.add_argument("--force", action="store_true")
+
+    # intermixed parsing lets positionals appear before or after the options
+    args = parser.parse_intermixed_args(argv)
+
+    paths = getattr(args, "paths", None) or []
+    if len(paths) > 2:
+        parser.error("expected at most two positional paths: INPUT [OUTPUT_ROOT]")
+    if paths:
+        args.input = paths[0]
+    if len(paths) > 1:
+        args.output_root = paths[1]
+    return args
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    main(parse_args())
diff --git a/src/App.svelte b/src/App.svelte
index 1f31354..7aff68c 100644
--- a/src/App.svelte
+++ b/src/App.svelte
@@ -1,47 +1,30 @@
-
-
- - - - - - -
-

Vite + Svelte

- -
- -
- -

- Check out SvelteKit, the official Svelte app framework powered by Vite! -

- -

- Click on the Vite and Svelte logos to learn more -

-
- + +
+ {#each debates as { title, thumbnail, slug }} +
(window.location.href = `/${slug}`)}> + {title} +

{title}

+
+ {/each} +