Skip to content

Commit

Permalink
Merge pull request #2 from colobas/dev
Browse files (browse the repository at this point in the history)
new video and a couple of changes
  • Loading branch information
colobas authored Feb 7, 2024
2 parents 9d735da + 7b04e84 commit a711fcb
Show file tree
Hide file tree
Showing 190 changed files with 2,583 additions and 15 deletions.
3 changes: 3 additions & 0 deletions debates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@

- title: BE vs PSD
url: https://sicnoticias.pt/especiais/eleicoes-legislativas/2024-02-06-Debate-entre-BE-e-PSD-Quem-e-que-sabe-salvar-o-SNS--a252ab7c

- title: Chega vs IL
url: https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias
3 changes: 1 addition & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ dependencies:
- ffmpeg=6.1.1
- pip:
- beautifulsoup4
- webvtt-to-json
- git+https://github.com/m-bain/whisperx.git@8227807#egg=whisperx
- torchaudio==2.0.0
- webvtt
- webvtt-py
27 changes: 15 additions & 12 deletions process_debates.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def transcribe_audio(audio_path, output_root):

name = audio_path.stem

if not (output_root / f"transcriptions/{name}.vtt").exists():
if not (output_root / f"transcriptions/{name}.json").exists():
Path(f"{output_root}/transcriptions").mkdir(exist_ok=True, parents=True)

cmd = [
Expand All @@ -177,19 +177,19 @@ def transcribe_audio(audio_path, output_root):

subprocess.run(cmd)

# keep only the .vtt file
for f in Path(f"{output_root}/transcriptions").glob(f"{name}.*"):
if f.suffix not in [".vtt"]:
f.unlink()
# keep only the .vtt file
for f in Path(f"{output_root}/transcriptions").glob(f"{name}.*"):
if f.suffix not in [".vtt"]:
f.unlink()

# convert the vtt to json
webvtt_to_json(f"{output_root}/transcriptions/{name}.vtt", f"{output_root}/transcriptions/{name}.json")
# convert the vtt to json
webvtt_to_json(f"{output_root}/transcriptions/{name}.vtt", f"{output_root}/transcriptions/{name}.json")

# remove the vtt
(output_root / f"transcriptions/{name}.vtt").unlink()
# remove the vtt
(output_root / f"transcriptions/{name}.vtt").unlink()


def process_debate(*, title, url, output_root):
def process_debate(*, title, url, output_root, skip_transcription=False):
"""
Process a debate from the input data
"""
Expand Down Expand Up @@ -223,7 +223,9 @@ def process_debate(*, title, url, output_root):
headers = None

get_audio_and_video(m3u8_url, audio_path, hls_path, headers=headers)
transcribe_audio(audio_path, output_root)

if not skip_transcription:
transcribe_audio(audio_path, output_root)

out = {
"slug": slug,
Expand Down Expand Up @@ -252,7 +254,7 @@ def main(args):
if output_path.exists() and not args.force:
continue

summary = process_debate(**debate, output_root=output_root)
summary = process_debate(**debate, output_root=output_root, skip_transcription=args.skip_transcription)
master_json.append(summary)

with open(args.output_master_json, "w") as f:
Expand All @@ -265,5 +267,6 @@ def main(args):
parser.add_argument("--output_root", type=str, default="public/debates")
parser.add_argument("--output-master-json", type=str, default="src/debates.json")
parser.add_argument("--force", action="store_true")
parser.add_argument("--skip-transcription", action="store_true")
args = parser.parse_args()
main(args)
Binary file added public/debates/audio/chega-vs-il.mp3
Binary file not shown.
5 changes: 5 additions & 0 deletions public/debates/chega-vs-il.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"slug": "chega-vs-il",
"title": "Chega vs IL",
"original_url": "https://www.rtp.pt/play/p12899/e746368/debates-legislativas-2024-sicsic-noticias"
}
Loading

0 comments on commit a711fcb

Please sign in to comment.