-
Notifications
You must be signed in to change notification settings - Fork 1
/
lyricist.py
73 lines (52 loc) · 1.99 KB
/
lyricist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import re
import whisperx
from text_cleaner import BasicLyricsCleaner
def generate_aligned_lyrics_timestamp(timestamps, audio_length, clean_title):
    """
    Build a gap-free lyrics timeline from transcribed segments.

    The result starts with a title entry beginning at 0, continues with one
    entry per transcribed segment, and closes with another title entry that
    runs up to ``audio_length``. Each segment entry ends where the next
    segment starts, so the entries cover the timeline without gaps; the last
    segment entry ends at that segment's own ``"end"``.

    Args:
        timestamps (list): Transcribed segments; each is a mapping with
            ``"start"``, ``"end"`` and ``"text"`` keys (presumably seconds,
            as produced by the transcription step -- confirm with caller).
        audio_length (float): Length of audio, used as the final end time.
        clean_title (str): Title of the song; it is passed through the
            lyrics cleaner before being used as filler text.

    Returns:
        aligned_lyrics (list): List of dicts with ``"start"``, ``"end"`` and
            ``"lyrics"`` keys. When ``timestamps`` is empty, a single title
            entry spanning ``0`` to ``audio_length`` is returned.
    """
    lyrics_cleaner = BasicLyricsCleaner()
    clean_title = lyrics_cleaner.clean(
        clean_title, convert_verbs=True, replace_pronouns=False, correct_spelling=False
    )

    # Nothing was transcribed: there is no "last segment" to read an end
    # time from, so fall back to the title covering the whole audio instead
    # of failing on timestamps[-1].
    if not timestamps:
        return [{"start": 0, "end": audio_length, "lyrics": clean_title}]

    # Leading 0 is the start of the title entry that precedes the first segment.
    starts = [0] + [round(segment["start"], 1) for segment in timestamps]

    lyrics = [clean_title]
    for segment in timestamps:
        text = lyrics_cleaner.clean(segment["text"])
        # Very short lines (fewer than three words) are padded with the title.
        if len(text.split()) < 3:
            text = text + f" {clean_title} "
        lyrics.append(text)

    # Every entry ends where the following one starts; the last transcribed
    # segment keeps its own (rounded) end time.
    last_segment_end = round(timestamps[-1]["end"], 1)
    ends = starts[1:] + [last_segment_end]

    # Closing title entry: from the end of the last segment to the end of audio.
    starts.append(last_segment_end)
    lyrics.append(clean_title)
    ends.append(audio_length)

    return [
        {"start": start, "end": end, "lyrics": text}
        for start, end, text in zip(starts, ends, lyrics)
    ]
def load_whisper(model="base", device="cuda", download_root=None):
    """
    Load a whisperx model.

    Args:
        model (str): Model name handed to ``whisperx.load_model``.
        device (str): Device handed to ``whisperx.load_model``.
        download_root (str | None): Directory passed as ``download_root``;
            when ``None``, ``~/.cache`` under the current user's home is used.

    Returns:
        The object returned by ``whisperx.load_model``.
    """
    cache_dir = (
        os.path.join(os.path.expanduser("~"), ".cache")
        if download_root is None
        else download_root
    )
    return whisperx.load_model(model, device, download_root=cache_dir)
def transcribe_audio_segments(audio_file, model, device="cuda"):
    """
    Transcribe an audio file and align the resulting segments.

    Args:
        audio_file: Audio input forwarded to ``model.transcribe`` and
            ``whisperx.align``.
        model: Object exposing ``transcribe`` (e.g. the result of
            ``load_whisper``).
        device (str): Device used for the alignment model and alignment.

    Returns:
        The ``"segments"`` entry of the ``whisperx.align`` result.
    """
    transcription = model.transcribe(audio_file, fp16=True, verbose=False)
    # The alignment model is always loaded for English.
    align_model, align_metadata = whisperx.load_align_model(
        language_code="en", device=device
    )
    aligned = whisperx.align(
        transcription["segments"], align_model, align_metadata, audio_file, device
    )
    return aligned["segments"]