# start.py
import os
import time
from functions import generate_wav, split, cross_fade
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder
from ddsp.core import upsample
import logging
import torch
import librosa
import argparse
import numpy as np
import soundfile as sf
import pyworld as pw
import parselmouth
from ast import literal_eval
from slicer import Slicer
from enhancer import Enhancer
from tqdm import tqdm

# Modifiable
TEXT_INPUT_PATH = "./text.txt"  # Path of the text file to be converted
VOICE = "zh-CN-XiaoyiNeural"    # You can change this to any other available voice
OUTPUT_NAME = "output"          # Base name of the output files
MODEL_PATH = "./model_best.pt"  # Model path (make sure that config.yaml is in the same folder)
KEY_CHANGE = 0                  # Key change (number of semitones)
PITCH_EXTRACTOR = 'crepe'       # Pitch extractor type: parselmouth, dio, harvest, crepe

# Unmodifiable
OUTPUT_PATH = "./output_tts/"
F0_MIN = 50        # min f0 (Hz)
F0_MAX = 1100      # max f0 (Hz)
THRESHOLD = -60    # response threshold (dB)

if __name__ == "__main__":
    logging.getLogger('numba').setLevel(logging.WARNING)
    if not os.path.exists(OUTPUT_PATH):
        os.mkdir(OUTPUT_PATH)

    # A line break marks a segment boundary; each segment generates its own audio file
    with open(TEXT_INPUT_PATH, encoding="utf-8") as text:
        lines = text.readlines()
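
    # Step 1: synthesize a raw TTS wav for every line of text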
    for i, line in enumerate(lines):
        output_file = OUTPUT_PATH + OUTPUT_NAME + "_ori_" + str(i) + ".wav"
        generate_wav(line, VOICE, output_file)
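
    # Step 2: load the DDSP model and the units (content) encoder specified in its config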
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model, args = load_model(MODEL_PATH, device=device)
    units_encoder = Units_Encoder(
        args.data.encoder,
        args.data.encoder_ckpt,
        args.data.encoder_sample_rate,
        args.data.encoder_hop_size,
        device=device)
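
    # Step 3: convert each generated TTS wav with the DDSP model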
    for i in range(len(lines)):
        input_wav_file = OUTPUT_PATH + OUTPUT_NAME + "_ori_" + str(i) + ".wav"
        while not os.path.exists(input_wav_file):
            print("Waiting for file generation")
            time.sleep(1)  # poll instead of busy-waiting while the TTS file is written
        audio, sample_rate = librosa.load(input_wav_file, sr=None)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio)
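
        # Hop size in input samples per model frame (the model's block size rescaled to the input sample rate)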
        hop_size = args.data.block_size * sample_rate / args.data.sampling_rate

        print('Pitch extractor type: ' + PITCH_EXTRACTOR)
        pitch_extractor = F0_Extractor(
            PITCH_EXTRACTOR,
            sample_rate,
            hop_size,
            float(F0_MIN),
            float(F0_MAX))
        print('Extracting the pitch curve of the input audio...')
        f0 = pitch_extractor.extract(audio, uv_interp=True, device=device)
        f0 = torch.from_numpy(f0).float().to(device).unsqueeze(-1).unsqueeze(0)
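        # Transpose by KEY_CHANGE semitones: each semitone scales f0 by 2 ** (1 / 12)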
        f0 = f0 * 2 ** (float(KEY_CHANGE) / 12)

        print('Extracting the volume envelope of the input audio...')
        volume_extractor = Volume_Extractor(hop_size)
        volume = volume_extractor.extract(audio)
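        # Silence mask: frames whose volume exceeds THRESHOLD (dB converted to linear amplitude),
        # dilated by ±4 frames with a running max, then upsampled to sample resolution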
        mask = (volume > 10 ** (float(THRESHOLD) / 20)).astype('float')
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n : n + 9]) for n in range(len(mask) - 8)])
        mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, args.data.block_size).squeeze(-1)
        volume = torch.from_numpy(volume).float().to(device).unsqueeze(-1).unsqueeze(0)
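
        # Target speaker: fixed speaker id 1, no speaker mixing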
        spk_mix_dict = None
        spk_id = torch.LongTensor(np.array([[1]])).to(device)
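
        # Slice the audio into (start_frame, waveform) segments and convert them one at a time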
        result = np.zeros(0)
        current_length = 0
        segments = split(audio, sample_rate, hop_size)
        print('Cut the input audio into ' + str(len(segments)) + ' slices')
        with torch.no_grad():
            for segment in tqdm(segments):
                start_frame = segment[0]
                seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(device)
                seg_units = units_encoder.encode(seg_input, sample_rate, hop_size)
                seg_f0 = f0[:, start_frame : start_frame + seg_units.size(1), :]
                seg_volume = volume[:, start_frame : start_frame + seg_units.size(1), :]
                seg_output, _, (s_h, s_n) = model(seg_units, seg_f0, seg_volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict)
                seg_output *= mask[:, start_frame * args.data.block_size : (start_frame + seg_units.size(1)) * args.data.block_size]
                output_sample_rate = args.data.sampling_rate
                seg_output = seg_output.squeeze().cpu().numpy()
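                # Put the segment back at its original position: zero-fill any gap before it,
                # or cross-fade where it overlaps audio already written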
                silent_length = round(start_frame * args.data.block_size * output_sample_rate / args.data.sampling_rate) - current_length
                if silent_length >= 0:
                    result = np.append(result, np.zeros(silent_length))
                    result = np.append(result, seg_output)
                else:
                    result = cross_fade(result, seg_output, current_length + silent_length)
                current_length = current_length + silent_length + len(seg_output)

        wav_gen = OUTPUT_PATH + OUTPUT_NAME + "_target_" + str(i) + ".wav"
        sf.write(wav_gen, result, output_sample_rate)