Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/MassiveScaleToucan' into Massive…
Browse files Browse the repository at this point in the history
…ScaleToucan
  • Loading branch information
Flux9665 committed Oct 17, 2024
2 parents 2ec093d + 20ce6c8 commit 6355e67
Show file tree
Hide file tree
Showing 24 changed files with 826 additions and 139 deletions.
15 changes: 15 additions & 0 deletions .github/FUNDING.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# These are supported funding model platforms

github: [Flux9665] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
thanks_dev: # Replace with a single thanks.dev username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
7 changes: 5 additions & 2 deletions InferenceInterfaces/ToucanTTSInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,9 @@ def forward(self,
energy=None,
input_is_phones=False,
return_plot_as_filepath=False,
loudness_in_db=-24.0,
prosody_creativity=0.1):
loudness_in_db=-29.0,
prosody_creativity=0.1,
return_everything=False):
"""
duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
1.0 means no scaling happens, higher values increase durations for the whole
Expand Down Expand Up @@ -233,6 +234,8 @@ def forward(self,
plt.savefig("tmp.png")
plt.close()
return wave, sr, "tmp.png"
if return_everything:
return wave, mel, durations, pitch
return wave, sr

def read_to_file(self,
Expand Down
2 changes: 1 addition & 1 deletion Modules/ToucanTTS/InferenceToucanTTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def _forward(self,
mask=text_masks.float(),
n_timesteps=20,
temperature=prosody_creativity,
c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations
c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)

# modifying the predictions with control parameters
for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):
Expand Down
2 changes: 1 addition & 1 deletion Preprocessing/multilinguality/create_lang_dist_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from Preprocessing.multilinguality.SimilaritySolver import SimilaritySolver
from Utility.utils import load_json_from_path

LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_oracle_dist.json"
LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_l1_dist.json"
ISO_LOOKUP_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json")
ISO_TO_FULLNAME_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
LANG_PAIRS_MAP_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_map_dist.json")
Expand Down
68 changes: 31 additions & 37 deletions Preprocessing/multilinguality/eval_lang_emb_approximation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import matplotlib.pyplot as plt
from Utility.utils import load_json_from_path


def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embeddings, weighted_avg=False, min_n_langs=5, max_n_langs=30, threshold_percentile=95, loss_fn="MSE"):
df = pd.read_csv(csv_path, sep="|")

Expand All @@ -23,7 +24,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe

features_per_closest_lang = 2
# for combined, df has up to 5 features (if containing individual distances) per closest lang + 1 target lang column
if "combined_dist_0" in df.columns:
if "combined_dist_0" in df.columns:
if "map_dist_0" in df.columns:
features_per_closest_lang += 1
if "asp_dist_0" in df.columns:
Expand Down Expand Up @@ -63,7 +64,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
except KeyError:
print(f"KeyError: Unable to retrieve language embedding for {row.target_lang}")
continue
avg_emb = torch.zeros([16])
avg_emb = torch.zeros([32])
dists = [getattr(row, d) for i, d in enumerate(closest_dist_columns) if i < min_n_langs or getattr(row, d) < threshold]
langs = [getattr(row, l) for l in closest_lang_columns[:len(dists)]]

Expand All @@ -77,7 +78,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
lang_emb = language_embeddings[iso_lookup[-1][lang]]
avg_emb += lang_emb
normalization_factor = len(langs)
avg_emb /= normalization_factor # normalize
avg_emb /= normalization_factor # normalize
current_loss = loss_fn(avg_emb, y).item()
all_losses.append(current_loss)

Expand All @@ -95,12 +96,10 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
parser.add_argument("--loss_fn", choices=["MSE", "L1"], type=str, default="MSE", help="loss function used")
args = parser.parse_args()
csv_paths = [
"distance_datasets/dataset_map_top30_furthest.csv",
"distance_datasets/dataset_random_top30.csv",
"distance_datasets/dataset_asp_top30.csv",
"distance_datasets/dataset_tree_top30.csv",
"distance_datasets/dataset_map_top30.csv",
"distance_datasets/dataset_combined_top30_indiv-dists.csv",
"distance_datasets/dataset_tree_top30.csv",
"distance_datasets/dataset_learned_top30.csv",
"distance_datasets/dataset_oracle_top30.csv",
]
Expand All @@ -112,49 +111,44 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
OUT_DIR = "plots"
os.makedirs(OUT_DIR, exist_ok=True)

fig, ax = plt.subplots(figsize=(3.15022, 3.15022*(2/3)), constrained_layout=True)
fig, ax = plt.subplots(figsize=(6, 4))
plt.ylabel(args.loss_fn)
for i, csv_path in enumerate(csv_paths):
print(f"csv_path: {os.path.basename(csv_path)}")
for condition in weighted:
losses = compute_loss_for_approximated_embeddings(csv_path,
iso_lookup,
lang_embs,
condition,
min_n_langs=args.min_n_langs,
max_n_langs=args.max_n_langs,
threshold_percentile=args.threshold_percentile,
loss_fn=args.loss_fn)
losses = compute_loss_for_approximated_embeddings(csv_path,
iso_lookup,
lang_embs,
condition,
min_n_langs=args.min_n_langs,
max_n_langs=args.max_n_langs,
threshold_percentile=args.threshold_percentile,
loss_fn=args.loss_fn)
print(f"weighted average: {condition} | mean loss: {np.mean(losses)}")
losses_of_multiple_datasets.append(losses)

bp_dict = ax.boxplot(losses_of_multiple_datasets,
labels = [
"map furthest",
"random",
"inv. ASP",
"tree",
"map",
"avg",
"meta-learned",
"oracle",
],
bp_dict = ax.boxplot(losses_of_multiple_datasets,
labels=["Random",
"Inverse ASP",
"Map Distance",
"Tree Distance",
"Learned Distance",
"Oracle"],
patch_artist=True,
boxprops=dict(facecolor = "lightblue",
boxprops=dict(facecolor="lightblue",
),
showfliers=False,
widths=0.45
)

showfliers=False,
widths=0.55
)
# major ticks every 0.1, minor ticks every 0.05, between 0.0 and 1.0
major_ticks = np.arange(0, 0.6, 0.1)
minor_ticks = np.arange(0, 0.6, 0.05)
major_ticks = np.arange(0, 1.0, 0.1)
minor_ticks = np.arange(0, 1.0, 0.05)
ax.set_yticks(major_ticks)
ax.set_yticks(minor_ticks, minor=True)
# horizontal grid lines for minor and major ticks
ax.grid(which='both', linestyle='-', color='lightgray', linewidth=0.3, axis='y')
ax.set_aspect(4.5)
plt.title(f"min. {args.min_n_langs} kNN, max. {args.max_n_langs}\nthreshold: {args.threshold_percentile}th-percentile distance of {args.max_n_langs}th-closest language")
# plt.title(f"Using between {args.min_n_langs} and {args.max_n_langs} Nearest Neighbors to approximate an unseen Embedding")
plt.xticks(rotation=45)

plt.savefig(os.path.join(OUT_DIR, "example_boxplot_release.pdf"), bbox_inches='tight')
plt.tight_layout()
plt.show()
# plt.savefig(os.path.join(OUT_DIR, "example_boxplot_release.pdf"), bbox_inches='tight')
16 changes: 7 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@
IMS Toucan is a toolkit for training, using, and teaching state-of-the-art Text-to-Speech Synthesis, developed at the
**Institute for Natural Language Processing (IMS), University of Stuttgart, Germany**, official home of the massively multilingual ToucanTTS system. Our system is fast, controllable, and doesn't require a ton of compute.

If you find this repo useful, consider giving it a star⭐. Large numbers make me happy, and they are quite motivating.

<br>

![image](Utility/toucan.png)

<br>

If you find this repo useful, consider giving it a star. ⭐ Large numbers make me happy, and they are very motivating. If you want to motivate me even more, you can also consider [sponsoring this toolkit](https://github.com/sponsors/Flux9665). We only use GitHub Sponsors for this; there are scammers on other platforms who pretend to be the creator. Don't let them fool you. The code and the models are absolutely free, and thanks to the generous support of Hugging Face🤗, we even have an [instance of the model running on GPU](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS) that is free for anyone to use.

---
<br>

Expand All @@ -29,17 +31,13 @@ If you find this repo useful, consider giving it a star⭐. Large numbers make m

[Cloning prosody across speakers](https://toucanprosodycloningdemo.github.io)

[Multi-lingual and multi-speaker audios](https://multilingualtoucan.github.io/)

[Massively-Multi-Lingual audios and study setup](https://anondemos.github.io/MMDemo)

### Interactive Demo

[Check out our interactive massively-multi-lingual demo on Huggingface🤗](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS)
[Check out our interactive massively-multi-lingual demo on Hugging Face🤗](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS)

### Dataset

[We have also published a massively multilingual TTS dataset on Huggingface🤗](https://huggingface.co/datasets/Flux9665/BibleMMS)
[We have also published a massively multilingual TTS dataset on Hugging Face🤗](https://huggingface.co/datasets/Flux9665/BibleMMS)

---
<br>
Expand Down Expand Up @@ -94,7 +92,7 @@ absolute).

#### Pretrained Models

You don't need to use pretrained models, but it can speed things up tremendously. They will be downloaded on the fly automatically when they are needed, thanks to Huggingface🤗 and [VB](https://github.com/Vaibhavs10) in particular.
You don't need to use pretrained models, but it can speed things up tremendously. They will be downloaded on the fly automatically when they are needed, thanks to Hugging Face🤗 and [VB](https://github.com/Vaibhavs10) in particular.

#### \[optional] eSpeak-NG

Expand Down
9 changes: 5 additions & 4 deletions Recipes/AlignerPipeline.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import torch
from torch.utils.data import ConcatDataset

from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner
from Utility.corpus_preparation import prepare_aligner_corpus
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR


def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner
from Utility.corpus_preparation import prepare_aligner_corpus
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
11 changes: 6 additions & 5 deletions Recipes/BigVGAN_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
import torch
import wandb

from Modules.Vocoder.BigVGAN import BigVGAN
from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR


def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
from Modules.Vocoder.BigVGAN import BigVGAN
from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.storage_config import MODELS_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
11 changes: 6 additions & 5 deletions Recipes/BigVGAN_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import torch
import wandb

from Modules.Vocoder.BigVGAN import BigVGAN
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR


def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
from Modules.Vocoder.BigVGAN import BigVGAN
from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.storage_config import MODELS_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
11 changes: 6 additions & 5 deletions Recipes/HiFiGAN_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
import torch
import wandb

from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR


def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.storage_config import MODELS_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
11 changes: 6 additions & 5 deletions Recipes/HiFiGAN_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
import torch
import wandb

from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR


def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
from Modules.Vocoder.HiFiGAN_train_loop import train_loop
from Utility.storage_config import MODELS_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
15 changes: 9 additions & 6 deletions Recipes/ToucanTTS_IntegrationTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,18 @@
import torch
import wandb

from Modules.ToucanTTS.ToucanTTS import ToucanTTS
from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_tts_corpus
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR


def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
from torch.utils.data import ConcatDataset

from Modules.ToucanTTS.ToucanTTS import ToucanTTS
from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_tts_corpus
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down Expand Up @@ -78,4 +81,4 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb
train_samplers=[train_sampler],
gpu_count=gpu_count)
if use_wandb:
wandb.finish()
wandb.finish()
14 changes: 8 additions & 6 deletions Recipes/ToucanTTS_Massive_English_stage1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@

import torch
import wandb
from torch.utils.data import ConcatDataset

from Modules.ToucanTTS.ToucanTTS import ToucanTTS
from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_tts_corpus
from Utility.path_to_transcript_dicts import *
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR


def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
from torch.utils.data import ConcatDataset

from Modules.ToucanTTS.ToucanTTS import ToucanTTS
from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
from Utility.corpus_preparation import prepare_tts_corpus
from Utility.storage_config import MODELS_DIR
from Utility.storage_config import PREPROCESSING_DIR

if gpu_id == "cpu":
device = torch.device("cpu")
else:
Expand Down
Loading

0 comments on commit 6355e67

Please sign in to comment.