Merge remote-tracking branch 'origin/MassiveScaleToucan' into MassiveScaleToucan
DigitalPhonetics · Oct 17, 2024 · 6355e67 · 6355e67
2 parents 2ec093d + 20ce6c8
commit 6355e67
Show file tree

Hide file tree

Showing 24 changed files with 826 additions and 139 deletions.
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
@@ -0,0 +1,15 @@
+# These are supported funding model platforms
+
+github: [Flux9665] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+polar: # Replace with a single Polar username
+buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
+thanks_dev: # Replace with a single thanks.dev username
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/InferenceInterfaces/ToucanTTSInterface.py b/InferenceInterfaces/ToucanTTSInterface.py
@@ -143,8 +143,9 @@ def forward(self,
                 energy=None,
                 input_is_phones=False,
                 return_plot_as_filepath=False,
-                loudness_in_db=-24.0,
-                prosody_creativity=0.1):
+                loudness_in_db=-29.0,
+                prosody_creativity=0.1,
+                return_everything=False):
         """
         duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
                                      1.0 means no scaling happens, higher values increase durations for the whole
@@ -233,6 +234,8 @@ def forward(self,
                 plt.savefig("tmp.png")
                 plt.close()
                 return wave, sr, "tmp.png"
+        if return_everything:
+            return wave, mel, durations, pitch
         return wave, sr
 
     def read_to_file(self,

diff --git a/Modules/ToucanTTS/InferenceToucanTTS.py b/Modules/ToucanTTS/InferenceToucanTTS.py
@@ -242,7 +242,7 @@ def _forward(self,
                                                                              mask=text_masks.float(),
                                                                              n_timesteps=20,
                                                                              temperature=prosody_creativity,
-                                                                             c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations
+                                                                             c=utterance_embedding)), min=0.0).long().squeeze(1) if gold_durations is None else gold_durations.squeeze(1)
 
         # modifying the predictions with control parameters
         for phoneme_index, phoneme_vector in enumerate(text_tensors.squeeze(0)):

diff --git a/Preprocessing/multilinguality/create_lang_dist_dataset.py b/Preprocessing/multilinguality/create_lang_dist_dataset.py
@@ -10,7 +10,7 @@
 from Preprocessing.multilinguality.SimilaritySolver import SimilaritySolver
 from Utility.utils import load_json_from_path
 
-LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_oracle_dist.json"
+LANG_PAIRS_ORACLE_PATH = "lang_1_to_lang_2_to_l1_dist.json"
 ISO_LOOKUP_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_lookup.json")
 ISO_TO_FULLNAME_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
 LANG_PAIRS_MAP_PATH = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="lang_1_to_lang_2_to_map_dist.json")

diff --git a/Preprocessing/multilinguality/eval_lang_emb_approximation.py b/Preprocessing/multilinguality/eval_lang_emb_approximation.py
@@ -13,6 +13,7 @@
 import matplotlib.pyplot as plt
 from Utility.utils import load_json_from_path
 
+
 def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embeddings, weighted_avg=False, min_n_langs=5, max_n_langs=30, threshold_percentile=95, loss_fn="MSE"):
     df = pd.read_csv(csv_path, sep="|")
 
@@ -23,7 +24,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
 
     features_per_closest_lang = 2
     # for combined, df has up to 5 features (if containing individual distances) per closest lang + 1 target lang column
-    if "combined_dist_0" in df.columns: 
+    if "combined_dist_0" in df.columns:
         if "map_dist_0" in df.columns:
             features_per_closest_lang += 1
         if "asp_dist_0" in df.columns:
@@ -63,7 +64,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
         except KeyError:
             print(f"KeyError: Unable to retrieve language embedding for {row.target_lang}")
             continue
-        avg_emb = torch.zeros([16])
+        avg_emb = torch.zeros([32])
         dists = [getattr(row, d) for i, d in enumerate(closest_dist_columns) if i < min_n_langs or getattr(row, d) < threshold]
         langs = [getattr(row, l) for l in closest_lang_columns[:len(dists)]]
 
@@ -77,7 +78,7 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
                 lang_emb = language_embeddings[iso_lookup[-1][lang]]
                 avg_emb += lang_emb
             normalization_factor = len(langs)
-        avg_emb /= normalization_factor # normalize
+        avg_emb /= normalization_factor  # normalize
         current_loss = loss_fn(avg_emb, y).item()
         all_losses.append(current_loss)
 
@@ -95,12 +96,10 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
     parser.add_argument("--loss_fn", choices=["MSE", "L1"], type=str, default="MSE", help="loss function used")
     args = parser.parse_args()
     csv_paths = [
-        "distance_datasets/dataset_map_top30_furthest.csv",
         "distance_datasets/dataset_random_top30.csv",
         "distance_datasets/dataset_asp_top30.csv",
-        "distance_datasets/dataset_tree_top30.csv",
         "distance_datasets/dataset_map_top30.csv",
-        "distance_datasets/dataset_combined_top30_indiv-dists.csv",
+        "distance_datasets/dataset_tree_top30.csv",
         "distance_datasets/dataset_learned_top30.csv",
         "distance_datasets/dataset_oracle_top30.csv",
     ]
@@ -112,49 +111,44 @@ def compute_loss_for_approximated_embeddings(csv_path, iso_lookup, language_embe
     OUT_DIR = "plots"
     os.makedirs(OUT_DIR, exist_ok=True)
 
-    fig, ax = plt.subplots(figsize=(3.15022, 3.15022*(2/3)), constrained_layout=True)
+    fig, ax = plt.subplots(figsize=(6, 4))
     plt.ylabel(args.loss_fn)
     for i, csv_path in enumerate(csv_paths):
         print(f"csv_path: {os.path.basename(csv_path)}")
         for condition in weighted:
-            losses = compute_loss_for_approximated_embeddings(csv_path, 
-                                                         iso_lookup, 
-                                                         lang_embs, 
-                                                         condition, 
-                                                         min_n_langs=args.min_n_langs, 
-                                                         max_n_langs=args.max_n_langs,
-                                                         threshold_percentile=args.threshold_percentile,
-                                                         loss_fn=args.loss_fn)
+            losses = compute_loss_for_approximated_embeddings(csv_path,
+                                                              iso_lookup,
+                                                              lang_embs,
+                                                              condition,
+                                                              min_n_langs=args.min_n_langs,
+                                                              max_n_langs=args.max_n_langs,
+                                                              threshold_percentile=args.threshold_percentile,
+                                                              loss_fn=args.loss_fn)
             print(f"weighted average: {condition} | mean loss: {np.mean(losses)}")
             losses_of_multiple_datasets.append(losses)
 
-    bp_dict = ax.boxplot(losses_of_multiple_datasets, 
-                         labels = [
-                             "map furthest",
-                             "random", 
-                             "inv. ASP", 
-                             "tree", 
-                             "map", 
-                             "avg", 
-                             "meta-learned", 
-                             "oracle", 
-                             ], 
+    bp_dict = ax.boxplot(losses_of_multiple_datasets,
+                         labels=["Random",
+                                 "Inverse ASP",
+                                 "Map Distance",
+                                 "Tree Distance",
+                                 "Learned Distance",
+                                 "Oracle"],
                          patch_artist=True,
-                         boxprops=dict(facecolor = "lightblue", 
+                         boxprops=dict(facecolor="lightblue",
                                        ),
-                        showfliers=False,
-                        widths=0.45
-                        )
-
+                         showfliers=False,
+                         widths=0.55
+                         )
    # major ticks every 0.1, minor ticks every 0.05, from 0.0 up to (but not including) 1.0
-    major_ticks = np.arange(0, 0.6, 0.1)
-    minor_ticks = np.arange(0, 0.6, 0.05)
+    major_ticks = np.arange(0, 1.0, 0.1)
+    minor_ticks = np.arange(0, 1.0, 0.05)
     ax.set_yticks(major_ticks)
     ax.set_yticks(minor_ticks, minor=True)
     # horizontal grid lines for minor and major ticks
     ax.grid(which='both', linestyle='-', color='lightgray', linewidth=0.3, axis='y')
-    ax.set_aspect(4.5)
-    plt.title(f"min. {args.min_n_langs} kNN, max. {args.max_n_langs}\nthreshold: {args.threshold_percentile}th-percentile distance of {args.max_n_langs}th-closest language")
+    # plt.title(f"Using between {args.min_n_langs} and {args.max_n_langs} Nearest Neighbors to approximate an unseen Embedding")
     plt.xticks(rotation=45)
-
-    plt.savefig(os.path.join(OUT_DIR, "example_boxplot_release.pdf"), bbox_inches='tight')
+    plt.tight_layout()
+    plt.show()
+    # plt.savefig(os.path.join(OUT_DIR, "example_boxplot_release.pdf"), bbox_inches='tight')
diff --git a/README.md b/README.md
@@ -12,12 +12,14 @@
 IMS Toucan is a toolkit for training, using, and teaching state-of-the-art Text-to-Speech Synthesis, developed at the
 **Institute for Natural Language Processing (IMS), University of Stuttgart, Germany**, official home of the massively multilingual ToucanTTS system. Our system is fast, controllable, and doesn't require a ton of compute.
 
-If you find this repo useful, consider giving it a star⭐. Large numbers make me happy, and they are quite motivating.
-
 <br>
 
 ![image](Utility/toucan.png)
 
+<br>
+
+If you find this repo useful, consider giving it a star. ⭐ Large numbers make me happy, and they are very motivating. If you want to motivate me even more, you can also consider [sponsoring this toolkit](https://github.com/sponsors/Flux9665). We only use GitHub Sponsors for this; there are scammers on other platforms who pretend to be the creator. Don't let them fool you. The code and the models are absolutely free, and thanks to the generous support of Hugging Face🤗, we even have an [instance of the model running on GPU](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS) free for anyone to use.
+
 --- 
 <br>
 
@@ -29,17 +31,13 @@ If you find this repo useful, consider giving it a star⭐. Large numbers make m
 
 [Cloning prosody across speakers](https://toucanprosodycloningdemo.github.io)
 
-[Multi-lingual and multi-speaker audios](https://multilingualtoucan.github.io/)
-
-[Massively-Multi-Lingual audios and study setup](https://anondemos.github.io/MMDemo)
-
 ### Interactive Demo
 
-[Check out our interactive massively-multi-lingual demo on Huggingface🤗](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS)
+[Check out our interactive massively-multi-lingual demo on Hugging Face🤗](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS)
 
 ### Dataset
 
-[We have also published a massively multilingual TTS dataset on Huggingface🤗](https://huggingface.co/datasets/Flux9665/BibleMMS)
+[We have also published a massively multilingual TTS dataset on Hugging Face🤗](https://huggingface.co/datasets/Flux9665/BibleMMS)
 
 --- 
 <br>
@@ -94,7 +92,7 @@ absolute).
 
 #### Pretrained Models
 
-You don't need to use pretrained models, but it can speed things up tremendously. They will be downloaded on the fly automatically when they are needed, thanks to Huggingface🤗 and [VB](https://github.com/Vaibhavs10) in particular.
+You don't need to use pretrained models, but it can speed things up tremendously. They will be downloaded on the fly automatically when they are needed, thanks to Hugging Face🤗 and [VB](https://github.com/Vaibhavs10) in particular.
 
 #### \[optional] eSpeak-NG
 

diff --git a/Recipes/AlignerPipeline.py b/Recipes/AlignerPipeline.py
@@ -1,14 +1,15 @@
 import torch
 from torch.utils.data import ConcatDataset
 
-from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner
-from Utility.corpus_preparation import prepare_aligner_corpus
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
-from Utility.storage_config import PREPROCESSING_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
+    from Modules.Aligner.autoaligner_train_loop import train_loop as train_aligner
+    from Utility.corpus_preparation import prepare_aligner_corpus
+    from Utility.storage_config import MODELS_DIR
+    from Utility.storage_config import PREPROCESSING_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:

diff --git a/Recipes/BigVGAN_combined.py b/Recipes/BigVGAN_combined.py
@@ -4,15 +4,16 @@
 import torch
 import wandb
 
-from Modules.Vocoder.BigVGAN import BigVGAN
-from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
-from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
-from Modules.Vocoder.HiFiGAN_train_loop import train_loop
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
+    from Modules.Vocoder.BigVGAN import BigVGAN
+    from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
+    from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
+    from Modules.Vocoder.HiFiGAN_train_loop import train_loop
+    from Utility.storage_config import MODELS_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:

diff --git a/Recipes/BigVGAN_e2e.py b/Recipes/BigVGAN_e2e.py
@@ -3,15 +3,16 @@
 import torch
 import wandb
 
-from Modules.Vocoder.BigVGAN import BigVGAN
-from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
-from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
-from Modules.Vocoder.HiFiGAN_train_loop import train_loop
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
+    from Modules.Vocoder.BigVGAN import BigVGAN
+    from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
+    from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
+    from Modules.Vocoder.HiFiGAN_train_loop import train_loop
+    from Utility.storage_config import MODELS_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:

diff --git a/Recipes/HiFiGAN_combined.py b/Recipes/HiFiGAN_combined.py
@@ -4,15 +4,16 @@
 import torch
 import wandb
 
-from Modules.Vocoder.HiFiGAN_Dataset import HiFiGANDataset
-from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
-from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
-from Modules.Vocoder.HiFiGAN_train_loop import train_loop
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
+    from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
+    from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
+    from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
+    from Modules.Vocoder.HiFiGAN_train_loop import train_loop
+    from Utility.storage_config import MODELS_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:

diff --git a/Recipes/HiFiGAN_e2e.py b/Recipes/HiFiGAN_e2e.py
@@ -3,15 +3,16 @@
 import torch
 import wandb
 
-from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
-from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
-from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
-from Modules.Vocoder.HiFiGAN_train_loop import train_loop
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, resume, model_dir, use_wandb, wandb_resume_id, gpu_count):
+    from Modules.Vocoder.HiFiGAN_Discriminators import AvocodoHiFiGANJointDiscriminator
+    from Modules.Vocoder.HiFiGAN_E2E_Dataset import HiFiGANDataset
+    from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
+    from Modules.Vocoder.HiFiGAN_train_loop import train_loop
+    from Utility.storage_config import MODELS_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:

diff --git a/Recipes/ToucanTTS_IntegrationTest.py b/Recipes/ToucanTTS_IntegrationTest.py
@@ -7,15 +7,18 @@
 import torch
 import wandb
 
-from Modules.ToucanTTS.ToucanTTS import ToucanTTS
-from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
-from Utility.corpus_preparation import prepare_tts_corpus
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
-from Utility.storage_config import PREPROCESSING_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
+    from torch.utils.data import ConcatDataset
+
+    from Modules.ToucanTTS.ToucanTTS import ToucanTTS
+    from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
+    from Utility.corpus_preparation import prepare_tts_corpus
+    from Utility.storage_config import MODELS_DIR
+    from Utility.storage_config import PREPROCESSING_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else:
@@ -78,4 +81,4 @@ def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb
                train_samplers=[train_sampler],
                gpu_count=gpu_count)
     if use_wandb:
-        wandb.finish()
+        wandb.finish()
diff --git a/Recipes/ToucanTTS_Massive_English_stage1.py b/Recipes/ToucanTTS_Massive_English_stage1.py
@@ -2,17 +2,19 @@
 
 import torch
 import wandb
-from torch.utils.data import ConcatDataset
 
-from Modules.ToucanTTS.ToucanTTS import ToucanTTS
-from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
-from Utility.corpus_preparation import prepare_tts_corpus
 from Utility.path_to_transcript_dicts import *
-from Utility.storage_config import MODELS_DIR
-from Utility.storage_config import PREPROCESSING_DIR
 
 
 def run(gpu_id, resume_checkpoint, finetune, model_dir, resume, use_wandb, wandb_resume_id, gpu_count):
+    from torch.utils.data import ConcatDataset
+
+    from Modules.ToucanTTS.ToucanTTS import ToucanTTS
+    from Modules.ToucanTTS.toucantts_train_loop_arbiter import train_loop
+    from Utility.corpus_preparation import prepare_tts_corpus
+    from Utility.storage_config import MODELS_DIR
+    from Utility.storage_config import PREPROCESSING_DIR
+
     if gpu_id == "cpu":
         device = torch.device("cpu")
     else: