
Commit

load asr pipeline only if needed
SWivid committed Oct 21, 2024
1 parent 795cb19 commit b899a35
Showing 3 changed files with 26 additions and 14 deletions.
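In outline, the commit replaces the eager module-level Whisper load in model/utils_infer.py with a lazy initializer that runs only when a transcription is actually requested (it also switches the checkpoint from openai/whisper-large-v3-turbo to openai/whisper-large). A condensed, self-contained sketch of that pattern; transcribe_if_needed and the plain "cuda" device default are illustrative, not code from the repository:

import torch
from transformers import pipeline

asr_pipe = None  # constructed on first use instead of at import time


def initialize_asr_pipeline(device="cuda"):
    global asr_pipe
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large",
        torch_dtype=torch.float16,
        device=device,
    )


def transcribe_if_needed(ref_audio, ref_text, device="cuda"):
    # Only touch the ASR model when no reference text was provided.
    if not ref_text.strip():
        if asr_pipe is None:
            initialize_asr_pipeline(device=device)
        ref_text = asr_pipe(ref_audio)["text"]  # simplified; the original call passes more arguments
    return ref_text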
api.py — 4 changes: 2 additions & 2 deletions
@@ -33,10 +33,10 @@ def __init__(
         )
 
         # Load models
-        self.load_vecoder_model(local_path)
+        self.load_vocoder_model(local_path)
         self.load_ema_model(model_type, ckpt_file, vocab_file, ode_method, use_ema)
 
-    def load_vecoder_model(self, local_path):
+    def load_vocoder_model(self, local_path):
         self.vocos = load_vocoder(local_path is not None, local_path, self.device)
 
     def load_ema_model(self, model_type, ckpt_file, vocab_file, ode_method, use_ema):
inference-cli.py — 4 changes: 2 additions & 2 deletions
@@ -104,7 +104,7 @@
     exp_name = "F5TTS_Base"
     ckpt_step = 1200000
     ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
-    # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
+    # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
 
 elif model == "E2-TTS":
     model_cls = UNetT
@@ -114,7 +114,7 @@
     exp_name = "E2TTS_Base"
     ckpt_step = 1200000
     ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
-    # ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
+    # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
 
 print(f"Using {model}...")
 ema_model = load_model(model_cls, model_cfg, ckpt_file, vocab_file)
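As the corrected comments note, the checkpoint can come either from the Hugging Face cache or from a local file. A small illustrative helper showing that choice; resolve_ckpt and its use_local flag are hypothetical and not part of inference-cli.py, while cached_path is the same utility used in the hunks above:

from cached_path import cached_path


def resolve_ckpt(repo_name, exp_name="F5TTS_Base", ckpt_step=1200000, use_local=False):
    # Hypothetical helper: pick the checkpoint source for a given experiment.
    if use_local:
        # Local checkpoints may be .pt or .safetensors, mirroring the commented-out line above.
        return f"ckpts/{exp_name}/model_{ckpt_step}.pt"
    # Otherwise fetch (and cache) the released checkpoint from the Hugging Face Hub.
    return str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))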
model/utils_infer.py — 32 changes: 22 additions & 10 deletions
@@ -22,13 +22,6 @@
 device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 print(f"Using {device} device")
 
-asr_pipe = pipeline(
-    "automatic-speech-recognition",
-    model="openai/whisper-large-v3-turbo",
-    torch_dtype=torch.float16,
-    device=device,
-)
-
 vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 
@@ -82,8 +75,6 @@ def chunk_text(text, max_chars=135):
 
 
 # load vocoder
-
-
 def load_vocoder(is_local=False, local_path="", device=device):
     if is_local:
         print(f"Load vocos from local path {local_path}")
@@ -97,6 +88,22 @@ def load_vocoder(is_local=False, local_path="", device=device):
     return vocos
 
 
+# load asr pipeline
+
+asr_pipe = None
+
+
+def initialize_asr_pipeline(device=device):
+    global asr_pipe
+
+    asr_pipe = pipeline(
+        "automatic-speech-recognition",
+        model="openai/whisper-large",
+        torch_dtype=torch.float16,
+        device=device,
+    )
+
+
 # load model for inference
 
 
@@ -133,7 +140,7 @@ def load_model(model_cls, model_cfg, ckpt_path, vocab_file="", ode_method="euler
 # preprocess reference audio and text
 
 
-def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
+def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print, device=device):
     show_info("Converting audio...")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
         aseg = AudioSegment.from_file(ref_audio_orig)
@@ -152,6 +159,9 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=print):
         ref_audio = f.name
 
     if not ref_text.strip():
+        global asr_pipe
+        if asr_pipe is None:
+            initialize_asr_pipeline(device=device)
         show_info("No reference text provided, transcribing reference audio...")
         ref_text = asr_pipe(
             ref_audio,
@@ -329,6 +339,8 @@ def infer_batch_process(
 
 
 # remove silence from generated wav
+
+
 def remove_silence_for_generated_wav(filename):
     aseg = AudioSegment.from_file(filename)
     non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
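Taken together, the utils_infer.py changes mean the Whisper pipeline is only constructed when preprocess_ref_audio_text is called without reference text. A minimal usage sketch, assuming the module is importable as model.utils_infer and that the function returns the processed audio path and text as in the surrounding code; "ref.wav" is a placeholder path:

from model.utils_infer import preprocess_ref_audio_text

# Reference text supplied: no ASR pipeline is ever built.
ref_audio, ref_text = preprocess_ref_audio_text("ref.wav", "Some reference text.")

# Empty reference text: initialize_asr_pipeline() runs once on first use, then the
# cached asr_pipe transcribes the reference audio for this and any later calls.
ref_audio, ref_text = preprocess_ref_audio_text("ref.wav", "", device="cuda")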
