diff --git a/requirements_webui.txt b/requirements_webui.txt
index 91189403..4b421a80 100644
--- a/requirements_webui.txt
+++ b/requirements_webui.txt
@@ -1,8 +1,9 @@
 # PyTorch and its dependencies
 # These libraries include PyTorch and its related packages, supporting CUDA 11.8.
-# torch==2.0.1
-# torchvision==0.15.2
-# torchaudio==2.0.2
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2
 
 # Installation source for PyTorch: -f https://download.pytorch.org/whl/cu118
 # Example installation command:
@@ -20,8 +21,9 @@ imageio[ffmpeg]
 omegaconf
 spaces
 moviepy
-librosa==0.8.1
-ultralytics
+librosa==0.10.2
+ultralytics # for wav2lipv2
+gradio==4.16.0
 
 # SadTalker related libraries
 numpy==1.23.4
@@ -35,11 +37,11 @@ kornia==0.6.8
 yacs==0.1.8
 joblib==1.1.0
 facexlib==0.3.0
-gradio==4.16.0
 scikit-image==0.19.3
 protobuf==3.20.2
 basicsr==1.4.2
 gfpgan==1.3.8
+matplotlib==3.7.5
 
 # MuseTalk related libraries
 diffusers==0.27.2
@@ -53,11 +55,10 @@ transformers==4.39.2
 # mim install "mmdet>=3.1.0"
 # mim install "mmpose>=1.1.0"
 
-# PaddleTTS related libraries
-paddlepaddle==2.5.2
-paddlespeech==1.4.1
-opencc==1.1.1
-matplotlib==3.8.4
+# # PaddleTTS related libraries
+# paddlepaddle==2.5.2
+# paddlespeech==1.4.1
+# opencc==1.1.1
 
 # ASR (Automatic Speech Recognition) related libraries
 openai
@@ -70,6 +71,7 @@ zhconv
 # LLM (Large Language Model) related libraries
 openai
 g4f
+curl_cffi
 grpcio-status==1.48.2
 google-generativeai
 google-api-python-client==2.126.0
@@ -94,4 +96,10 @@ PyYAML
 psutil
 jieba_fast
 jieba
-LangSegment
\ No newline at end of file
+LangSegment
+
+
+# CosyVoice related libraries
+conformer==0.3.2
+lightning==2.2.4
+wget==3.2
\ No newline at end of file
diff --git a/webui.py b/webui.py
index d9ba2c0b..2baa87d7 100644
--- a/webui.py
+++ b/webui.py
@@ -1,8 +1,9 @@
 import os
 import random
 import gradio as gr
+import numpy as np
 import time
-import torch
+import torch, torchaudio
 import gc
 import warnings
 warnings.filterwarnings('ignore')
@@ -31,37 +32,32 @@ def get_title(title = 'Linly 智能对话系统 (Linly-Talker)'):
     """
     return description
 
-
-# 设置默认system
-default_system = '你是一个很有帮助的助手'
-# 设置默认的prompt
-prefix_prompt = '''请用少于25个字回答以下问题\n\n'''
+# Default system and prompt settings
+DEFAULT_SYSTEM = '你是一个很有帮助的助手'
+PREFIX_PROMPT = '请用少于25个字回答以下问题\n\n'
+
+# Default parameters
+IMAGE_SIZE = 256
+PREPROCESS_TYPE = 'crop'
+FACERENDER = 'facevid2vid'
+ENHANCER = False
+IS_STILL_MODE = False
+EXP_WEIGHT = 1
+USE_REF_VIDEO = False
+REF_VIDEO = None
+REF_INFO = 'pose'
+USE_IDLE_MODE = False
+AUDIO_LENGTH = 5
 
 edgetts = EdgeTTS()
-# 设定默认参数值,可修改
-blink_every = True
-size_of_image = 256
-preprocess_type = 'crop'
-facerender = 'facevid2vid'
-enhancer = False
-is_still_mode = False
-exp_weight = 1
-use_ref_video = False
-ref_video = None
-ref_info = 'pose'
-use_idle_mode = False
-length_of_audio = 5
-
 @calculate_time
 def Asr(audio):
     try:
         question = asr.transcribe(audio)
         question = convert(question, 'zh-cn')
     except Exception as e:
-        print("ASR Error: ", e)
+        gr.Warning(f"ASR Error: {e}")
         question = 'Gradio存在一些bug,麦克风模式有时候可能音频还未传入,请重新点击一下语音识别即可'
-        gr.Warning(question)
     return question
 
 def clear_memory():
@@ -79,271 +75,185 @@ def clear_memory():
     print(f"Cached memory: {torch.cuda.memory_reserved() / (1024 ** 2):.2f} MB")
     print(f"Max cached memory: {torch.cuda.max_memory_reserved() / (1024 ** 2):.2f} MB")
 
+def generate_seed():
+    seed = random.randint(1, 100000000)
+    return {"__type__": "update", "value": seed}
+
+def set_all_random_seed(seed):
+    random.seed(seed)
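+    # also seed NumPy and PyTorch (CPU and all CUDA devices) below, so CosyVoice sampling is repeatable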
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+def change_instruction(mode):
+    return instruct_dict.get(mode, '未知模式')
+
+PROMPT_SR, TARGET_SR = 16000, 22050
+DEFAULT_DATA = np.zeros(TARGET_SR)
+
 @calculate_time
-def TTS_response(text, 
-                 voice, rate, volume, pitch, 
-                 am, voc, lang, male, 
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, 
-                 question_audio, question, use_mic_voice,
-                 tts_method = 'PaddleTTS', save_path = 'answer.wav'):
-    # print(text, voice, rate, volume, pitch, am, voc, lang, male, tts_method, save_path)
+def TTS_response(text, voice, rate, volume, pitch, am, voc, lang, male,
+                 ref_audio, prompt_text, prompt_language, text_language,
+                 cut_method, question_audio, question, use_mic_voice,
+                 mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                 tts_method='Edge-TTS', save_path='answer.wav'):
+    if text == '':
+        text = '请输入文字/问题'
     if tts_method == 'Edge-TTS':
         if not edgetts.network:
-            gr.Warning("请检查网络或者使用其他模型,例如PaddleTTS")
-            return None, None
+            gr.Warning("请检查网络或使用其他模型,例如PaddleTTS")
+            return None
         try:
-            edgetts.predict(text, voice, rate, volume, pitch , 'answer.wav', 'answer.vtt')
-        except:
-            os.system(f'edge-tts --text "{text}" --voice {voice} --write-media answer.wav --write-subtitles answer.vtt')
-        return 'answer.wav', 'answer.vtt'
-    elif tts_method == 'PaddleTTS':
-        tts.predict(text, am, voc, lang = lang, male=male, save_path = save_path)
-        return save_path, None
-    elif tts_method == 'GPT-SoVITS克隆声音':
-        if use_mic_voice:
-            try:
-                vits.predict(ref_wav_path = question_audio,
-                             prompt_text = question,
-                             prompt_language = "中文",
-                             text = text, # 回答
-                             text_language = "中文",
-                             how_to_cut = "凑四句一切",
-                             save_path = 'answer.wav')
-                return 'answer.wav', None
-            except Exception as e:
-                gr.Warning("无克隆环境或者无克隆模型权重,无法克隆声音", e)
-                return None, None
+            edgetts.predict(text, voice, rate, volume, pitch, save_path, 'answer.vtt')
+        except Exception as e:
+            os.system(f'edge-tts --text "{text}" --voice {voice} --write-media {save_path} --write-subtitles answer.vtt')
+        return save_path
+
+    if tts_method == 'PaddleTTS':
+        tts.predict(text, am, voc, lang=lang, male=male, save_path=save_path)
+        return save_path
+
+    if tts_method == 'GPT-SoVITS克隆声音':
+        try:
+            vits.predict(ref_wav_path=question_audio if use_mic_voice else ref_audio,
+                         prompt_text=question if use_mic_voice else prompt_text,
+                         prompt_language=prompt_language,
+                         text=text,
+                         text_language=text_language,
+                         how_to_cut=cut_method,
+                         save_path=save_path)
+            return save_path
+        except Exception as e:
+            gr.Warning(f"无克隆环境或模型权重,无法克隆声音:{e}")
+            return None
+    elif "CosyVoice" in tts_method:
+        if prompt_wav_upload is not None:
+            prompt_wav = prompt_wav_upload
+        elif prompt_wav_record is not None:
+            prompt_wav = prompt_wav_record
         else:
-            try:
-                vits.predict(ref_wav_path = inp_ref,
-                             prompt_text = prompt_text,
-                             prompt_language = prompt_language,
-                             text = text, # 回答
-                             text_language = text_language,
-                             how_to_cut = how_to_cut,
-                             save_path = 'answer.wav')
-                return 'answer.wav', None
-            except Exception as e:
-                gr.Warning("无克隆环境或者无克隆模型权重,无法克隆声音", e)
-                return None, None
-    return None, None
+            prompt_wav = None
+        if mode_checkbox_group in ['跨语种复刻']:
+            if prompt_wav is None:
+                gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
+                return (TARGET_SR, DEFAULT_DATA)
+            gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
+        # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meet requirements
+        if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
+            if prompt_wav is None:
+                gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
+                return (TARGET_SR, DEFAULT_DATA)
+            if torchaudio.info(prompt_wav).sample_rate < PROMPT_SR:
+                gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, PROMPT_SR))
+                return (TARGET_SR, DEFAULT_DATA)
+        # sft mode only use sft_dropdown
+        if mode_checkbox_group in ['预训练音色']:
+            if prompt_wav is not None or prompt_text_cv != '':
+                gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!')
+        # zero_shot mode only use prompt_wav prompt text
+        if mode_checkbox_group in ['3s极速复刻']:
+            if prompt_text_cv == '':
+                gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
+                return (TARGET_SR, DEFAULT_DATA)
+            # if instruct_text != '':
+            #     gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')
+
+        if mode_checkbox_group == '预训练音色':
+            set_all_random_seed(seed)
+            output = cosyvoice.predict_sft(text, sft_dropdown, speed_factor=speed_factor, save_path=save_path)
+        elif mode_checkbox_group == '3s极速复刻':
+            set_all_random_seed(seed)
+            output = cosyvoice.predict_zero_shot(text, prompt_text_cv, prompt_wav, speed_factor=speed_factor, save_path=save_path)
+        elif mode_checkbox_group == '跨语种复刻':
+            set_all_random_seed(seed)
+            output = cosyvoice.predict_cross_lingual(text, prompt_wav, speed_factor=speed_factor, save_path=save_path)
+        return output
+    else:
+        gr.Warning('未知模型')
+        return None
+
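+# NOTE: TTS_response returns an audio file path from the Edge-TTS / PaddleTTS /
+# GPT-SoVITS branches, while the CosyVoice branch returns whatever the
+# cosyvoice.predict_* helper yields and falls back to a (TARGET_SR, silence array)
+# tuple on invalid prompts, so callers must handle both shapes.
+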
+inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻']
+instruct_dict = {'预训练音色': '1. 选择预训练音色\n2. 点击生成音频按钮',
+                 '3s极速复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
+                 '跨语种复刻': '1. 选择prompt音频文件,或录入prompt音频,注意不超过30s,若同时提供,优先选择prompt音频文件\n2. 点击生成音频按钮',
+                 '自然语言控制': '1. 选择预训练音色\n2. 输入instruct文本\n3. 点击生成音频按钮'}
+
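+# '自然语言控制' is kept in instruct_dict for an instruct mode that is not yet
+# listed in inference_mode_list; change_instruction will resolve it once enabled.
+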
 @calculate_time
-def LLM_response(question_audio, question,
-                 voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0,
-                 am='fastspeech2', voc='pwgan',lang='zh', male=False, 
-                 inp_ref = None, prompt_text = "", prompt_language = "", text_language = "", how_to_cut = "", use_mic_voice = False,
-                 tts_method = 'Edge-TTS'):
+def LLM_response(
+    question_audio, question,  # 输入的音频和文本问题
+    voice, rate, volume, pitch,  # 语音合成参数
+    am, voc, lang, male,  # TTS 模型参数
+    ref_audio, prompt_text, prompt_language, text_language,  # 提示音频、文本及其语言设置
+    cut_method, use_mic_voice, mode_checkbox_group, sft_dropdown,  # 其他TTS选项
+    prompt_text_cv, prompt_wav_upload, prompt_wav_record,  # 提示信息和音频选项
+    seed, speed_factor,  # 随机种子和语速因子
+    tts_method='Edge-TTS'  # TTS 方法,默认使用 'Edge-TTS'
+):
     if len(question) == 0:
         gr.Warning("请输入问题")
         return None, None, None
-    answer = llm.generate(question, default_system)
-    print(answer)
-    driven_audio, driven_vtt = TTS_response(answer, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, question_audio, question, use_mic_voice,
-                 tts_method)
-    return driven_audio, driven_vtt, answer
-@calculate_time
-def Talker_response(question_audio = None, method = 'SadTalker', text = '',
-                    voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100, pitch = 0,
-                    am = 'fastspeech2', voc = 'pwgan', lang = 'zh', male = False,
-                    inp_ref = None, prompt_text = "", prompt_language = "", text_language = "", how_to_cut = "", use_mic_voice = False,
-                    tts_method = 'Edge-TTS',batch_size = 2, character = '女性角色',
-                    progress=gr.Progress(track_tqdm=True)):
-    default_voice = None
-    if character == '女性角色':
-        # 女性角色
-        source_image, pic_path = r'inputs/girl.png', r'inputs/girl.png'
-        crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
-        first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
-        crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
-        default_voice = 'zh-CN-XiaoxiaoNeural'
-    elif character == '男性角色':
-        # 男性角色
-        source_image = r'./inputs/boy.png'
-        pic_path = "./inputs/boy.png"
-        crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
-        first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
-        crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
-        default_voice = 'zh-CN-YunyangNeural'
-    else:
-        gr.Warning('未知角色')
-        return None
-
-    voice = default_voice if not voice else voice
-
-    if not voice:
-        gr.Warning('请选择声音')
-
-    driven_audio, driven_vtt, _ = LLM_response(question_audio, text,
-                                               voice, rate, volume, pitch,
-                                               am, voc, lang, male,
-                                               inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                               tts_method)
-    if driven_audio is None:
-        gr.Warning("音频没有正常生成,请检查TTS是否正确")
-        return None
-    if method == 'SadTalker':
-        pose_style = random.randint(0, 45)
-        video = talker.test(pic_path,
-                            crop_pic_path,
-                            first_coeff_path,
-                            crop_info,
-                            source_image,
-                            driven_audio,
-                            preprocess_type,
-                            is_still_mode,
-                            enhancer,
-                            batch_size,
-                            size_of_image,
-                            pose_style,
-                            facerender,
-                            exp_weight,
-                            use_ref_video,
-                            ref_video,
-                            ref_info,
-                            use_idle_mode,
-                            length_of_audio,
-                            blink_every,
-                            fps=20)
-    elif method == 'Wav2Lip':
-        video = talker.predict(crop_pic_path, driven_audio, batch_size, enhancer)
-    elif method == 'Wav2Lipv2':
-        video = talker.run(crop_pic_path, driven_audio, batch_size, enhancer)
-    elif method == 'NeRFTalk':
-        video = talker.predict(driven_audio)
-    else:
-        gr.Warning("不支持的方法:" + method)
-        return None
-    if driven_vtt:
-        return video, driven_vtt
-    else:
-        return video
+    # 生成回答
+    answer = llm.generate(question, DEFAULT_SYSTEM)
+    print("LLM 回复:", answer)
+
+    # 合成回答语音
+    tts_audio = TTS_response(
+        answer, voice, rate, volume, pitch, am, voc, lang, male,
+        ref_audio, prompt_text, prompt_language, text_language,
+        cut_method, question_audio, question, use_mic_voice,
+        mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload,
+        prompt_wav_record, seed, speed_factor, tts_method
+    )
+
+    # 生成VTT文件(如果TTS方法为'Edge-TTS')
+    tts_vtt = 'answer.vtt' if tts_method == 'Edge-TTS' else None
+
+    return tts_audio, tts_vtt, answer
 
 @calculate_time
-def Talker_response_img(question_audio, method, text, voice, rate, volume, pitch,
-                        am, voc, lang, male,
-                        inp_ref , prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                        tts_method,
-                        source_image,
-                        preprocess_type,
-                        is_still_mode,
-                        enhancer,
-                        batch_size,
-                        size_of_image,
-                        pose_style,
-                        facerender,
-                        exp_weight,
-                        blink_every,
-                        fps, progress=gr.Progress(track_tqdm=True)
-                        ):
+def Talker_response_img(question_audio, method, text, voice, rate, volume, pitch,
+                        am, voc, lang, male, inp_ref, prompt_text, prompt_language,
+                        text_language, how_to_cut, use_mic_voice,
+                        mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                        tts_method, source_image, preprocess_type, is_still_mode, enhancer,
+                        batch_size, size_of_image, pose_style, facerender,
+                        exp_weight, blink_every, fps, progress=gr.Progress(track_tqdm=True)):
+
     if enhancer:
-        gr.Warning("记得请先安装GFPGAN库,pip install gfpgan, 已安装可忽略")
+        gr.Warning("请先安装GFPGAN库 (pip install gfpgan),已安装可忽略")
+
     if not voice:
-        gr.Warning("请先选择声音")
-    driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch,
-                                               am, voc, lang, male,
-                                               inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                               tts_method = tts_method)
-    if driven_audio is None:
-        gr.Warning("音频没有正常生成,请检查TTS是否正确")
+        gr.Warning("请选择声音")
         return None
-    if method == 'SadTalker':
-        video = talker.test2(source_image,
-                             driven_audio,
-                             preprocess_type,
-                             is_still_mode,
-                             enhancer,
-                             batch_size,
-                             size_of_image,
-                             pose_style,
-                             facerender,
-                             exp_weight,
-                             use_ref_video,
-                             ref_video,
-                             ref_info,
-                             use_idle_mode,
-                             length_of_audio,
-                             blink_every,
-                             fps=fps)
-    elif method == 'Wav2Lip':
-        video = talker.predict(source_image, driven_audio, batch_size)
-    elif method == 'Wav2Lipv2':
-        video = talker.run(source_image, driven_audio, batch_size)
-    elif method == 'NeRFTalk':
-        video = talker.predict(driven_audio)
-    else:
-        return None
-    if driven_vtt:
-        return video, driven_vtt
-    else:
-        return video
+    driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch,
+                                               am, voc, lang, male, inp_ref, prompt_text, prompt_language,
+                                               text_language, how_to_cut, use_mic_voice,
+                                               mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor, tts_method)
 
-@calculate_time
-def Talker_Say(preprocess_type,
-               is_still_mode,
-               enhancer,
-               batch_size,
-               size_of_image,
-               pose_style,
-               facerender,
-               exp_weight,
-               blink_every,
-               fps,source_image = None, source_video = None, question_audio = None, method = 'SadTalker', text = '',
-               voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100, pitch = 0,
-               am = 'fastspeech2', voc = 'pwgan', lang = 'zh', male = False,
-               inp_ref = None, prompt_text = "", prompt_language = "", text_language = "", how_to_cut = "", use_mic_voice = False,
-               tts_method = 'Edge-TTS', character = '女性角色',
-               progress=gr.Progress(track_tqdm=True)):
-    if source_video:
-        source_image = source_video
-    default_voice = None
-
-    voice = default_voice if not voice else voice
-
-    if not voice:
-        gr.Warning('请选择声音')
-
-    driven_audio, driven_vtt = TTS_response(text, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, question_audio, text, use_mic_voice,
-                 tts_method)
     if driven_audio is None:
         gr.Warning("音频没有正常生成,请检查TTS是否正确")
         return None
+
+    # 视频生成
+    video = None
     if method == 'SadTalker':
-        pose_style = random.randint(0, 45)
-        video = talker.test2(source_image,
-                             driven_audio,
-                             preprocess_type,
-                             is_still_mode,
-                             enhancer,
-                             batch_size,
-                             size_of_image,
-                             pose_style,
-                             facerender,
-                             exp_weight,
-                             use_ref_video,
-                             ref_video,
-                             ref_info,
-                             use_idle_mode,
-                             length_of_audio,
-                             blink_every,
-                             fps=fps)
+        video = talker.test2(source_image, driven_audio, preprocess_type, is_still_mode, enhancer,
+                             batch_size, size_of_image, pose_style, facerender, exp_weight,
+                             USE_REF_VIDEO, REF_VIDEO, REF_INFO, USE_IDLE_MODE, AUDIO_LENGTH, blink_every,
+                             fps=fps)
    elif method == 'Wav2Lip':
-        video = talker.predict(source_image, driven_audio, batch_size, enhancer)
+        video = talker.predict(source_image, driven_audio, batch_size)
     elif method == 'Wav2Lipv2':
-        video = talker.run(crop_pic_path, driven_audio, batch_size, enhancer)
+        video = talker.run(source_image, driven_audio, batch_size)
     elif method == 'NeRFTalk':
         video = talker.predict(driven_audio)
     else:
         gr.Warning("不支持的方法:" + method)
         return None
-    if driven_vtt:
-        return video, driven_vtt
-    else:
-        return video
+
+    return (video, driven_vtt) if driven_vtt else video
 
 def chat_response(system, message, history):
     # response = llm.generate(message)
@@ -368,65 +278,51 @@ def clear_session():
 
 def human_response(source_image, history, question_audio, talker_method, voice, rate, volume, pitch,
-                   am, voc, lang, male,
-                   inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                   tts_method, character,
-                   preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
-                   pose_style, facerender, exp_weight, blink_every, fps = 20, progress=gr.Progress(track_tqdm=True)):
+                   am, voc, lang, male, inp_ref, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                   mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                   tts_method, character, preprocess_type, is_still_mode,
+                   enhancer, batch_size, size_of_image, pose_style, facerender, exp_weight,
+                   blink_every, fps=20, progress=gr.Progress(track_tqdm=True)):
     response = history[-1][1]
-    qusetion = history[-1][0]
-    # driven_audio, video_vtt = 'answer.wav', 'answer.vtt'
+    question = history[-1][0]
+
+    # 角色信息设置
     if character == '女性角色':
-        # 女性角色
-        source_image, pic_path = r'./inputs/girl.png', r"./inputs/girl.png"
-        crop_pic_path = "./inputs/first_frame_dir_girl/girl.png"
-        first_coeff_path = "./inputs/first_frame_dir_girl/girl.mat"
-        crop_info = ((403, 403), (19, 30, 502, 513), [40.05956541381802, 40.17324339233366, 443.7892505041507, 443.9029284826663])
+        source_image = pic_path = crop_pic_path = first_coeff_path = r'./inputs/girl.png'
+        crop_info = ((403, 403), (19, 30, 502, 513), [40.06, 40.17, 443.79, 443.90])
         default_voice = 'zh-CN-XiaoxiaoNeural'
     elif character == '男性角色':
-        # 男性角色
-        source_image = r'./inputs/boy.png'
-        pic_path = "./inputs/boy.png"
-        crop_pic_path = "./inputs/first_frame_dir_boy/boy.png"
-        first_coeff_path = "./inputs/first_frame_dir_boy/boy.mat"
-        crop_info = ((876, 747), (0, 0, 886, 838), [10.382158280494476, 0, 886, 747.7078990925525])
+        source_image = pic_path = crop_pic_path = first_coeff_path = r'./inputs/boy.png'
+        crop_info = ((876, 747), (0, 0, 886, 838), [10.38, 0, 886, 747.71])
         default_voice = 'zh-CN-YunyangNeural'
     elif character == '自定义角色':
         if source_image is None:
             gr.Error("自定义角色需要上传正确的图片")
             return None
         default_voice = 'zh-CN-XiaoxiaoNeural'
+    else:
+        gr.Error("未知角色")
+        return None
+
     voice = default_voice if not voice else voice
-    # tts.predict(response, voice, rate, volume, pitch, driven_audio, video_vtt)
-    driven_audio, driven_vtt = TTS_response(response, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, question_audio, qusetion, use_mic_voice,
-                 tts_method)
+
+    # TTS响应生成
+    driven_audio = TTS_response(response, voice, rate, volume, pitch, am, voc, lang, male,
+                                inp_ref, prompt_text, prompt_language, text_language,
+                                cut_method, question_audio, question, use_mic_voice,
+                                mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor, tts_method)
+    driven_vtt = 'answer.vtt' if tts_method == 'Edge-TTS' else None
     if driven_audio is None:
         gr.Warning("音频没有正常生成,请检查TTS是否正确")
         return None
+
+    # 视频生成
+    video = None
     if talker_method == 'SadTalker':
         pose_style = random.randint(0, 45)
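+        # a random pose style (0-45) keeps repeated answers from producing identical head motion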
-        video = talker.test(pic_path,
-                            crop_pic_path,
-                            first_coeff_path,
-                            crop_info,
-                            source_image,
-                            driven_audio,
-                            preprocess_type,
-                            is_still_mode,
-                            enhancer,
-                            batch_size,
-                            size_of_image,
-                            pose_style,
-                            facerender,
-                            exp_weight,
-                            use_ref_video,
-                            ref_video,
-                            ref_info,
-                            use_idle_mode,
-                            length_of_audio,
-                            blink_every,
+        video = talker.test2(source_image, driven_audio, preprocess_type, is_still_mode, enhancer,
+                             batch_size, size_of_image, pose_style, facerender, exp_weight,
+                             USE_REF_VIDEO, REF_VIDEO, REF_INFO, USE_IDLE_MODE, AUDIO_LENGTH, blink_every,
                             fps=fps)
     elif talker_method == 'Wav2Lip':
         video = talker.predict(crop_pic_path, driven_audio, batch_size, enhancer)
@@ -437,68 +333,61 @@ def human_response(source_image, history, question_audio, talker_method, voice,
     else:
         gr.Warning("不支持的方法:" + talker_method)
         return None
-    if driven_vtt:
-        return video, driven_vtt
-    else:
-        return video
+
+    return (video, driven_vtt) if driven_vtt else video
 
 @calculate_time
-def MuseTalker_response(source_video, bbox_shift, question_audio = None, text = '',
-                        voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100, pitch = 0,
-                        am = 'fastspeech2', voc = 'pwgan', lang = 'zh', male = False,
-                        inp_ref = None, prompt_text = "", prompt_language = "", text_language = "", how_to_cut = "", use_mic_voice = False,
-                        tts_method = 'Edge-TTS', batch_size = 4,
-                        progress=gr.Progress(track_tqdm=True)):
-    default_voice = None
+def MuseTalker_response(source_video, bbox_shift, question_audio, text, voice,
+                        rate, volume, pitch, am, voc, lang, male,
+                        ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                        mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                        tts_method='Edge-TTS', batch_size=4, progress=gr.Progress(track_tqdm=True)):
+    default_voice = None
     voice = default_voice if not voice else voice
-
+
     if not voice:
         gr.Warning('请选择声音')
-
-    driven_audio, driven_vtt, _ = LLM_response(question_audio, text,
-                                               voice, rate, volume, pitch,
-                                               am, voc, lang, male,
-                                               inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
+        return None
+
+    # LLM响应生成
+    driven_audio, driven_vtt, _ = LLM_response(question_audio, text, voice, rate, volume, pitch,
+                                               am, voc, lang, male, ref_audio, prompt_text, prompt_language,
+                                               text_language, cut_method, use_mic_voice,
+                                               mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
                                                tts_method)
-    print(driven_audio, driven_vtt)
-    video = musetalker.inference_noprepare(driven_audio,
-                                           source_video,
-                                           bbox_shift,
-                                           batch_size,
-                                           fps = 25)
-
-    if driven_vtt:
-        return (video, driven_vtt)
-    else:
-        return video
+
+    if driven_audio is None:
+        gr.Warning("音频没有正常生成,请检查TTS是否正确")
+        return None
+
+    # MuseTalker 视频生成
+    video = musetalker.inference_noprepare(driven_audio, source_video, bbox_shift, batch_size, fps=25)
+
+    return (video, driven_vtt) if driven_vtt else video
+
 
 GPT_SoVITS_ckpt = "GPT_SoVITS/pretrained_models"
 def load_vits_model(gpt_path, sovits_path, progress=gr.Progress(track_tqdm=True)):
     global vits
     print("模型加载中...", gpt_path, sovits_path)
-    all_gpt_path, all_sovits_path = os.path.join(GPT_SoVITS_ckpt, gpt_path), os.path.join(GPT_SoVITS_ckpt, sovits_path)
+    all_gpt_path = os.path.join(GPT_SoVITS_ckpt, gpt_path)
+    all_sovits_path = os.path.join(GPT_SoVITS_ckpt, sovits_path)
     vits.load_model(all_gpt_path, all_sovits_path)
     gr.Info("模型加载成功")
     return gpt_path, sovits_path
 
-def list_models(dir, endwith = ".pth"):
-    list_folder = os.listdir(dir)
-    list_folder = [i for i in list_folder if i.endswith(endwith)]
-    return list_folder
-
 def character_change(character):
     if character == '女性角色':
-        # 女性角色
-        source_image = r'./inputs/girl.png'
+        return r'./inputs/girl.png'
     elif character == '男性角色':
-        # 男性角色
-        source_image = r'./inputs/boy.png'
+        return r'./inputs/boy.png'
     elif character == '自定义角色':
-        # gr.Warnings("自定义角色暂未更新,请继续关注后续,可通过自由上传图片模式进行自定义角色")
-        source_image = None
-    return source_image
+        return None
+    else:
+        gr.Warning("不支持的角色类型:" + character)
+        return None
 
-def webui_setting(talk = False):
+def webui_setting(talk=False):
     if not talk:
         with gr.Tabs():
             with gr.TabItem('数字人形象设定'):
@@ -508,104 +397,83 @@ def webui_setting(talk = False):
         with gr.Tabs("TTS Method"):
             with gr.Accordion("TTS Method语音方法调节 ", open=True):
                 with gr.Tab("Edge-TTS"):
-                    voice = gr.Dropdown(edgetts.SUPPORTED_VOICE,
-                                        value='zh-CN-XiaoxiaoNeural',
-                                        label="Voice 声音选择")
-                    rate = gr.Slider(minimum=-100,
-                                     maximum=100,
-                                     value=0,
-                                     step=1.0,
-                                     label='Rate 速率')
-                    volume = gr.Slider(minimum=0,
-                                       maximum=100,
-                                       value=100,
-                                       step=1,
-                                       label='Volume 音量')
-                    pitch = gr.Slider(minimum=-100,
-                                      maximum=100,
-                                      value=0,
-                                      step=1,
-                                      label='Pitch 音调')
+                    voice = gr.Dropdown(edgetts.SUPPORTED_VOICE, value='zh-CN-XiaoxiaoNeural', label="Voice 声音选择")
+                    rate = gr.Slider(minimum=-100, maximum=100, value=0, step=1.0, label='Rate 速率')
+                    volume = gr.Slider(minimum=0, maximum=100, value=100, step=1, label='Volume 音量')
+                    pitch = gr.Slider(minimum=-100, maximum=100, value=0, step=1, label='Pitch 音调')
                 with gr.Tab("PaddleTTS"):
-                    am = gr.Dropdown(["FastSpeech2"], label="声学模型选择", value = 'FastSpeech2')
-                    voc = gr.Dropdown(["PWGan", "HifiGan"], label="声码器选择", value = 'PWGan')
-                    lang = gr.Dropdown(["zh", "en", "mix", "canton"], label="语言选择", value = 'zh')
+                    am = gr.Dropdown(["FastSpeech2"], label="声学模型选择", value='FastSpeech2')
+                    voc = gr.Dropdown(["PWGan", "HifiGan"], label="声码器选择", value='PWGan')
+                    lang = gr.Dropdown(["zh", "en", "mix", "canton"], label="语言选择", value='zh')
                     male = gr.Checkbox(label="男声(Male)", value=False)
                 with gr.Tab('GPT-SoVITS'):
                     with gr.Row():
-                        gpt_path = gr.FileExplorer(root = GPT_SoVITS_ckpt, glob = "*.ckpt", value = "s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", file_count='single', label="GPT模型路径")
-                        sovits_path = gr.FileExplorer(root = GPT_SoVITS_ckpt, glob = "*.pth", value = "s2G488k.pth", file_count='single', label="SoVITS模型路径")
-                        # gpt_path = gr.Dropdown(choices=list_models(GPT_SoVITS_ckpt, 'ckpt'))
-                        # sovits_path = gr.Dropdown(choices=list_models(GPT_SoVITS_ckpt, 'pth'))
-                        # gpt_path = gr.Textbox(label="GPT模型路径",
-                        #                       value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-                        # sovits_path = gr.Textbox(label="SoVITS模型路径",
-                        #                          value="GPT_SoVITS/pretrained_models/s2G488k.pth")
+                        gpt_path = gr.FileExplorer(root=GPT_SoVITS_ckpt, glob="*.ckpt", value="s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", file_count='single', label="GPT模型路径")
+                        sovits_path = gr.FileExplorer(root=GPT_SoVITS_ckpt, glob="*.pth", value="s2G488k.pth", file_count='single', label="SoVITS模型路径")
                     button = gr.Button("加载模型")
-                    button.click(fn = load_vits_model,
-                                 inputs=[gpt_path, sovits_path],
-                                 outputs=[gpt_path, sovits_path])
-
+                    button.click(fn=load_vits_model, inputs=[gpt_path, sovits_path], outputs=[gpt_path, sovits_path])
                     with gr.Row():
-                        inp_ref = gr.Audio(label="请上传3~10秒内参考音频,超过会报错!", sources=["microphone", "upload"], type="filepath")
+                        ref_audio = gr.Audio(label="请上传3~10秒内参考音频,超过会报错!", sources=["microphone", "upload"], type="filepath")
                         use_mic_voice = gr.Checkbox(label="使用语音问答的麦克风")
                         prompt_text = gr.Textbox(label="参考音频的文本", value="")
-                        prompt_language = gr.Dropdown(
-                            label="参考音频的语种", choices=["中文", "英文", "日文"], value="中文"
-                        )
+                        prompt_language = gr.Dropdown(label="参考音频的语种", choices=["中文", "英文", "日文"], value="中文")
                         asr_button = gr.Button("语音识别 - 克隆参考音频")
-                        asr_button.click(fn=Asr,inputs=[inp_ref],outputs=[prompt_text])
+                        asr_button.click(fn=Asr, inputs=[ref_audio], outputs=[prompt_text])
                     with gr.Row():
-                        text_language = gr.Dropdown(
-                            label="需要合成的语种", choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], value="中文"
-                        )
-
-                        how_to_cut = gr.Dropdown(
-                            label="怎么切",
-                            choices=["不切", "凑四句一切", "凑50字一切", "按中文句号。切", "按英文句号.切", "按标点符号切" ],
-                            value="凑四句一切",
-                            interactive=True,
-                        )
+                        text_language = gr.Dropdown(label="需要合成的语种", choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], value="中文")
+                        cut_method = gr.Dropdown(label="怎么切", choices=["不切", "凑四句一切", "凑50字一切", "按中文句号。切", "按英文句号.切", "按标点符号切"], value="凑四句一切", interactive=True)
 
-        with gr.Column(variant='panel'):
-            batch_size = gr.Slider(minimum=1,
-                                   maximum=10,
-                                   value=2,
-                                   step=1,
-                                   label='Talker Batch size')
-
-            character = gr.Radio(['女性角色',
-                                  '男性角色',
-                                  '自定义角色'],
-                                 label="角色选择", value='自定义角色')
-            character.change(fn = character_change, inputs=[character], outputs = [source_image])
-            tts_method = gr.Radio(['Edge-TTS', 'PaddleTTS', 'GPT-SoVITS克隆声音', 'Comming Soon!!!'], label="Text To Speech Method",
-                                  value = 'Edge-TTS')
-            tts_method.change(fn = tts_model_change, inputs=[tts_method], outputs = [tts_method])
-            asr_method = gr.Radio(choices = ['Whisper-tiny', 'Whisper-base', 'FunASR', 'Comming Soon!!!'], value='Whisper-base', label = '语音识别模型选择')
-            asr_method.change(fn = asr_model_change, inputs=[asr_method], outputs = [asr_method])
-            talker_method = gr.Radio(choices = ['SadTalker', 'Wav2Lip', 'Wav2Lipv2', 'NeRFTalk', 'Comming Soon!!!'],
-                                     value = 'SadTalker', label = '数字人模型选择')
-            talker_method.change(fn = talker_model_change, inputs=[talker_method], outputs = [talker_method])
-            llm_method = gr.Dropdown(choices = ['Qwen', 'Qwen2', 'Linly', 'Gemini', 'ChatGLM', 'ChatGPT', 'GPT4Free', '直接回复 Direct Reply', 'Comming Soon!!!'], value = '直接回复 Direct Reply', label = 'LLM 模型选择')
-            llm_method.change(fn = llm_model_change, inputs=[llm_method], outputs = [llm_method])
-    return (source_image, voice, rate, volume, pitch,
-            am, voc, lang, male,
-            inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-            tts_method, batch_size, character, talker_method, asr_method, llm_method)
-
-
-def exmaple_setting(asr, text, character, talk , tts, voice, llm):
+                with gr.Tab('CosyVoice'):
+                    # tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
+                    speed_factor = gr.Slider(minimum=0.25, maximum=4, step=0.05, label="语速调节", value=1.0, interactive=True)
+                    with gr.Row():
+                        mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
+                        instruction_text = gr.Text(label="操作步骤", lines=3, value=instruct_dict[inference_mode_list[0]], scale=0.5)
+                        sft_dropdown = gr.Dropdown(choices=['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女'], label='选择预训练音色', value="中文女", scale=0.25)
+                    with gr.Row():
+                        seed_button = gr.Button(value="\U0001F3B2")
+                        seed = gr.Number(value=0, label="随机推理种子")
+                    with gr.Row():
+                        prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
+                        prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
+                    prompt_text_cv = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='')
+                    # instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='')
+                    seed_button.click(generate_seed, inputs=[], outputs=seed)
+                    mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
+                    generate_button = gr.Button("生成音频")
+                    audio_output = gr.Audio(label="合成音频")
+
+        with gr.Column(variant='panel'):
+            batch_size = gr.Slider(minimum=1, maximum=10, value=2, step=1, label='Talker Batch size')
+            if not talk:
+                character = gr.Radio(['女性角色', '男性角色', '自定义角色'], label="角色选择", value='自定义角色')
+                character.change(fn=character_change, inputs=[character], outputs=[source_image])
+                talker_method = gr.Radio(choices=['SadTalker', 'Wav2Lip', 'Wav2Lipv2', 'NeRFTalk', 'Comming Soon!!!'], value='SadTalker', label='数字人模型选择')
+                talker_method.change(fn=talker_model_change, inputs=[talker_method], outputs=[talker_method])
+            else:
+                character = None
+                talker_method = None
+            tts_method = gr.Radio(['Edge-TTS', 'PaddleTTS', 'GPT-SoVITS克隆声音', 'CosyVoice-SFT模式', 'CosyVoice-克隆翻译模式', 'Comming Soon!!!'], label="Text To Speech Method", value='Edge-TTS')
+            tts_method.change(fn=tts_model_change, inputs=[tts_method], outputs=[tts_method])
+            asr_method = gr.Radio(choices=['Whisper-tiny', 'Whisper-base', 'FunASR', 'Comming Soon!!!'], value='Whisper-base', label='语音识别模型选择')
+            asr_method.change(fn=asr_model_change, inputs=[asr_method], outputs=[asr_method])
+            llm_method = gr.Dropdown(choices=['Qwen', 'Qwen2', 'Linly', 'Gemini', 'ChatGLM', 'ChatGPT', 'GPT4Free', '直接回复 Direct Reply', 'Comming Soon!!!'], value='直接回复 Direct Reply', label='LLM 模型选择')
+            llm_method.change(fn=llm_model_change, inputs=[llm_method], outputs=[llm_method])
+    return (source_image, voice, rate, volume, pitch, am, voc, lang, male,
+            ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice, tts_method,
+            batch_size, character, talker_method, asr_method, llm_method, generate_button, audio_output,
+            mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor)
+
+def exmaple_setting(asr, text, character, talk, tts, voice, llm):
     # 默认text的Example
-    examples =  [
+    examples = [
         ['Whisper-base', '应对压力最有效的方法是什么?', '女性角色', 'SadTalker', 'Edge-TTS', 'zh-CN-XiaoxiaoNeural', '直接回复 Direct Reply'],
         ['Whisper-tiny', '应对压力最有效的方法是什么?', '女性角色', 'SadTalker', 'PaddleTTS', 'None', '直接回复 Direct Reply'],
         ['Whisper-base', '应对压力最有效的方法是什么?', '女性角色', 'SadTalker', 'Edge-TTS', 'zh-CN-XiaoxiaoNeural', 'Qwen'],
-        ['FunASR', '如何进行时间管理?','男性角色', 'SadTalker', 'Edge-TTS', 'zh-CN-YunyangNeural', 'Qwen'],
-        ['Whisper-tiny', '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?','女性角色', 'Wav2Lip', 'PaddleTTS', 'None', 'Qwen'],
-        ['Whisper-tiny', '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?','女性角色', 'Wav2Lipv2', 'Edge-TTS', 'None', 'Qwen'],
-        ]
-
+        ['FunASR', '如何进行时间管理?', '男性角色', 'SadTalker', 'Edge-TTS', 'zh-CN-YunyangNeural', 'Qwen'],
+        ['Whisper-tiny', '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?', '女性角色', 'Wav2Lip', 'PaddleTTS', 'None', 'Qwen'],
+        ['Whisper-tiny', '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?', '女性角色', 'Wav2Lipv2', 'Edge-TTS', 'None', 'Qwen'],
+    ]
     with gr.Row(variant='panel'):
         with gr.Column(variant='panel'):
             gr.Markdown("## Test Examples")
@@ -613,502 +481,253 @@ def exmaple_setting(asr, text, character, talk , tts, voice, llm):
                 examples = examples,
                 inputs = [asr, text, character, talk , tts, voice, llm],
             )
-def app():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
-        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 文本/语音对话"))
-        with gr.Row(equal_height=False):
-            with gr.Column(variant='panel'):
-                (source_image, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                 tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-
-
-            with gr.Column(variant='panel'):
-                with gr.Tabs():
-                    with gr.TabItem('对话'):
-                        with gr.Group():
-                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
-                            input_text = gr.Textbox(label="输入文字/问题", lines=3)
-                            asr_text = gr.Button('语音识别(语音对话后点击)')
-                        asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])
-                # with gr.TabItem('SadTalker数字人参数设置'):
-                #     with gr.Accordion("Advanced Settings",
-                #                       open=False):
-                #         gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
-                #         with gr.Column(variant='panel'):
-                #             # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                #             # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
-                #             with gr.Row():
-                #                 pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
-                #                 exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
-                #                 blink_every = gr.Checkbox(label="use eye blink", value=True)
-                #             with gr.Row():
-                #                 size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster") #
-                #                 preprocess_type = gr.Radio(['crop', 'resize','full'], value='full', label='preprocess', info="How to handle input image?")
-                #             with gr.Row():
-                #                 is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
-                #                 facerender = gr.Radio(['facevid2vid'], value='facevid2vid', label='facerender', info="which face render?")
-                #             with gr.Row():
-                #                 # batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
-                #                 fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20)
-                #                 enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
-                with gr.Tabs():
-                    with gr.TabItem('数字人问答'):
-                        gen_video = gr.Video(label="生成视频", format="mp4", autoplay=False)
-                video_button = gr.Button("🎬 生成数字人视频", variant='primary')
-        video_button.click(fn=Talker_response,inputs=[question_audio, talker_method, input_text, voice, rate, volume, pitch,
-                                                      am, voc, lang, male,
-                                                      inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                                      tts_method, batch_size, character],outputs=[gen_video])
-        exmaple_setting(asr_method, input_text, character, talker_method, tts_method, voice, llm_method)
-    return inference
-
 def app_multi():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
+    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
+        # 显示标题
         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 多轮GPT对话"))
+
         with gr.Row():
             with gr.Column():
+                # 加载 Web UI 设置
                 (source_image, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                 tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-        video = gr.Video(label = '数字人问答', scale = 0.5)
-        video_button = gr.Button("🎬 生成数字人视频(对话后)", variant = 'primary')
+                 am, voc, lang, male,
+                 ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                 tts_method, batch_size, character, talker_method, asr_method, llm_method, generate_button, audio_output,
+                 mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor) = webui_setting()
+
+
+                # 数字人问答视频显示
+                video = gr.Video(label='数字人问答', scale=0.5)
+                video_button = gr.Button("🎬 生成数字人视频(对话后)", variant='primary')
 
             with gr.Column():
                 with gr.Tabs(elem_id="sadtalker_checkbox"):
                     with gr.TabItem('SadTalker数字人参数设置'):
-                        with gr.Accordion("Advanced Settings",
-                                          open=False):
-                            gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
+                        with gr.Accordion("Advanced Settings", open=False):
+                            gr.Markdown("SadTalker: need help? please visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for more details")
                            with gr.Column(variant='panel'):
-                                # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                                # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
+                                # 数字人参数设置
                                 with gr.Row():
-                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
-                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
+                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
+                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
                                     blink_every = gr.Checkbox(label="use eye blink", value=True)
-
                                 with gr.Row():
-                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster") #
+                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster")
                                     preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
-
                                 with gr.Row():
                                     is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
                                     facerender = gr.Radio(['facevid2vid'], value='facevid2vid', label='facerender', info="which face render?")
-
                                 with gr.Row():
-                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20)
-                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
+                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value=20)
+                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
+
+        # System 设定及清除历史对话
         with gr.Row():
             with gr.Column(scale=3):
-                system_input = gr.Textbox(value=default_system, lines=1, label='System (设定角色)')
+                system_input = gr.Textbox(value=DEFAULT_SYSTEM, lines=1, label='System (设定角色)')
             with gr.Column(scale=1):
                 modify_system = gr.Button("🛠️ 设置system并清除历史对话", scale=2)
-            system_state = gr.Textbox(value=default_system, visible=False)
-
+            system_state = gr.Textbox(value=DEFAULT_SYSTEM, visible=False)
+
+        # 聊天机器人界面
         chatbot = gr.Chatbot(height=400, show_copy_button=True)
+
+        # 语音输入及识别按钮
         with gr.Group():
             question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label='语音对话', autoplay=False)
-            asr_text = gr.Button('🎤 语音识别(语音对话后点击)')
+            asr_btn = gr.Button('🎤 语音识别(语音对话后点击)')
 
-        # 创建一个文本框组件,用于输入 prompt。
-        msg = gr.Textbox(label="Prompt/问题")
-        asr_text.click(fn=Asr,inputs=[question_audio],outputs=[msg])
+        # 文本输入框
+        msg = gr.Textbox(label="输入文字/问题", lines=3, placeholder='请输入文本或问题,同时可以设置LLM模型。默认使用直接回复。')
+        asr_btn.click(fn=Asr, inputs=[question_audio], outputs=[msg])
+
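+        # note: prompt_text below is passed both as the GPT-SoVITS clone prompt and in the
+        # question slot that TTS_response falls back to when use_mic_voice is enabled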
+        generate_button.click(fn=TTS_response,
+                              inputs=[msg, voice, rate, volume, pitch, am, voc, lang, male,
+                                      ref_audio, prompt_text, prompt_language, text_language,
+                                      cut_method, question_audio, prompt_text, use_mic_voice,
+                                      mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor, tts_method, ],
+                              outputs=[audio_output])
+
+        # 清除历史记录和提交按钮
         with gr.Row():
             clear_history = gr.Button("🧹 清除历史对话")
-            sumbit = gr.Button("🚀 发送", variant = 'primary')
+            submit = gr.Button("🚀 发送", variant='primary')
 
-        # 设置按钮的点击事件。当点击时,调用上面定义的 函数,并传入用户的消息和聊天历史记录,然后更新文本框和聊天机器人组件。
-        sumbit.click(chat_response, inputs=[system_input, msg, chatbot],
-                     outputs=[msg, chatbot])
-
-        # 点击后清空后端存储的聊天记录
-        clear_history.click(fn = clear_session, outputs = [msg, chatbot])
-
-        # 设置system并清除历史对话
-        modify_system.click(fn=modify_system_session,
-                            inputs=[system_input],
-                            outputs=[system_state, system_input, chatbot])
-
-        video_button.click(fn = human_response, inputs = [source_image, chatbot, question_audio, talker_method, voice, rate, volume, pitch,
+        # 设置按钮的点击事件
+        submit.click(chat_response, inputs=[system_input, msg, chatbot], outputs=[msg, chatbot])
+        clear_history.click(fn=clear_session, outputs=[msg, chatbot])
+        modify_system.click(fn=modify_system_session, inputs=[system_input], outputs=[system_state, system_input, chatbot])
+        video_button.click(fn=human_response, inputs=[source_image, chatbot, question_audio, talker_method, voice, rate, volume, pitch,
                                                am, voc, lang, male,
-                                               inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                               tts_method, character,preprocess_type,
+                                               ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                                               mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                                               tts_method, character, preprocess_type,
                                                is_still_mode, enhancer, batch_size, size_of_image,
-                                               pose_style, facerender, exp_weight, blink_every, fps], outputs = [video])
+                                               pose_style, facerender, exp_weight, blink_every, fps], outputs=[video])
+        # 示例设置
         exmaple_setting(asr_method, msg, character, talker_method, tts_method, voice, llm_method)
     return inference
 
 def app_img():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
+    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
+        # 显示标题
         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 个性化角色互动"))
+
         with gr.Row(equal_height=False):
-            with gr.Column(variant='panel'):
+            with gr.Column(variant='panel'):
+                # 加载 Web UI 设置
                 (source_image, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                 tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-
-            # driven_audio = 'answer.wav'
-            with gr.Column(variant='panel'):
+                 am, voc, lang, male,
+                 ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                 tts_method, batch_size, character, talker_method, asr_method, llm_method, generate_button, audio_output,
+                 mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor) = webui_setting()
+
+            with gr.Column(variant='panel'):
                 with gr.Tabs():
                     with gr.TabItem('对话'):
                         with gr.Group():
-                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
-                            input_text = gr.Textbox(label="输入文字/问题", lines=3)
-                            asr_text = gr.Button('语音识别(语音对话后点击)')
-                        asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])
+                            question_audio = gr.Audio(sources=['microphone', 'upload'], type="filepath", label='语音对话')
+                            input_text = gr.Textbox(label="输入文字/问题", lines=3, placeholder='请输入文本或问题,同时可以设置LLM模型。默认使用直接回复。')
+                            asr_btn = gr.Button('语音识别(语音对话后点击)')
+                        asr_btn.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
+                        generate_button.click(fn=TTS_response,
+                                              inputs=[input_text, voice, rate, volume, pitch, am, voc, lang, male,
+                                                      ref_audio, prompt_text, prompt_language, text_language,
+                                                      cut_method, question_audio, prompt_text, use_mic_voice,
+                                                      mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor, tts_method, ],
+                                              outputs=[audio_output])
                 with gr.Tabs(elem_id="text_examples"):
                     gr.Markdown("## Text Examples")
-                    examples =  [
+                    examples = [
                         ['应对压力最有效的方法是什么?'],
                         ['如何进行时间管理?'],
                        ['为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?'],
                     ]
-                    gr.Examples(
-                        examples = examples,
-                        inputs = [input_text],
-                    )
+                    gr.Examples(examples=examples, inputs=[input_text])
+
                 with gr.Tabs(elem_id="sadtalker_checkbox"):
                     with gr.TabItem('SadTalker数字人参数设置'):
-                        with gr.Accordion("Advanced Settings",
-                                          open=False):
-                            gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
+                        with gr.Accordion("Advanced Settings", open=False):
+                            gr.Markdown("SadTalker: need help? please visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for more details")
                            with gr.Column(variant='panel'):
-                                # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                                # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
                                 with gr.Row():
-                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
-                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
+                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)
+                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
                                     blink_every = gr.Checkbox(label="use eye blink", value=True)
-
                                 with gr.Row():
-                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster") #
-                                    preprocess_type = gr.Radio(['crop', 'resize','full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
-
+                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster")
+                                    preprocess_type = gr.Radio(['crop', 'resize', 'full', 'extcrop', 'extfull'], value='crop', label='preprocess', info="How to handle input image?")
                                 with gr.Row():
                                     is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
                                     facerender = gr.Radio(['facevid2vid'], value='facevid2vid', label='facerender', info="which face render?")
-
                                 with gr.Row():
-                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20)
-                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
-
+                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value=20)
+                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
+
                 with gr.Tabs(elem_id="sadtalker_genearted"):
                     gen_video = gr.Video(label="数字人视频", format="mp4")
 
                 submit = gr.Button('🎬 生成数字人视频', elem_id="sadtalker_generate", variant='primary')
-                submit.click(
-                    fn=Talker_response_img,
-                    inputs=[question_audio,
-                            talker_method,
-                            input_text,
-                            voice, rate, volume, pitch,
-                            am, voc, lang, male,
-                            inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                            tts_method,
-                            source_image,
-                            preprocess_type,
-                            is_still_mode,
-                            enhancer,
-                            batch_size,
-                            size_of_image,
-                            pose_style,
-                            facerender,
-                            exp_weight,
-                            blink_every,
-                            fps],
-                    outputs=[gen_video]
+                submit.click(
+                    fn=Talker_response_img,
+                    inputs=[question_audio, talker_method, input_text, voice, rate, volume, pitch,
+                            am, voc, lang, male, ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                            mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                            tts_method, source_image, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
+                            pose_style, facerender, exp_weight, blink_every, fps],
+                    outputs=[gen_video]
                 )
 
         with gr.Row():
             examples = [
-                [
-                    'examples/source_image/full_body_2.png', 'SadTalker',
-                    'crop',
-                    False,
-                    False
-                ],
-                [
-                    'examples/source_image/full_body_1.png', 'Wav2Lipv2',
-                    'full',
-                    False,
-                    False
-                ],
-                [
-                    'examples/source_image/full_body_1.png', 'Wav2Lip',
-                    'full',
-                    True,
-                    False
-                ],
-                [
-                    'examples/source_image/full_body_1.png', 'SadTalker',
-                    'full',
-                    True,
-                    False
-                ],
-                [
-                    'examples/source_image/full4.jpeg', 'SadTalker',
-                    'crop',
-                    False,
-                    True
-                ],
+                ['examples/source_image/full_body_2.png', 'SadTalker', 'crop', False, False],
+                ['examples/source_image/full_body_1.png', 'Wav2Lipv2', 'full', False, False],
+                ['examples/source_image/full_body_2.png', 'Wav2Lipv2', 'full', False, False],
+                ['examples/source_image/full_body_1.png', 'Wav2Lip', 'full', True, False],
+                ['examples/source_image/full_body_1.png', 'SadTalker', 'full', True, False],
+                ['examples/source_image/full4.jpeg', 'SadTalker', 'crop', False, True],
             ]
-            gr.Examples(examples=examples,
-                        inputs=[
-                            source_image, talker_method,
-                            preprocess_type,
-                            is_still_mode,
-                            enhancer],
-                        outputs=[gen_video],
-                        # cache_examples=True,
-                        )
+            gr.Examples(
+                examples=examples,
+                inputs=[source_image, talker_method, preprocess_type, is_still_mode, enhancer],
+                outputs=[gen_video],
+                # cache_examples=True,
+            )
     return inference
 
-def app_vits():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
-        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 语音克隆"))
-        with gr.Row(equal_height=False):
-            with gr.Column(variant='panel'):
-                (source_image, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                 tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-            with gr.Column(variant='panel'):
-                with gr.Tabs():
-                    with gr.TabItem('对话'):
-                        with gr.Group():
-                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
-                            input_text = gr.Textbox(label="输入文字/问题", lines=3)
-                            asr_text = gr.Button('语音识别(语音对话后点击)')
-                        asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])
-                with gr.Tabs():
-                    with gr.TabItem('数字人问答'):
-                        gen_video = gr.Video(label="数字人视频", format="mp4", autoplay=False)
-                video_button = gr.Button("🎬 生成数字人视频", variant='primary')
-        video_button.click(fn=Talker_response,inputs=[question_audio, talker_method, input_text, voice, rate, volume, pitch, am, voc, lang, male,
-                                                      inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                                      tts_method, batch_size, character],outputs=[gen_video])
-        exmaple_setting(asr_method, input_text, character, talker_method, tts_method, voice, llm_method)
-    return inference
-
-def app_talk():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
-        gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) 数字人播报"))
-        with gr.Row(equal_height=False):
-            with gr.Column(variant='panel'):
-                with gr.Tabs():
-                    with gr.Tab("图片人物"):
-                        source_image = gr.Image(label='Source image', type = 'filepath')
-
-                    with gr.Tab("视频人物"):
-                        source_video = gr.Video(label="Source video")
-
-                (_, voice, rate, volume, pitch,
-                 am, voc, lang, male,
-                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                 tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-
-            with gr.Column(variant='panel'):
-                with gr.Tabs():
-                    with gr.TabItem('对话'):
-                        with gr.Group():
-                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
-                            input_text = gr.Textbox(label="输入文字/问题", lines=3)
-                            asr_text = gr.Button('语音识别(语音对话后点击)')
-                        asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])
-                with gr.Tabs():
-                    with gr.TabItem('SadTalker数字人参数设置'):
-                        with gr.Accordion("Advanced Settings",
-                                          open=False):
-                            gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
-                            with gr.Column(variant='panel'):
-                                # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                                # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
-                                with gr.Row():
-                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
-                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
-                                    blink_every = gr.Checkbox(label="use eye blink", value=True)
-
-                                with gr.Row():
-                                    size_of_image = gr.Radio([256, 512], value=256, label='face model resolution', info="use 256/512 model? 256 is faster") #
-                                    preprocess_type = gr.Radio(['crop', 'resize','full'], value='full', label='preprocess', info="How to handle input image?")
-
-                                with gr.Row():
-                                    is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion, works with preprocess `full`)")
-                                    facerender = gr.Radio(['facevid2vid'], value='facevid2vid', label='facerender', info="which face render?")
-
-                                with gr.Row():
-                                    # batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
-                                    fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20)
-                                    enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")
-
-                with gr.Tabs():
-                    gen_video = gr.Video(label="数字人视频", format="mp4")
-
-                video_button = gr.Button('🎬 生成数字人视频', elem_id="sadtalker_generate", variant='primary')
-
-        video_button.click(fn=Talker_Say,inputs=[preprocess_type, is_still_mode, enhancer, batch_size, size_of_image,
-                                                 pose_style, facerender, exp_weight, blink_every, fps,
-                                                 source_image, source_video, question_audio, talker_method, input_text, voice, rate, volume, pitch, am, voc, lang, male,
-                                                 inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                                 tts_method, character],outputs=[gen_video])
-
-        with gr.Row():
-            with gr.Column(variant='panel'):
-                gr.Markdown("## Test Examples")
-                gr.Examples(
-                    examples = [
-                        [
-                            'examples/source_image/full_body_2.png',
-                            '应对压力最有效的方法是什么?',
-                        ],
-                        [
-                            'examples/source_image/full_body_1.png',
-                            '如何进行时间管理?',
-                        ],
-                        [
-                            'examples/source_image/full3.png',
-                            '为什么有些人选择使用纸质地图或寻求方向,而不是依赖GPS设备或智能手机应用程序?',
-                        ],
-                    ],
-                    fn = Talker_Say,
-                    inputs = [source_image, input_text],
-                )
-    return inference
-
 def load_musetalk_model():
-    gr.Warning("若显存不足,可能会导致模型加载失败,可以尝试使用其他摸型或者换其他设备尝试。")
+    """加载MuseTalk模型,显示加载状态和结果信息。"""
+    gr.Warning("若显存不足,可能会导致模型加载失败,可以尝试使用其他模型或者换其他设备。")
     gr.Info("MuseTalk模型导入中...")
     musetalker.init_model()
     gr.Info("MuseTalk模型导入成功")
     return "MuseTalk模型导入成功"
 
+
 def musetalk_prepare_material(source_video, bbox_shift):
+    """准备MuseTalk所需的素材,检查模型是否已加载。"""
     if musetalker.load is False:
         gr.Warning("请先加载MuseTalk模型后重新上传文件")
         return source_video, None
     return musetalker.prepare_material(source_video, bbox_shift)
 def app_muse():
-    with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
+    """定义MuseTalk应用的UI和逻辑。"""
+    with gr.Blocks(analytics_enabled=False, title='Linly-Talker') as inference:
         gr.HTML(get_title("Linly 智能对话系统 (Linly-Talker) MuseTalker数字人实时对话"))
+
+        # 上传参考视频和调整bbox_shift
         with gr.Row(equal_height=False):
-            with gr.Column(variant='panel'): 
+            with gr.Column(variant='panel'):
                 with gr.TabItem('MuseV Video'):
-                    gr.Markdown("MuseV: need help? please visit MuseVDemo to generate Video https://huggingface.co/spaces/AnchorFake/MuseVDemo")
-                    with gr.Row():
-                        source_video = gr.Video(label="Reference Video",sources=['upload'])
-                    gr.Markdown("BBox_shift 推荐值下限,在生成初始结果后生成相应的 bbox 范围。如果结果不理想,可以根据该参考值进行调整。\n一般来说,在我们的实验观察中,我们发现正值(向下半部分移动)通常会增加嘴巴的张开度,而负值(向上半部分移动)通常会减少嘴巴的张开度。然而,需要注意的是,这并不是绝对的规则,用户可能需要根据他们的具体需求和期望效果来调整该参数。")
-                    with gr.Row():
-                        bbox_shift = gr.Number(label="BBox_shift value, px", value=0)
-                        bbox_shift_scale = gr.Textbox(label="bbox_shift_scale",
-                                                      value="",interactive=False)
+                    gr.Markdown("MuseV: 需要帮助?请访问 [MuseVDemo](https://huggingface.co/spaces/AnchorFake/MuseVDemo) 生成视频。")
+                    source_video = gr.Video(label="Reference Video", sources=['upload'])
+                    gr.Markdown(
+                        "BBox_shift 推荐值下限,在生成初始结果后生成相应的 bbox 范围。"
+                        "一般来说,正值(向下半部分移动)通常会增加嘴巴的张开度,"
+                        "而负值(向上半部分移动)通常会减少嘴巴的张开度。"
+                        "用户可根据具体需求调整此参数。"
+                    )
+                    bbox_shift = gr.Number(label="BBox_shift value, px", value=0)
+                    bbox_shift_scale = gr.Textbox(label="bbox_shift_scale", value="", interactive=False)
+
+                # 加载MuseTalk模型按钮
                 load_musetalk = gr.Button("加载MuseTalk模型(传入视频前先加载)", variant='primary')
                 load_musetalk.click(fn=load_musetalk_model, outputs=bbox_shift_scale)
-                # (_, voice, rate, volume, pitch,
-                #  am, voc, lang, male,
-                #  inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                #  tts_method, batch_size, character, talker_method, asr_method, llm_method)= webui_setting()
-                with gr.Tabs("TTS Method"):
-                    with gr.Accordion("TTS Method语音方法调节 ", open=True):
-                        with gr.Tab("Edge-TTS"):
-                            voice = gr.Dropdown(edgetts.SUPPORTED_VOICE,
-                                                value='zh-CN-XiaoxiaoNeural',
-                                                label="Voice 声音选择")
-                            rate = gr.Slider(minimum=-100,
-                                             maximum=100,
-                                             value=0,
-                                             step=1.0,
-                                             label='Rate 速率')
-                            volume = gr.Slider(minimum=0,
-                                               maximum=100,
-                                               value=100,
-                                               step=1,
-                                               label='Volume 音量')
-                            pitch = gr.Slider(minimum=-100,
-                                              maximum=100,
-                                              value=0,
-                                              step=1,
-                                              label='Pitch 音调')
-                        with gr.Tab("PaddleTTS"):
-                            am = gr.Dropdown(["FastSpeech2"], label="声学模型选择", value = 'FastSpeech2')
-                            voc = gr.Dropdown(["PWGan", "HifiGan"], label="声码器选择", value = 'PWGan')
-                            lang = gr.Dropdown(["zh", "en", "mix", "canton"], label="语言选择", value = 'zh')
-                            male = gr.Checkbox(label="男声(Male)", value=False)
-                        with gr.Tab('GPT-SoVITS'):
-                            with gr.Row():
-                                gpt_path = gr.FileExplorer(root = GPT_SoVITS_ckpt, glob = "*.ckpt", value = "s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt", file_count='single', label="GPT模型路径")
-                                sovits_path = gr.FileExplorer(root = GPT_SoVITS_ckpt, glob = "*.pth", value = "s2G488k.pth", file_count='single', label="SoVITS模型路径")
-                                # gpt_path = gr.Dropdown(choices=list_models(GPT_SoVITS_ckpt, 'ckpt'))
-                                # sovits_path = gr.Dropdown(choices=list_models(GPT_SoVITS_ckpt, 'pth'))
-                                # gpt_path = gr.Textbox(label="GPT模型路径",
-                                #                       value="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-                                # sovits_path = gr.Textbox(label="SoVITS模型路径",
-                                #                          value="GPT_SoVITS/pretrained_models/s2G488k.pth")
-                                button = gr.Button("加载模型")
-                                button.click(fn = load_vits_model,
-                                             inputs=[gpt_path, sovits_path],
-                                             outputs=[gpt_path, sovits_path])
-
-                            with gr.Row():
-                                inp_ref = gr.Audio(label="请上传3~10秒内参考音频,超过会报错!", sources=["microphone", "upload"], type="filepath")
-                                use_mic_voice = gr.Checkbox(label="使用语音问答的麦克风")
-                                prompt_text = gr.Textbox(label="参考音频的文本", value="")
-                                prompt_language = gr.Dropdown(
-                                    label="参考音频的语种", choices=["中文", "英文", "日文"], value="中文"
-                                )
-                                asr_button = gr.Button("语音识别 - 克隆参考音频")
-                                asr_button.click(fn=Asr,inputs=[inp_ref],outputs=[prompt_text])
-                            with gr.Row():
-                                text_language = gr.Dropdown(
-                                    label="需要合成的语种", choices=["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"], value="中文"
-                                )
-
-                                how_to_cut = gr.Dropdown(
-                                    label="怎么切",
-                                    choices=["不切", "凑四句一切", "凑50字一切", "按中文句号。切", "按英文句号.切", "按标点符号切" ],
-                                    value="凑四句一切",
-                                    interactive=True,
-                                )
-
-                with gr.Column(variant='panel'):
-                    batch_size = gr.Slider(minimum=1,
-                                           maximum=10,
-                                           value=2,
-                                           step=1,
-                                           label='Talker Batch size')
-
-                tts_method = gr.Radio(['Edge-TTS', 'PaddleTTS', 'GPT-SoVITS克隆声音', 'Comming Soon!!!'], label="Text To Speech Method",
-                                      value = 'Edge-TTS')
-                tts_method.change(fn = tts_model_change, inputs=[tts_method], outputs = [tts_method])
-                asr_method = gr.Radio(choices = ['Whisper-tiny', 'Whisper-base', 'FunASR', 'Comming Soon!!!'], value='Whisper-base', label = '语音识别模型选择')
-                asr_method.change(fn = asr_model_change, inputs=[asr_method], outputs = [asr_method])
-                llm_method = gr.Dropdown(choices = ['Qwen', 'Qwen2', 'Linly', 'Gemini', 'ChatGLM', 'ChatGPT', 'GPT4Free', '直接回复 Direct Reply', 'Comming Soon!!!'], value = '直接回复 Direct Reply', label = 'LLM 模型选择')
-                llm_method.change(fn = llm_model_change, inputs=[llm_method], outputs = [llm_method])
+                # 加载 Web UI 设置
+                (_, voice, rate, volume, pitch,
+                 am, voc, lang, male,
+                 ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                 tts_method, batch_size, _, _, asr_method, llm_method, generate_button, audio_output,
+                 mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor) = webui_setting(talk=True)
 
+                # 处理source_video变化
                 source_video.change(fn=musetalk_prepare_material, inputs=[source_video, bbox_shift], outputs=[source_video, bbox_shift_scale])
-
+
+            # 问题输入和ASR识别
             with gr.Column(variant='panel'):
                 with gr.Tabs():
                     with gr.TabItem('对话'):
                         with gr.Group():
-                            question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
-                            input_text = gr.Textbox(label="输入文字/问题", lines=3)
-                            asr_text = gr.Button('语音识别(语音对话后点击)')
-                            asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])
-
+                            question_audio = gr.Audio(sources=['microphone', 'upload'], type="filepath", label='语音对话')
+                            input_text = gr.Textbox(label="输入文字/问题", lines=3, placeholder='请输入文本或问题,同时可以设置LLM模型。默认使用直接回复。')
+                            asr_btn = gr.Button('语音识别(语音对话后点击)')
+                            asr_btn.click(fn=Asr, inputs=[question_audio], outputs=[input_text])
+                            generate_button.click(fn=TTS_response,
+                                                  inputs=[input_text, voice, rate, volume, pitch, am, voc, lang, male,
+                                                          ref_audio, prompt_text, prompt_language, text_language,
+                                                          cut_method, question_audio, prompt_text, use_mic_voice,
+                                                          mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor, tts_method],
+                                                  outputs=[audio_output])
+
+                    # 生成MuseTalk视频
                     with gr.TabItem("MuseTalk Video"):
                         gen_video = gr.Video(label="数字人视频", format="mp4")
                         submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
-                        examples = [os.path.join('Musetalk/data/video', video) for video in os.listdir("Musetalk/data/video")]
-                        # ['Musetalk/data/video/yongen_musev.mp4', 'Musetalk/data/video/musk_musev.mp4', 'Musetalk/data/video/monalisa_musev.mp4', 'Musetalk/data/video/sun_musev.mp4', 'Musetalk/data/video/seaside4_musev.mp4', 'Musetalk/data/video/sit_musev.mp4', 'Musetalk/data/video/man_musev.mp4']
+                        # examples = [os.path.join('Musetalk/data/video', video) for video in os.listdir("Musetalk/data/video")]
 
                         gr.Markdown("## MuseV Video Examples")
                         gr.Examples(
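Note that `generate_button.click` above binds its `inputs` list to `TTS_response`'s parameters purely by position, so component order must track the function signature exactly (here `prompt_text` appears twice, feeding both the `prompt_text` and `question` slots). With lists this long, a small arity check can catch drift early; a hypothetical helper, not part of the patch:

```python
import inspect

def check_click_arity(fn, inputs, n_trailing_defaults=0):
    """Assert that a Gradio `inputs` list covers fn's positional parameters,
    ignoring `n_trailing_defaults` parameters that have default values."""
    params = [p for p in inspect.signature(fn).parameters.values()
              if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)]
    expected = len(params) - n_trailing_defaults
    assert len(inputs) >= expected, (
        f'{fn.__name__} expects at least {expected} inputs, got {len(inputs)}')

def tts_stub(text, voice, tts_method='Edge-TTS', save_path='answer.wav'):
    return save_path

# Two components cover the two required parameters; the defaults are skipped.
check_click_arity(tts_stub, ['textbox', 'dropdown'], n_trailing_defaults=2)
```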
@@ -1120,271 +739,254 @@ def app_muse():
                                 ['Musetalk/data/video/seaside4_musev.mp4', 5],
                                 ['Musetalk/data/video/sit_musev.mp4', 5],
                                 ['Musetalk/data/video/man_musev.mp4', 5]
-                                ],
-                            inputs =[source_video, bbox_shift],
+                            ],
+                            inputs=[source_video, bbox_shift],
                         )
 
+                        # 提交按钮点击事件
                         submit.click(
                             fn=MuseTalker_response,
-                            inputs=[source_video, bbox_shift, question_audio, input_text, voice, rate, volume, pitch, am, voc, lang, male,
-                                    inp_ref, prompt_text, prompt_language, text_language, how_to_cut, use_mic_voice,
-                                    tts_method, batch_size],
+                            inputs=[
+                                source_video, bbox_shift, question_audio, input_text,
+                                voice, rate, volume, pitch, am, voc, lang, male,
+                                ref_audio, prompt_text, prompt_language, text_language, cut_method, use_mic_voice,
+                                mode_checkbox_group, sft_dropdown, prompt_text_cv, prompt_wav_upload, prompt_wav_record, seed, speed_factor,
+                                tts_method, batch_size
+                            ],
                             outputs=[gen_video]
-                            )
+                        )
+
     return inference
 
 
 def asr_model_change(model_name, progress=gr.Progress(track_tqdm=True)):
+    """根据选择的模型名称更换ASR模型。"""
     global asr
+    clear_memory()  # 清理显存
 
-    # 清理显存,在加载新的模型之前释放不必要的显存
-    clear_memory()
-
-    if model_name == "Whisper-tiny":
-        try:
-            if os.path.exists('Whisper/tiny.pt'):
-                asr = WhisperASR('Whisper/tiny.pt')
-            else:
-                asr = WhisperASR('tiny')
+    try:
+        if model_name == "Whisper-tiny":
+            asr_path = 'Whisper/tiny.pt' if os.path.exists('Whisper/tiny.pt') else 'tiny'
+            asr = WhisperASR(asr_path)
             gr.Info("Whisper-tiny模型导入成功")
-        except Exception as e:
-            gr.Warning(f"Whisper-tiny模型下载失败 {e}")
-    elif model_name == "Whisper-base":
-        try:
-            if os.path.exists('Whisper/base.pt'):
-                asr = WhisperASR('Whisper/base.pt')
-            else:
-                asr = WhisperASR('base')
+        elif model_name == "Whisper-base":
+            asr_path = 'Whisper/base.pt' if os.path.exists('Whisper/base.pt') else 'base'
+            asr = WhisperASR(asr_path)
             gr.Info("Whisper-base模型导入成功")
-        except Exception as e:
-            gr.Warning(f"Whisper-base模型下载失败 {e}")
-    elif model_name == 'FunASR':
-        try:
+        elif model_name == 'FunASR':
             from ASR import FunASR
             asr = FunASR()
             gr.Info("FunASR模型导入成功")
-        except Exception as e:
-            gr.Warning(f"FunASR模型下载失败 {e}")
-    else:
-        gr.Warning("未知ASR模型,可提issue和PR 或者 建议更新模型")
+        else:
+            gr.Warning("未知ASR模型,可提issue和PR 或者 建议更新模型")
+    except Exception as e:
+        gr.Warning(f"{model_name}模型加载失败: {e}")
+
     return model_name
 
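The refactor above also collapses the duplicated `if os.path.exists(...)` branches into a one-line local-checkpoint fallback. The same logic, extracted into a stand-alone helper for illustration (hypothetical; the patch inlines it per branch):

```python
import os

def resolve_whisper_checkpoint(size: str, local_dir: str = 'Whisper') -> str:
    """Prefer a locally downloaded checkpoint such as Whisper/tiny.pt;
    otherwise return the bare model name so the library downloads it."""
    local_path = os.path.join(local_dir, f'{size}.pt')
    return local_path if os.path.exists(local_path) else size

print(resolve_whisper_checkpoint('tiny'))   # 'Whisper/tiny.pt' or 'tiny'
print(resolve_whisper_checkpoint('base'))   # 'Whisper/base.pt' or 'base'
```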
 def llm_model_change(model_name, progress=gr.Progress(track_tqdm=True)):
+    """更换LLM模型,并根据选择的模型加载相应资源。"""
     global llm
-    gemini_apikey = ""
-    openai_apikey = ""
-    proxy_url = None
+    gemini_apikey = ""  # Gemini模型的API密钥
+    openai_apikey = ""  # OpenAI的API密钥
+    proxy_url = None  # 代理URL
 
-    # 清理显存,在加载新的模型之前释放不必要的显存
+    # 清理显存,释放不必要的显存以便加载新模型
     clear_memory()
 
-    if model_name == 'Linly':
-        try:
-            llm = llm_class.init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf', prefix_prompt=prefix_prompt)
+    try:
+        if model_name == 'Linly':
+            llm = llm_class.init_model('Linly', 'Linly-AI/Chinese-LLaMA-2-7B-hf', prefix_prompt=PREFIX_PROMPT)
             gr.Info("Linly模型导入成功")
-        except Exception as e:
-            gr.Warning(f"Linly模型下载失败 {e}")
-    elif model_name == 'Qwen':
-        try:
-            llm = llm_class.init_model('Qwen', 'Qwen/Qwen-1_8B-Chat', prefix_prompt=prefix_prompt)
+        elif model_name == 'Qwen':
+            llm = llm_class.init_model('Qwen', 'Qwen/Qwen-1_8B-Chat', prefix_prompt=PREFIX_PROMPT)
             gr.Info("Qwen模型导入成功")
-        except Exception as e:
-            gr.Warning(f"Qwen模型下载失败 {e}")
-    elif model_name == 'Qwen2':
-        try:
-            llm = llm_class.init_model('Qwen2', 'Qwen/Qwen1.5-0.5B-Chat', prefix_prompt=prefix_prompt)
+        elif model_name == 'Qwen2':
+            llm = llm_class.init_model('Qwen2', 'Qwen/Qwen1.5-0.5B-Chat', prefix_prompt=PREFIX_PROMPT)
             gr.Info("Qwen2模型导入成功")
-        except Exception as e:
-            gr.Warning(f"Qwen2模型下载失败 {e}")
-    elif model_name == 'Gemini':
-        if gemini_apikey:
-            llm = llm_class.init_model('Gemini', 'gemini-pro', gemini_apikey, proxy_url)
-            gr.Info("Gemini模型导入成功")
-        else:
-            gr.Warning("请填写Gemini的api_key")
-    elif model_name == 'ChatGLM':
-        try:
-            llm = llm_class.init_model('ChatGLM', 'THUDM/chatglm3-6b', prefix_prompt=prefix_prompt)
+        elif model_name == 'Gemini':
+            if gemini_apikey:
+                llm = llm_class.init_model('Gemini', 'gemini-pro', gemini_apikey, proxy_url)
+                gr.Info("Gemini模型导入成功")
+            else:
+                gr.Warning("请填写Gemini的API密钥")
+        elif model_name == 'ChatGLM':
+            llm = llm_class.init_model('ChatGLM', 'THUDM/chatglm3-6b', prefix_prompt=PREFIX_PROMPT)
             gr.Info("ChatGLM模型导入成功")
-        except Exception as e:
-            gr.Warning(f"ChatGLM模型导入失败 {e}")
-    elif model_name == 'ChatGPT':
-        if openai_apikey:
-            llm = llm_class.init_model('ChatGPT', api_key=openai_apikey, proxy_url=proxy_url, prefix_prompt=prefix_prompt)
+        elif model_name == 'ChatGPT':
+            if openai_apikey:
+                llm = llm_class.init_model('ChatGPT', api_key=openai_apikey, proxy_url=proxy_url, prefix_prompt=PREFIX_PROMPT)
+                gr.Info("ChatGPT模型导入成功")
+            else:
+                gr.Warning("请填写OpenAI的API密钥")
+        elif model_name == '直接回复 Direct Reply':
+            llm = llm_class.init_model(model_name)
+            gr.Info("直接回复,不使用LLM模型")
+        elif model_name == 'GPT4Free':
+            llm = llm_class.init_model('GPT4Free', prefix_prompt=PREFIX_PROMPT)
+            gr.Info("GPT4Free模型导入成功,请注意该模型可能不稳定")
         else:
-            gr.Warning("请填写OpenAI的api_key")
-    elif model_name == '直接回复 Direct Reply':
-        llm =llm_class.init_model(model_name)
-        gr.Info("直接回复,不实用LLM模型")
-    elif model_name == 'GPT4Free':
-        try:
-            llm = llm_class.init_model('GPT4Free', prefix_prompt=prefix_prompt)
-            gr.Info("GPT4Free模型导入成功, 请注意GPT4Free可能不稳定")
-        except Exception as e:
-            gr.Warning(f"GPT4Free模型下载失败 {e}")
-    else:
-        gr.Warning("未知LLM模型,可提issue和PR 或者 建议更新模型")
+            gr.Warning("未知LLM模型,请检查模型名称或提出Issue")
+    except Exception as e:
+        gr.Warning(f"{model_name}模型加载失败: {e}")
+
     return model_name
 
-
 def talker_model_change(model_name, progress=gr.Progress(track_tqdm=True)):
+    """更换数字人对话模型,并根据选择的模型加载相应资源。"""
     global talker
-    # 清理显存,在加载新的模型之前释放不必要的显存
+    # 清理显存,释放不必要的显存以便加载新模型
     clear_memory()
 
     if model_name not in ['SadTalker', 'Wav2Lip', 'Wav2Lipv2', 'NeRFTalk']:
-        gr.Warning("其他模型还未集成,请等待")
-    if model_name == 'SadTalker':
-        try:
+        gr.Warning("其他模型暂未集成,请等待更新")
+        return model_name
+
+    try:
+        if model_name == 'SadTalker':
             from TFG import SadTalker
             talker = SadTalker(lazy_load=True)
             gr.Info("SadTalker模型导入成功")
-        except Exception as e:
-            gr.Warning("SadTalker模型加载失败", e)
-    elif model_name == 'Wav2Lip':
-        try:
+        elif model_name == 'Wav2Lip':
             from TFG import Wav2Lip
             clear_memory()
             talker = Wav2Lip("checkpoints/wav2lip_gan.pth")
             gr.Info("Wav2Lip模型导入成功")
-        except Exception as e:
-            gr.Warning("Wav2Lip模型加载失败", e)
-    elif model_name == 'Wav2Lipv2':
-        try:
+        elif model_name == 'Wav2Lipv2':
             from TFG import Wav2Lipv2
             clear_memory()
             talker = Wav2Lipv2('checkpoints/wav2lipv2.pth')
-            gr.Info("Wav2Lipv2模型导入成功, 能得到更高质量的结果")
-        except Exception as e:
-            gr.Warning("Wav2Lipv2模型加载失败", e)
-    elif model_name == 'NeRFTalk':
-        try:
+            gr.Info("Wav2Lipv2模型导入成功,能够生成更高质量的结果")
+        elif model_name == 'NeRFTalk':
             from TFG import ERNeRF
             talker = ERNeRF()
             talker.init_model('checkpoints/Obama_ave.pth', 'checkpoints/Obama.json')
             gr.Info("NeRFTalk模型导入成功")
-            gr.Warning("NeRFTalk模型是针对单个人进行训练的,内置了奥班马Obama的模型,上传图片无效")
-        except Exception as e:
-            gr.Warning("NeRFTalk模型加载失败", e)
-    else:
-        gr.Warning("未知TFG模型,可提issue和PR 或者 建议更新模型")
+            gr.Warning("NeRFTalk模型仅针对单个人训练,内置奥巴马模型,上传其他图片无效")
+    except Exception as e:
+        gr.Warning(f"{model_name}模型加载失败: {e}")
+
     return model_name
 
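The shape of this refactor recurs across all the `*_model_change` handlers: one `try` around the whole dispatch replaces a per-branch `try/except`, and a single generic warning replaces the per-model failure messages. A reduced, runnable model of the pattern (illustrative names, not the functions from webui.py):

```python
def warn(msg):
    # Stand-in for gr.Warning, which surfaces the message in the UI.
    print('WARN:', msg)

def model_change(model_name, loaders):
    """Dispatch to a loader by name; any loader failure produces one
    uniform warning instead of branch-specific handlers."""
    try:
        if model_name in loaders:
            loaders[model_name]()
        else:
            warn('未知模型,可提issue和PR')
    except Exception as e:
        warn(f'{model_name}模型加载失败: {e}')
    return model_name

def broken_loader():
    raise FileNotFoundError('checkpoints/wav2lip_gan.pth')

model_change('Wav2Lip', {'Wav2Lip': broken_loader})
# -> WARN: Wav2Lip模型加载失败: checkpoints/wav2lip_gan.pth
```

One consequence worth noting: because the `except` sits outside the dispatch, a failed load leaves the previous global model in place rather than half-replaced.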
 def tts_model_change(model_name, progress=gr.Progress(track_tqdm=True)):
+    """更换TTS模型,并根据选择的模型加载相应资源。"""
     global tts
-
-    # 清理显存,在加载新的模型之前释放不必要的显存
+    global cosyvoice
+    # 清理显存,释放不必要的显存以便加载新模型
     clear_memory()
 
-    if model_name == 'Edge-TTS':
-        # tts = EdgeTTS()
-        if edgetts.network:
-            gr.Info("EdgeTTS模型导入成功")
-        else:
-            gr.Warning("EdgeTTS模型加载失败,请检查网络是否正常连接,否则无法使用")
-    elif model_name == 'PaddleTTS':
-        try:
+    try:
+        if model_name == 'Edge-TTS':
+            # tts = EdgeTTS()  # Uncomment when implementation available
+            if edgetts.network:
+                gr.Info("EdgeTTS模型导入成功")
+            else:
+                gr.Warning("EdgeTTS模型加载失败,请检查网络连接")
+        elif model_name == 'PaddleTTS':
             from TTS import PaddleTTS
             tts = PaddleTTS()
-            gr.Info("PaddleTTS模型导入成功")
-        except Exception as e:
-            gr.Warning(f"PaddleTTS模型下载失败 {e}")
-    elif model_name == 'GPT-SoVITS克隆声音':
-        try:
+            gr.Info("PaddleTTS模型导入成功,效果有限,不建议使用")
+        elif model_name == 'GPT-SoVITS克隆声音':
             gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
             sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
             vits.load_model(gpt_path, sovits_path)
-            gr.Info("模型加载成功")
-        except Exception as e:
-            gr.Warning(f"模型加载失败 {e}")
-        gr.Warning("注意注意⚠️:GPT-SoVITS要上传参考音频进行克隆,请点击TTS Method语音方法调节操作")
-    else:
-        gr.Warning("未知TTS模型,可提issue和PR 或者 建议更新模型")
+            gr.Info("GPT-SoVITS模型加载成功,请上传参考音频进行克隆")
+        elif model_name == 'CosyVoice-SFT模式':
+            from VITS import CosyVoiceTTS
+            model_path = 'checkpoints/CosyVoice_ckpt/CosyVoice-300M-SFT'
+            cosyvoice = CosyVoiceTTS(model_path)
+            gr.Info("CosyVoice模型导入成功,适合使用SFT模式(使用微调后的数据)")
+        elif model_name == 'CosyVoice-克隆翻译模式':
+            from VITS import CosyVoiceTTS
+            model_path = 'checkpoints/CosyVoice_ckpt/CosyVoice-300M'
+            cosyvoice = CosyVoiceTTS(model_path)
+            gr.Info("CosyVoice模型导入成功,更适合进行克隆声音和翻译声音")
+        else:
+            gr.Warning("未知TTS模型,请检查模型名称或提出Issue")
+    except Exception as e:
+        gr.Warning(f"{model_name}模型加载失败: {e}")
+
     return model_name
 
 def success_print(text):
-    print(f"\033[1;32;40m{text}\033[0m")
+    """输出绿色文本,表示成功信息。"""
+    print(f"\033[1;32m{text}\033[0m")
 
 def error_print(text):
-    print(f"\033[1;31;40m{text}\033[0m")
-
+    """输出红色文本,表示错误信息。"""
+    print(f"\033[1;31m{text}\033[0m")
+
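On the `success_print`/`error_print` change: the patch drops the `;40` from the escape sequences. `40` is the ANSI code for a black background, which renders badly on light terminals; `32` (green) and `31` (red) alone recolor only the foreground. A quick demonstration:

```python
# ANSI SGR codes: 1 = bold, 32 = green fg, 31 = red fg, 0 = reset.
GREEN, RED, RESET = '\033[1;32m', '\033[1;31m', '\033[0m'

print(f'{GREEN}Success!{RESET} module loaded')
print(f'{RED}Error:{RESET} checkpoint missing')
```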
 if __name__ == "__main__":
+    # 初始化LLM类
     llm_class = LLM(mode='offline')
     llm = llm_class.init_model('直接回复 Direct Reply')
     success_print("默认不使用LLM模型,直接回复问题,同时减少显存占用!")
-
+
+    # 尝试加载GPT-SoVITS模块
     try:
         from VITS import *
         vits = GPT_SoVITS()
-        success_print("Success!!! GPT-SoVITS模块加载成功,语音克隆默认使用GPT-SoVITS模型")
-        # gpt_path = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
-        # sovits_path = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-        # vits.load_model(gpt_path, sovits_path)
+        success_print("Success! GPT-SoVITS模块加载成功,语音克隆默认使用GPT-SoVITS模型")
     except Exception as e:
-        error_print(f"GPT-SoVITS Error: {e}")
-        error_print("如果使用VITS,请先下载GPT-SoVITS模型和安装环境")
-
+        error_print(f"GPT-SoVITS 加载失败: {e}")
+        error_print("如果使用VITS,请先下载GPT-SoVITS模型并安装环境")
+
+    # 尝试加载SadTalker模块
     try:
         from TFG import SadTalker
         talker = SadTalker(lazy_load=True)
-        success_print("Success!!! SadTalker模块加载成功,默认使用SadTalker模型")
+        success_print("Success! SadTalker模块加载成功,默认使用SadTalker模型")
     except Exception as e:
-        error_print(f"SadTalker Error: {e}")
+        error_print(f"SadTalker 加载失败: {e}")
         error_print("如果使用SadTalker,请先下载SadTalker模型")
-
+
+    # 尝试加载Whisper ASR模块
     try:
         from ASR import WhisperASR
         asr = WhisperASR('base')
-        success_print("Success!!! WhisperASR模块加载成功,默认使用Whisper-base模型")
+        success_print("Success! WhisperASR模块加载成功,默认使用Whisper-base模型")
     except Exception as e:
-        error_print(f"ASR Error: {e}")
-        error_print("如果使用FunASR,请先下载WhisperASR模型和安装环境")
-
-    # 判断显存是否8g,若小于8g不建议使用MuseTalk功能
-    # Check if GPU is available and has at least 8GB of memory
+        error_print(f"WhisperASR 加载失败: {e}")
+        error_print("如果使用FunASR,请先下载WhisperASR模型并安装环境")
+
+    # 检查GPU显存
     if torch.cuda.is_available():
         gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)  # Convert bytes to GB
         if gpu_memory < 8:
             error_print("警告: 您的显卡显存小于8GB,不建议使用MuseTalk功能")
-
+
+    # 尝试加载MuseTalk模块
     try:
         from TFG import MuseTalk_RealTime
         musetalker = MuseTalk_RealTime()
-        success_print("Success!!! MuseTalk模块加载成功")
+        success_print("Success! MuseTalk模块加载成功")
     except Exception as e:
-        error_print(f"MuseTalk Error: {e}")
+        error_print(f"MuseTalk 加载失败: {e}")
         error_print("如果使用MuseTalk,请先下载MuseTalk模型")
 
-    tts = edgetts
-    if not tts.network:
-        error_print("EdgeTTS模块加载失败,请检查网络是否正常连接,否则无法使用")
+    # 尝试加载EdgeTTS模块
+    try:
+        tts = edgetts
+        if not tts.network:
+            error_print("EdgeTTS模块加载失败,请检查网络连接")
+    except Exception as e:
+        error_print(f"EdgeTTS 加载失败: {e}")
 
+    # Gradio UI的初始化和启动
     gr.close_all()
-    # demo_app = app()
     demo_img = app_img()
     demo_multi = app_multi()
-    # demo_vits = app_vits()
-    # demo_talk = app_talk()
     demo_muse = app_muse()
-    demo = gr.TabbedInterface(interface_list = [
-        # demo_app,
-        demo_img,
-        demo_multi,
-        # demo_vits,
-        # demo_talk,
-        demo_muse,
-        ],
-        tab_names = [
-            "个性化角色互动",
-            "数字人多轮智能对话",
-            "MuseTalk数字人实时对话"
-        ],
-        title = "Linly-Talker WebUI")
-    demo.queue()
-    demo.launch(server_name=ip, # 本地端口localhost:127.0.0.1 全局端口转发:"0.0.0.0"
-                server_port=port,
-                # 似乎在Gradio4.0以上版本可以不使用证书也可以进行麦克风对话
-                # ssl_certfile=ssl_certfile,
-                # ssl_keyfile=ssl_keyfile,
-                # ssl_verify=False,
-                # share=True,
-                debug=True,
-                )
\ No newline at end of file
+    demo = gr.TabbedInterface(
+        interface_list=[demo_img, demo_multi, demo_muse],
+        tab_names=["个性化角色互动", "数字人多轮智能对话", "MuseTalk数字人实时对话"],
+        title="Linly-Talker WebUI"
+    )
+    demo.queue(max_size=4, default_concurrency_limit=2)
+    demo.launch(
+        server_name=ip,  # 本地localhost:127.0.0.1 或 "0.0.0.0" 进行全局端口转发
+        server_port=port,
+        # ssl_certfile=ssl_certfile,  # SSL证书文件
+        # ssl_keyfile=ssl_keyfile,  # SSL密钥文件
+        # ssl_verify=False,
+        # share=True,
+        debug=True,
+    )
\ No newline at end of file
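The switch from a bare `demo.queue()` to `demo.queue(max_size=4, default_concurrency_limit=2)` bounds the pending-request queue and the number of events processed in parallel, which matters here because each request can hold GPU memory. A minimal sketch of the same queue/launch configuration, assuming the `gradio==4.16.0` pin from requirements_webui.txt (the `ip`/`port` values are placeholders for what webui.py reads from its config):

```python
import gradio as gr

with gr.Blocks(title='Linly-Talker WebUI') as demo:
    gr.Markdown('demo')

# max_size=4: reject new jobs once four are waiting;
# default_concurrency_limit=2: run at most two events at once.
demo.queue(max_size=4, default_concurrency_limit=2)
demo.launch(server_name='0.0.0.0', server_port=7860, debug=True)
```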