Merge pull request #25 from FrancescoCaracciolo/nyarch-sync-0.5.0

Nyarch sync 0.5.0
NyarchLinux · Dec 7, 2024 · 2feb280 · 2feb280
2 parents 58b4ca9 + 0a74e81
commit 2feb280
Show file tree

Hide file tree

Showing 10 changed files with 288 additions and 31 deletions.
diff --git a/data/moe.nyarchlinux.assistant.gschema.xml b/data/moe.nyarchlinux.assistant.gschema.xml
@@ -72,10 +72,19 @@
         </key>
 	      <key name="stt-settings" type="s">
 	        <default>"{}"</default>
-	      </key>
-	      <key name="welcome-screen-shown" type="b">
-	        <default>false</default>
-	      </key>
+        </key>
+        <key name="automatic-stt" type="b">
+          <default>false</default>
+        </key>
+        <key name="stt-silence-detection-threshold" type="d">
+          <default>0.07</default>
+        </key>
+        <key name="stt-silence-detection-duration" type="i">
+          <default>3</default>
+          </key> 
+        <key name="welcome-screen-shown" type="b"> 
+          <default>false</default> 
+        </key>
 	      <key name="custom-prompts" type="s">
 	        <default>"{}"</default>
         </key>

diff --git a/src/constants.py b/src/constants.py
@@ -1,5 +1,6 @@
+
 from .llm import BingHandler, GPT4AllHandler, GroqHandler, NyarchApiHandler, OllamaHandler, OpenAIHandler, CustomLLMHandler, GPT3AnyHandler, GeminiHandler, MistralHandler, OpenRouterHandler
-from .tts import VoiceVoxHanlder, gTTSHandler, EspeakHandler, CustomTTSHandler, VitsHandler, EdgeTTSHandler
+from .tts import VoiceVoxHanlder,ElevenLabs, gTTSHandler, EspeakHandler, CustomTTSHandler, VitsHandler, EdgeTTSHandler
 from .stt import GroqSRHandler, OpenAISRHandler, SphinxHandler, GoogleSRHandler, WhisperHandler, WitAIHandler, VoskHandler, CustomSRHandler
 
 from .avatar import Live2DHandler, LivePNGHandler
@@ -154,6 +155,12 @@
         "description": _("Google's text to speech"),
         "class": gTTSHandler,
     },
+    "elevenlabs": {
+        "key": "elevenlabs",
+        "title": _("ElevenLabs TTS"),
+        "description": _("Natural sounding TTS"),
+        "class": ElevenLabs,
+    },
     "voicevox": {
         "key": "voicevox",
         "title": _("Voicevox API"),

diff --git a/src/dataset.py b/src/dataset.py
@@ -197,6 +197,11 @@ def reconstruct_dataset_from_csv(filename):
 ```console
 sudo pacman -S ollama
 ```
+If you have a Nvidia GPU, remember to install ollama-cuda for better performances:
+```console 
+sudo pacman -S ollama-cuda
+```
+
 Then you can run ollama by executing this command:
 ```console
 ollama serve
@@ -205,7 +210,12 @@ def reconstruct_dataset_from_csv(filename):
 ```console
 ollama pull llama3.1
 ```
-("""
+
+You can also install custom models from HuggingFace:
+```console
+ollama pull hf.co/{username}/{repository}
+```
+"""
 }
 
 if is_flatpak():

diff --git a/src/llm.py b/src/llm.py
@@ -158,7 +158,7 @@ def generate_chat_name(self, request_prompt:str = "") -> str:
 class G4FHandler(LLMHandler):
     """Common methods for g4f models"""
     key = "g4f"
-    version = "0.3.3.4" 
+    version = "0.3.5.8" 
 
     @staticmethod
     def get_extra_requirements() -> list:
@@ -218,6 +218,7 @@ def generate_text(self, prompt: str, history: list[dict[str, str]] = [], system_
             return response.choices[0].message.content
         except Exception as e:
             return f"Error: {e}"
+
     def generate_text_stream(self, prompt: str, history: list[dict[str, str]] = [], system_prompt: list[str] = [], on_update: Callable[[str], Any] = lambda _: None, extra_args: list = []) -> str:
         model = self.get_setting("model")
         img = None
@@ -1070,7 +1071,7 @@ def get_extra_settings(self) -> list:
 
 class GroqHandler(OpenAIHandler):
     key = "groq"
-    default_models = (("llama-3.1-70B-versatile", "llama-3.1-70B-versatile" ), ) 
+    default_models = (("llama-3.3-70B-versatile", "llama-3.3-70B-versatile" ), ) 
     def supports_vision(self) -> bool:
         return "vision" in self.get_setting("model")
 

diff --git a/src/main.py b/src/main.py
@@ -205,6 +205,12 @@ def new_chat(self,*a):
         self.win.notification_block.add_toast(
                 Adw.Toast(title=_('Chat is created')))
 
+    def start_recording(self, *a):
+        """Forward the action to the window, passing its recording button."""
+        window = self.win
+        window.start_recording(window.recording_button)
+
+    def stop_tts(self, *a):
+        """Forward the action to the window, passing its mute-TTS button."""
+        window = self.win
+        window.mute_tts(window.mute_tts_button)
+
     def do_shutdown(self):
         self.win.save_chat()
         settings = Gio.Settings.new('moe.nyarchlinux.assistant')
@@ -221,4 +227,6 @@ def main(version):
     app.create_action('reload_folder', app.reload_folder, ['<primary>e'])
     app.create_action('new_chat', app.new_chat, ['<primary>t'])
     app.create_action('focus_message', app.focus_message, ['<primary>l'])
+    app.create_action('start_recording', app.start_recording, ['<primary>s'])
+    app.create_action('stop_tts', app.stop_tts, ['<primary>k'])
     app.run(sys.argv)
diff --git a/src/settings.py b/src/settings.py
@@ -47,6 +47,8 @@ def __init__(self,app,headless=False, *args, **kwargs):
         self.prompts_settings = json.loads(self.settings.get_string("prompts-settings"))
         self.prompts = override_prompts(self.custom_prompts, PROMPTS)
         self.sandbox = can_escape_sandbox()
+
+        self.cache_handlers()
         # Page building
         self.general_page = Adw.PreferencesPage()
 
@@ -109,6 +111,10 @@ def __init__(self,app,headless=False, *args, **kwargs):
         for stt_key in AVAILABLE_STT:
             row = self.build_row(AVAILABLE_STT, stt_key, selected, group)
             stt_engine.add_row(row)
+        # Automatic STT settings 
+        self.auto_stt = Adw.ExpanderRow(title=_('Automatic Speech To Text'), subtitle=_("Automatically restart speech to text at the end of a text/TTS"))
+        self.build_auto_stt()
+        self.Voicegroup.add(self.auto_stt)
 
         # Build the AVATAR settings
         self.avatargroup = Adw.PreferencesGroup(title=_('Avatar'))
@@ -136,6 +142,7 @@ def __init__(self,app,headless=False, *args, **kwargs):
         for smart_prompt_key in AVAILABLE_SMART_PROMPTS:
            row = self.build_row(AVAILABLE_SMART_PROMPTS, smart_prompt_key, selected, group) 
            smartprompt.add_row(row)
+
         # Prompts settings
         self.prompt = Adw.PreferencesGroup(title=_('Prompt control'))
         self.general_page.add(self.prompt)
@@ -213,6 +220,47 @@ def __init__(self,app,headless=False, *args, **kwargs):
 
         self.add(self.general_page)
 
+    def build_auto_stt(self):
+        """Populate the automatic speech-to-text expander row.
+
+        Adds an enable switch, bound to the ``automatic-stt`` key, as a suffix
+        of ``self.auto_stt``, followed by two rows whose sliders edit the
+        ``stt-silence-detection-threshold`` (double) and
+        ``stt-silence-detection-duration`` (int) keys.
+        """
+        auto_stt_enabled = Gtk.Switch(valign=Gtk.Align.CENTER)
+        self.settings.bind("automatic-stt", auto_stt_enabled, 'active', Gio.SettingsBindFlags.DEFAULT)
+        self.auto_stt.add_suffix(auto_stt_enabled)
+
+        def update_scale(scale, label, setting_key, value_type):
+            """Store the slider value in the settings and mirror it in the label."""
+            value = scale.get_value()
+            if value_type is float:
+                self.settings.set_double(setting_key, value)
+            elif value_type is int:
+                value = int(value)
+                self.settings.set_int(setting_key, value)
+            label.set_text(str(value))
+
+        def build_scale_box(setting_key, value_type, upper, round_digits):
+            """Return a box holding a slider for ``setting_key`` and its value label."""
+            scale = Gtk.Scale(digits=0, round_digits=round_digits)
+            scale.set_range(0, upper)
+            scale.set_size_request(120, -1)
+            if value_type is float:
+                current = self.settings.get_double(setting_key)
+            else:
+                current = self.settings.get_int(setting_key)
+            label = Gtk.Label(label=str(current))
+            # Set the initial value before connecting so the handler does not
+            # write the stored value straight back to the settings.
+            scale.set_value(current)
+            scale.connect("value-changed", update_scale, label, setting_key, value_type)
+            box = Gtk.Box()
+            box.append(scale)
+            box.append(label)
+            return box
+
+        # Silence threshold: a fraction of the volume, not a duration.
+        silence_threshold = Adw.ActionRow(title=_("Silence threshold"), subtitle=_("Silence threshold, percentage of the volume to be considered silence"))
+        silence_threshold.add_suffix(build_scale_box("stt-silence-detection-threshold", float, 0.5, 2))
+        # Silence time
+        silence_time = Adw.ActionRow(title=_("Silence time"), subtitle=_("Silence time in seconds before recording stops automatically"))
+        silence_time.add_suffix(build_scale_box("stt-silence-detection-duration", int, 10, 0))
+        self.auto_stt.add_row(silence_threshold)
+        self.auto_stt.add_row(silence_time)
 
     def update_prompt(self, switch: Gtk.Switch, state, key: str):
         """Update the prompt in the settings
@@ -269,6 +317,14 @@ def build_row(self, constants: dict[str, Any], key: str, selected: str, group: G
         row.add_prefix(button)
         return row
 
+    def cache_handlers(self):
+        """Create every TTS, STT and LLM handler once and keep them in ``self.handlers``.
+
+        Entries are keyed by ``(key, self.convert_constants(constants))``, the
+        same key ``get_object`` looks up before building a new handler.
+        """
+        self.handlers = {}
+        for constants in (AVAILABLE_TTS, AVAILABLE_STT, AVAILABLE_LLMS):
+            for key in constants:
+                handler = self.get_object(constants, key)
+                self.handlers[(key, self.convert_constants(constants))] = handler
 
     def get_object(self, constants: dict[str, Any], key:str) -> (Handler):
 
@@ -284,12 +340,15 @@ def get_object(self, constants: dict[str, Any], key:str) -> (Handler):
         Returns:
             The created handler           
         """
+        if (key, self.convert_constants(constants)) in self.handlers:
+            return self.handlers[(key, self.convert_constants(constants))]
+
         if constants == AVAILABLE_LLMS:
             model = constants[key]["class"](self.settings, os.path.join(self.directory, "pip"))
         elif constants == AVAILABLE_STT:
             model = constants[key]["class"](self.settings,os.path.join(self.directory, "models"))
         elif constants == AVAILABLE_TTS:
-            model = constants[key]["class"](self.settings, self.directory)
+            model = constants[key]["class"](self.settings, os.path.join(self.directory, "pip"))
         elif constants == AVAILABLE_AVATARS:
             model = constants[key]["class"](self.settings, self.directory)
         elif constants == AVAILABLE_TRANSLATORS:

diff --git a/src/shortcuts.py b/src/shortcuts.py
@@ -17,6 +17,8 @@ def __init__(self,app, *args, **kwargs):
         gr.append(Gtk.ShortcutsShortcut(title=_("New tab"), accelerator='<primary>t'))
         gr.append(Gtk.ShortcutsShortcut(title=_("Paste Image"), accelerator='<primary>v'))
         gr.append(Gtk.ShortcutsShortcut(title=_("Focus message box"), accelerator='<primary>l'))
+        gr.append(Gtk.ShortcutsShortcut(title=_("Start recording"), accelerator='<primary>s'))
+        gr.append(Gtk.ShortcutsShortcut(title=_("Stop TTS"), accelerator='<primary>k'))
 
         sect_main.append(gr)
         self.set_child(sect_main)
diff --git a/src/stt.py b/src/stt.py
@@ -5,38 +5,60 @@
 from typing import Any
 import pyaudio
 import wave
+import struct
 import speech_recognition as sr
 from .extra import find_module, get_spawn_command, install_module
 from .handler import Handler
+import math
+
 
 class AudioRecorder:
-    """Record audio"""
-    def __init__(self):
+    """Record audio with optional auto-stop on silence detection."""
+    def __init__(self, auto_stop: bool = False, stop_function: callable = lambda *_: (), silence_threshold_percent: float = 0.01, silence_duration: int = 2):
+        """Set up the recorder state and audio parameters.
+
+        Args:
+            auto_stop: when true, the recording loop stops itself once enough
+                consecutive silent chunks have been read.
+            stop_function: called with no arguments at the end of
+                ``save_recording``. The default accepts any arguments and
+                does nothing.
+            silence_threshold_percent: fraction of ``max_rms`` below which a
+                chunk counts as silent.
+            silence_duration: seconds of continuous silence that end the
+                recording when ``auto_stop`` is set.
+        """
         self.recording = False
         self.frames = []
+        self.auto_stop = auto_stop
+        self.stop_function = stop_function
+        self.silence_threshold_percent = silence_threshold_percent
+        self.silence_duration = silence_duration
         self.sample_format = pyaudio.paInt16
         self.channels = 1
         self.sample_rate = 44100
         self.chunk_size = 1024
+        self.silent_chunks = 0
+        self.max_rms = 32767  # Maximum possible RMS for 16-bit audio
 
-    def start_recording(self):
+    def start_recording(self, output_file):
+        """Record from an input stream until stopped, then save to ``output_file``.
+
+        Blocks while ``self.recording`` is true. When ``self.auto_stop`` is set,
+        the loop also ends itself after ``self.silence_duration`` seconds of
+        consecutive chunks whose RMS is below the silence threshold.
+        """
         self.recording = True
         self.frames = []
+        self.silent_chunks = 0
         p = pyaudio.PyAudio()
         stream = p.open(format=self.sample_format,
                         channels=self.channels,
                         rate=self.sample_rate,
                         frames_per_buffer=self.chunk_size,
                         input=True)
+        # Absolute RMS level below which a chunk counts as silent.
+        silence_threshold = self.max_rms * self.silence_threshold_percent
         while self.recording:
             data = stream.read(self.chunk_size)
             self.frames.append(data)
+            if self.auto_stop:
+                rms = self._calculate_rms(data)
+                if rms < silence_threshold:
+                    self.silent_chunks += 1
+                else:
+                    self.silent_chunks = 0
+                # sample_rate / chunk_size is the number of chunks read per second.
+                if self.silent_chunks >= self.silence_duration * (self.sample_rate / self.chunk_size):
+                    self.recording = False
         stream.stop_stream()
         stream.close()
         p.terminate()
+        # Saving happens here, after the loop, for both manual and automatic stops.
+        self.save_recording(output_file)
 
     def stop_recording(self, output_file):
+        """Clear the recording flag so the loop in ``start_recording`` exits.
+
+        ``output_file`` is not used here; ``start_recording`` saves the audio
+        once its loop has ended.
+        """
         self.recording = False
+
+    def save_recording(self, output_file):
         p = pyaudio.PyAudio()
         wf = wave.open(output_file, 'wb')
         wf.setnchannels(self.channels)
@@ -45,6 +67,18 @@ def stop_recording(self, output_file):
         wf.writeframes(b''.join(self.frames))
         wf.close()
         p.terminate()
+        self.stop_function()
+
+    def _calculate_rms(self, data):
+        """Return the root mean square of a chunk of 16-bit PCM audio.
+
+        The bytes are decoded as little-endian signed shorts and the mean of
+        the chunk is subtracted from every sample before squaring.
+
+        Args:
+            data: raw audio bytes, two bytes per sample.
+
+        Returns:
+            The RMS amplitude as a float, or 0.0 when ``data`` does not hold
+            a complete sample.
+        """
+        count = len(data) // 2  # Each sample is 2 bytes (16-bit)
+        if count == 0:
+            # An empty read would otherwise divide by zero below.
+            return 0.0
+        # Ignore a trailing odd byte so the buffer length matches the format.
+        samples = struct.unpack(f"<{count}h", data[:count * 2])
+        mean = sum(samples) / count
+        deviations = [sample - mean for sample in samples]
+        sum_squares = sum(deviation * deviation for deviation in deviations)
+        return (sum_squares / count) ** 0.5
 
 
 class STTHandler(Handler):