add whisper voice to interact
ChristopherTrimboli committed Dec 10, 2023
1 parent 0bf55b5 commit 3f0def4
Showing 4 changed files with 143 additions and 7 deletions.
33 changes: 30 additions & 3 deletions package-lock.json

Some generated files are not rendered by default.

7 changes: 5 additions & 2 deletions package.json
@@ -1,7 +1,7 @@
{
"name": "lala-companion",
"productName": "lala-companion",
"version": "0.0.4",
"version": "0.0.5",
"description": "3D personified desktop assistants, tuned for you, powered by AI vision and voice.",
"main": ".vite/build/main.js",
"scripts": {
@@ -30,6 +30,7 @@
"@electron-forge/plugin-auto-unpack-natives": "^7.2.0",
"@electron-forge/plugin-vite": "^7.2.0",
"@electron-forge/publisher-github": "^7.2.0",
"@types/hark": "^1.2.5",
"@types/react-dom": "^18.2.17",
"@typescript-eslint/eslint-plugin": "^5.62.0",
"@typescript-eslint/parser": "^5.62.0",
@@ -46,9 +47,11 @@
"ai": "^2.2.28",
"dotenv": "^16.3.1",
"electron-squirrel-startup": "^1.0.0",
"hark": "^1.2.3",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"three": "^0.159.0",
"update-electron-app": "^3.0.0"
"update-electron-app": "^3.0.0",
"wavesurfer.js": "^7.5.1"
}
}
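
(Not part of the commit — a minimal sketch of the voice-capture pattern the two new dependencies enable, condensed from the Overlay change further down: hark does voice-activity detection on the microphone stream, and the wavesurfer.js record plugin captures audio between the "speaking" and "stopped_speaking" events. Names here are illustrative only.)

import hark from "hark";
import WaveSurfer from "wavesurfer.js";
import RecordPlugin from "wavesurfer.js/dist/plugins/record";

const listenForSpeech = async (onClip: (blob: Blob) => void) => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

  // hark emits "speaking" / "stopped_speaking" based on the stream's audio level.
  const speech = hark(stream);

  // A zero-height wavesurfer instance whose record plugin does the actual capture.
  const wavesurfer = WaveSurfer.create({ container: "#recorder", height: 0 });
  const recorder = wavesurfer.registerPlugin(
    RecordPlugin.create({ renderRecordedAudio: false })
  );

  recorder.on("record-end", onClip); // fires with the recorded Blob
  speech.on("speaking", () => recorder.startRecording());
  speech.on("stopped_speaking", () => recorder.stopRecording());
};
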
10 changes: 9 additions & 1 deletion src/components/VRMCompanion.tsx
@@ -48,12 +48,16 @@ interface VrmCompanionProps {
virtualText: string;
voiceUrl: string;
audioRef?: MutableRefObject<HTMLAudioElement>;
onSpeakStart?: () => void;
onSpeakEnd?: () => void;
}

const VrmCompanion = ({
virtualText,
voiceUrl,
audioRef,
onSpeakStart,
onSpeakEnd,
}: VrmCompanionProps) => {
const [gltf, setGltf] = useState(null);
const [animationMixer, setAnimationMixer] = useState<AnimationMixer>(null);
@@ -199,6 +203,7 @@ const VrmCompanion = ({
useEffect(() => {
const main = async () => {
if (voiceUrl) {
onSpeakStart?.();
audioRef.current.src = voiceUrl;
audioRef.current.play();

@@ -222,6 +227,7 @@ const VrmCompanion = ({
lipsAction.play();

audioRef.current.onended = async () => {
onSpeakEnd?.();
animationMixer.clipAction(talkClip).fadeOut(2);
lipsAction.fadeOut(1);
const randomIdle = getRandomAnimation("idle");
@@ -263,7 +269,7 @@ const VrmCompanion = ({
);
};

- const Scene = ({ virtualText, voiceUrl }: VrmCompanionProps) => {
+ const Scene = ({ virtualText, voiceUrl, onSpeakStart, onSpeakEnd }: VrmCompanionProps) => {
const audioRef = useRef<HTMLAudioElement>(null);

return (
@@ -287,6 +293,8 @@ const Scene = ({ virtualText, voiceUrl }: VrmCompanionProps) => {
virtualText={virtualText}
voiceUrl={voiceUrl}
audioRef={audioRef}
onSpeakStart={onSpeakStart}
onSpeakEnd={onSpeakEnd}
/>
</Canvas>
<audio autoPlay ref={audioRef} src={""} />
100 changes: 99 additions & 1 deletion src/overlay/Overlay.tsx
@@ -1,10 +1,14 @@
import React, { useCallback, useEffect, useState } from "react";
import VRMCompanion from "../components/VRMCompanion";
import { useChat } from "../../node_modules/ai/react/dist/index";
import hark from "hark";
import WaveSurfer from "wavesurfer.js";
import RecordPlugin from "wavesurfer.js/dist/plugins/record";

const Overlay = () => {
const [voiceUrl, setVoiceUrl] = useState<string>("");
const [recentResponse, setRecentResponse] = useState<string>("");
const [isLalaSpeaking, setIsLalaSpeaking] = useState<boolean>(false);

const getVoiceAudio = useCallback(async (text: string) => {
try {
@@ -52,14 +56,108 @@ const Overlay = () => {
});
}, []);

// whisper chunking magic here
useEffect(() => {
let stream: MediaStream = null;
let speechEvents: hark.Harker = null;
let wavesurfer: WaveSurfer = null;
let recorder: RecordPlugin = null;
let isUserSpeaking = false;
let isLoading = false;

const main = async () => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
speechEvents = hark(stream);

wavesurfer = WaveSurfer.create({
container: "#recorder",
height: 0,
});

recorder = wavesurfer.registerPlugin(
RecordPlugin.create({
scrollingWaveform: true,
renderRecordedAudio: false,
})
);

speechEvents.on("speaking", () => {
if (isLalaSpeaking || isLoading) return;
isUserSpeaking = true;
recorder.startRecording();
console.log("Started speaking");
});

speechEvents.on("stopped_speaking", () => {
if (isLalaSpeaking) return;
isLoading = true;
recorder.stopRecording();
isUserSpeaking = false;
console.log("Stopped speaking");
});

recorder.on("record-end", async (blob) => {
console.log("recording stopped");
const formData = new FormData();

const file = new File([blob], "voice.wav", {
type: "audio/wav",
});

console.log(file);

formData.append("file", file);

const whisperResp = await fetch(
"https://lalaland.chat/api/magic/whisper",
{
method: "POST",
body: formData,
}
);

if (whisperResp.ok) {
const whisperText = await whisperResp.json();
console.log(whisperText);
await append({
role: "user",
content: whisperText,
});
setTimeout(() => {
isLoading = false;
}, 5000);
} else {
console.log("error whispering", whisperResp);
isLoading = false;
}
});
};
main();

return () => {
stream?.getTracks().forEach((track) => track.stop());
speechEvents?.stop();
wavesurfer?.destroy();
recorder?.destroy();
isUserSpeaking = false;
isLoading = false;
};
}, [isLalaSpeaking]);

return (
<div
style={{
height: "100%",
width: "100%",
}}
>
- <VRMCompanion virtualText={recentResponse} voiceUrl={voiceUrl} />
+ <VRMCompanion
+   virtualText={recentResponse}
+   voiceUrl={voiceUrl}
+   onSpeakStart={() => setIsLalaSpeaking(true)}
+   onSpeakEnd={() => setIsLalaSpeaking(false)}
+ />
<div id="recorder" />
</div>
);
};
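
(Not part of the commit — the overlay above posts multipart form data with a "file" field to https://lalaland.chat/api/magic/whisper and reads the transcription back with whisperResp.json(). The endpoint itself lives outside this repo; assuming it simply proxies to OpenAI's Whisper transcription API, a hypothetical handler could look roughly like this.)

import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Hypothetical handler: `req` is a Fetch-API Request carrying the multipart body.
export const handleWhisper = async (req: Request): Promise<Response> => {
  const form = await req.formData();
  const file = form.get("file") as File; // the "voice.wav" blob sent by the overlay

  const transcription = await openai.audio.transcriptions.create({
    file,
    model: "whisper-1",
  });

  // The overlay appends the parsed JSON directly as the user's chat message,
  // so the transcribed text is returned as a bare JSON string.
  return Response.json(transcription.text);
};
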
