diff --git a/crates/goose-server/Cargo.toml b/crates/goose-server/Cargo.toml
index c410f8e2e..c15d4aec3 100644
--- a/crates/goose-server/Cargo.toml
+++ b/crates/goose-server/Cargo.toml
@@ -10,10 +10,10 @@ description.workspace = true
 [dependencies]
 goose = { path = "../goose" }
 mcp-core = { path = "../mcp-core" }
-axum = { version = "0.7", features = ["ws"] }
+axum = { version = "0.7", features = ["ws", "multipart"] }
 tokio = { version = "1.0", features = ["full"] }
 chrono = "0.4"
-tower-http = { version = "0.5", features = ["cors"] }
+tower-http = { version = "0.5", features = ["cors", "trace"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 futures = "0.3"
@@ -26,6 +26,7 @@ http-body-util = "0.1"
 http = "1.0"
 config = { version = "0.14.1", features = ["toml"] }
 thiserror = "1.0"
+tempfile = "3.8"
 
 [[bin]]
 name = "goosed"
diff --git a/crates/goose-server/src/routes/mod.rs b/crates/goose-server/src/routes/mod.rs
index 2d798a0da..f83671373 100644
--- a/crates/goose-server/src/routes/mod.rs
+++ b/crates/goose-server/src/routes/mod.rs
@@ -1,9 +1,12 @@
 // Export route modules
 pub mod reply;
+pub mod transcribe;
 
 use axum::Router;
 
 // Function to configure all routes
 pub fn configure(state: crate::state::AppState) -> Router {
-    Router::new().merge(reply::routes(state))
+    Router::new()
+        .merge(reply::routes(state.clone()))
+        .merge(transcribe::routes())
 }
diff --git a/crates/goose-server/src/routes/transcribe.rs b/crates/goose-server/src/routes/transcribe.rs
new file mode 100644
index 000000000..21ace4985
--- /dev/null
+++ b/crates/goose-server/src/routes/transcribe.rs
@@ -0,0 +1,476 @@
+use axum::{
+    extract::Multipart,
+    http::StatusCode,
+    routing::{get, post},
+    Json, Router,
+};
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use std::io::Write;
+use std::path::PathBuf;
+use std::process::Command;
+use std::sync::Arc;
+use tempfile::Builder;
+use tokio::fs;
+use tokio::sync::{OnceCell, RwLock};
+use tower_http::cors::{Any, CorsLayer};
+
+// Status tracking
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WhisperStatus {
+    installed: bool,
+    built: bool,
+    model_downloaded: bool,
+}
+
+impl WhisperStatus {
+    fn new() -> Self {
+        Self {
+            installed: false,
+            built: false,
+            model_downloaded: false,
+        }
+    }
+
+    fn is_ready(&self) -> bool {
+        self.installed && self.built && self.model_downloaded
+    }
+}
+
+static STATUS: OnceCell<Arc<RwLock<WhisperStatus>>> = OnceCell::const_new();
+static INIT: OnceCell<()> = OnceCell::const_new();
+
+async fn get_status() -> Arc<RwLock<WhisperStatus>> {
+    STATUS
+        .get_or_init(|| async { Arc::new(RwLock::new(WhisperStatus::new())) })
+        .await
+        .clone()
+}
+
+/// Ensures whisper is built and the model is downloaded
+async fn ensure_whisper() {
+    INIT.get_or_init(|| async {
+        let status = get_status().await;
+
+        // Get the project root directory
+        let mut project_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        project_dir.pop(); // go up from goose-server
+        project_dir.pop(); // go up from crates
+        let project_dir = Arc::new(project_dir);
+
+        // Check whisper directory and installation
+        let whisper_dir = project_dir.join("whisper.cpp");
+        if whisper_dir.exists() {
+            let mut status = status.write().await;
+            status.installed = true;
+        }
+
+        // Check whisper executable
+        let whisper_path = project_dir.join("whisper.cpp/build/bin/main");
+        if whisper_path.exists() {
+            let mut status = status.write().await;
+            status.built = true;
+        }
+
+        // Check model file
+        let model_path = project_dir.join("whisper.cpp/models/ggml-base.en.bin");
+        if model_path.exists() {
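+            // Final readiness flag; /whisper-status reports all three to the UI.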
+            let mut status = status.write().await;
+            status.model_downloaded = true;
+        }
+    })
+    .await;
+}
+
+/// Get the current status of whisper setup
+async fn whisper_status() -> Json<serde_json::Value> {
+    ensure_whisper().await;
+    let status = get_status().await;
+    let status = status.read().await;
+    Json(json!({
+        "ready": status.is_ready(),
+        "status": {
+            "installed": status.installed,
+            "built": status.built,
+            "model_downloaded": status.model_downloaded
+        }
+    }))
+}
+
+pub fn routes() -> Router {
+    // Spawn the initialization in the background
+    tokio::spawn(ensure_whisper());
+
+    Router::new()
+        .route("/transcribe", post(transcribe))
+        .route("/whisper-status", get(whisper_status))
+        .layer(
+            CorsLayer::new()
+                .allow_origin(Any)
+                .allow_methods(Any)
+                .allow_headers(Any),
+        )
+}
+
+/// Check if whisper is ready for transcription
+async fn check_whisper_ready() -> Result<(), (StatusCode, Json<serde_json::Value>)> {
+    ensure_whisper().await;
+    let status = get_status().await;
+    let status = status.read().await;
+
+    if !status.is_ready() {
+        Err((
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({
+                "success": false,
+                "error": "Whisper is not ready yet",
+                "status": {
+                    "installed": status.installed,
+                    "built": status.built,
+                    "model_downloaded": status.model_downloaded
+                }
+            })),
+        ))
+    } else {
+        Ok(())
+    }
+}
+
+async fn transcribe(
+    mut multipart: Multipart,
+) -> Result<Json<serde_json::Value>, (StatusCode, Json<serde_json::Value>)> {
+    // Check if whisper is ready
+    check_whisper_ready().await?;
+
+    eprintln!("Starting transcription process...");
+
+    while let Some(field) = multipart.next_field().await.unwrap() {
+        eprintln!("Processing multipart field: {:?}", field.name());
+
+        if let Ok(data) = field.bytes().await {
+            eprintln!("Received audio data of size: {} bytes", data.len());
+            if data.len() == 0 {
+                eprintln!("Error: Received empty audio data");
+                return Err((
+                    StatusCode::BAD_REQUEST,
+                    Json(json!({
+                        "success": false,
+                        "error": "Received empty audio data"
+                    })),
+                ));
+            }
+
+            // Create temporary files with proper extensions
+            let webm_file = match Builder::new().suffix(".webm").tempfile() {
+                Ok(file) => file,
+                Err(e) => {
+                    eprintln!("Error creating WebM tempfile: {}", e);
+                    return Err((
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(json!({
+                            "success": false,
+                            "error": format!("Failed to create temporary WebM file: {}", e)
+                        })),
+                    ));
+                }
+            };
+            let webm_path = webm_file.path().to_str().unwrap().to_string();
+
+            let wav_file = match Builder::new().suffix(".wav").tempfile() {
+                Ok(file) => file,
+                Err(e) => {
+                    eprintln!("Error creating WAV tempfile: {}", e);
+                    return Err((
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(json!({
+                            "success": false,
+                            "error": format!("Failed to create temporary WAV file: {}", e)
+                        })),
+                    ));
+                }
+            };
+            let wav_path = wav_file.path().to_str().unwrap().to_string();
+
+            // Write the WebM data
+            match webm_file.as_file().write_all(&data) {
+                Ok(_) => eprintln!("Successfully wrote WebM data to temporary file"),
+                Err(e) => {
+                    eprintln!("Error writing WebM data: {}", e);
+                    return Err((
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(json!({
+                            "success": false,
+                            "error": format!("Failed to write WebM data: {}", e)
+                        })),
+                    ));
+                }
+            }
+
+            // Get the path to the whisper executable in the project directory
+            let mut whisper_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+            whisper_path.pop(); // go up from goose-server
+            whisper_path.pop(); // go up from crates
+            whisper_path.push("whisper.cpp");
+            whisper_path.push("build");
+            whisper_path.push("bin");
+            whisper_path.push("main");
+
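+            // NOTE: env!("CARGO_MANIFEST_DIR") expands at compile time, so these
+            // paths assume the server runs from the source checkout it was built in.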
+            // Get the path to the model file
+            let mut model_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+            model_path.pop(); // go up from goose-server
+            model_path.pop(); // go up from crates
+            model_path.push("whisper.cpp");
+            model_path.push("models");
+            model_path.push("ggml-base.en.bin");
+
+            eprintln!("Paths configuration:");
+            eprintln!("Whisper path: {:?}", whisper_path);
+            eprintln!("Model path: {:?}", model_path);
+            eprintln!("WebM path: {:?}", webm_path);
+            eprintln!("WAV path: {:?}", wav_path);
+
+            // Verify whisper executable exists
+            if !whisper_path.exists() {
+                eprintln!("Error: Whisper executable not found at {:?}", whisper_path);
+                return Err((
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    Json(json!({
+                        "success": false,
+                        "error": "Whisper executable not found"
+                    })),
+                ));
+            }
+
+            // Verify model exists
+            if !model_path.exists() {
+                eprintln!("Error: Whisper model not found at {:?}", model_path);
+                return Err((
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    Json(json!({
+                        "success": false,
+                        "error": "Whisper model not found"
+                    })),
+                ));
+            }
+
+            // Check WebM file size
+            let webm_size = match std::fs::metadata(&webm_path) {
+                Ok(metadata) => metadata.len(),
+                Err(e) => {
+                    eprintln!("Error getting WebM file metadata: {}", e);
+                    return Err((
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(json!({
+                            "success": false,
+                            "error": format!("Failed to verify WebM file: {}", e)
+                        })),
+                    ));
+                }
+            };
+            eprintln!("WebM file size: {} bytes", webm_size);
+
+            // Check WebM file content
+            eprintln!("Analyzing WebM file with FFprobe...");
+            let ffprobe_webm = Command::new("ffprobe")
+                .arg("-v")
+                .arg("error") // Only show errors
+                .arg("-show_format")
+                .arg("-show_streams")
+                .arg(&webm_path)
+                .output()
+                .unwrap();
+
+            let webm_probe_output = String::from_utf8_lossy(&ffprobe_webm.stdout);
+            eprintln!("WebM FFprobe analysis:");
+            eprintln!("{}", webm_probe_output);
+
+            if !ffprobe_webm.status.success() {
+                eprintln!(
+                    "WebM FFprobe error: {}",
+                    String::from_utf8_lossy(&ffprobe_webm.stderr)
+                );
+                return Err((
+                    StatusCode::BAD_REQUEST,
+                    Json(json!({
+                        "success": false,
+                        "error": format!("Invalid WebM file: {}", String::from_utf8_lossy(&ffprobe_webm.stderr))
+                    })),
+                ));
+            }
+
+            // Run ffmpeg to convert WebM to WAV
+            eprintln!("Converting WebM to WAV...");
+            let ffmpeg_output = Command::new("ffmpeg")
+                .arg("-hide_banner")
+                .arg("-loglevel")
+                .arg("debug") // Increased logging level
+                .arg("-i")
+                .arg(&webm_path)
+                .arg("-vn") // Ignore video stream if present
+                .arg("-acodec")
+                .arg("pcm_s16le") // Force audio codec
+                .arg("-ar")
+                .arg("16000") // Sample rate that whisper expects
+                .arg("-ac")
+                .arg("1") // Mono audio
+                .arg("-f")
+                .arg("wav") // Force WAV format
+                .arg("-y") // Overwrite output file
+                .arg(&wav_path)
+                .output()
+                .unwrap();
+
+            eprintln!("FFmpeg conversion details:");
+            eprintln!("stdout: {}", String::from_utf8_lossy(&ffmpeg_output.stdout));
+            eprintln!("stderr: {}", String::from_utf8_lossy(&ffmpeg_output.stderr));
+
+            if !ffmpeg_output.status.success() {
+                eprintln!("FFmpeg conversion failed!");
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(json!({
+                        "success": false,
+                        "error": format!("FFmpeg conversion failed: {}", String::from_utf8_lossy(&ffmpeg_output.stderr))
+                    })),
+                ));
+            }
+
+            // Check WAV file size
+            let wav_size = match std::fs::metadata(&wav_path) {
+                Ok(metadata) => metadata.len(),
+                Err(e) => {
+                    eprintln!("Error getting WAV file metadata: {}", e);
+                    return Err((
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(json!({
+                            "success": false,
+                            "error": format!("Failed to verify WAV file: {}", e)
+                        })),
+                    ));
+                }
+            };
+            eprintln!("WAV file size: {} bytes", wav_size);
+
+            // Check if WAV file exists and has content
+            if wav_size == 0 {
+                eprintln!("Error: WAV file is empty!");
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(json!({
+                        "success": false,
+                        "error": "WAV conversion failed - output file is empty"
+                    })),
+                ));
+            }
+
+            // Analyze WAV file
+            eprintln!("Analyzing WAV file with FFprobe...");
+            let ffprobe_wav = Command::new("ffprobe")
+                .arg("-v")
+                .arg("error") // Only show errors
+                .arg("-show_format")
+                .arg("-show_streams")
+                .arg(&wav_path)
+                .output()
+                .unwrap();
+
+            let wav_probe_output = String::from_utf8_lossy(&ffprobe_wav.stdout);
+            eprintln!("WAV FFprobe analysis:");
+            eprintln!("{}", wav_probe_output);
+
+            if !ffprobe_wav.status.success() {
+                eprintln!(
+                    "WAV FFprobe error: {}",
+                    String::from_utf8_lossy(&ffprobe_wav.stderr)
+                );
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(json!({
+                        "success": false,
+                        "error": format!("Invalid WAV file: {}", String::from_utf8_lossy(&ffprobe_wav.stderr))
+                    })),
+                ));
+            }
+
+            // Run whisper transcription
+            eprintln!("Running whisper on WAV file...");
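+            // -otxt makes whisper write the transcript to "<wav_path>.txt",
+            // which is read back after the process exits.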
+            let output = Command::new(&whisper_path)
+                .arg("-m")
+                .arg(&model_path)
+                .arg("-f")
+                .arg(&wav_path)
+                .arg("-l")
+                .arg("en")
+                .arg("-t")
+                .arg("4")
+                .arg("-pp")
+                .arg("0")
+                .arg("-otxt")
+                .output()
+                .unwrap();
+
+            eprintln!("Whisper process completed");
+            eprintln!(
+                "Whisper stdout: {}",
+                String::from_utf8_lossy(&output.stdout)
+            );
+            eprintln!(
+                "Whisper stderr: {}",
+                String::from_utf8_lossy(&output.stderr)
+            );
+
+            if output.status.success() {
+                // Read the output text file
+                let txt_path = format!("{}.txt", wav_path);
+                match fs::read_to_string(&txt_path).await {
+                    Ok(text) => {
+                        // Clean up temporary files
+                        eprintln!("Cleaning up temporary files...");
+                        let _ = fs::remove_file(&txt_path).await;
+
+                        eprintln!("Transcription successful: {}", text.trim());
+                        return Ok(Json(json!({
+                            "success": true,
+                            "text": text.trim()
+                        })));
+                    }
+                    Err(e) => {
+                        eprintln!("Error reading transcription output: {}", e);
+                        return Err((
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            Json(json!({
+                                "success": false,
+                                "error": format!("Failed to read transcription output: {}", e)
+                            })),
+                        ));
+                    }
+                }
+            } else {
+                eprintln!("Whisper process failed");
+                eprintln!("Error output: {}", String::from_utf8_lossy(&output.stderr));
+                eprintln!(
+                    "Standard output: {}",
+                    String::from_utf8_lossy(&output.stdout)
+                );
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(json!({
+                        "success": false,
+                        "error": format!("Whisper failed: {}", String::from_utf8_lossy(&output.stderr))
+                    })),
+                ));
+            }
+        } else {
+            eprintln!("Error: Failed to read audio data from multipart field");
+        }
+    }
+
+    eprintln!("Error: No valid audio data found in request");
+    Err((
+        StatusCode::BAD_REQUEST,
+        Json(json!({
+            "success": false,
+            "error": "Failed to process audio"
+        })),
+    ))
+}
diff --git a/scripts/install_whisper.sh b/scripts/install_whisper.sh
new file mode 100755
index 000000000..b6cb6b309
--- /dev/null
+++ b/scripts/install_whisper.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Create scripts directory if it doesn't exist
+mkdir -p scripts
+
+# Install Whisper.cpp
+echo "Installing Whisper.cpp..."
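+# NOTE: the Rust server looks for whisper.cpp/build/bin/main (the CMake layout);
+# if the plain 'make' below only produces ./main, build with cmake so that path exists.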
+git clone https://github.com/ggerganov/whisper.cpp.git
+cd whisper.cpp
+
+# Build the project
+make
+
+# Download the base English model
+bash ./models/download-ggml-model.sh base.en
+
+# Create a symbolic link to the whisper executable in /usr/local/bin
+echo "Creating symbolic link to whisper executable..."
+sudo ln -sf "$(pwd)/main" /usr/local/bin/whisper
+
+echo "Whisper installation complete!"
+echo "You can now use the 'whisper' command to transcribe audio files."
diff --git a/ui/desktop/package-lock.json b/ui/desktop/package-lock.json
index 919582bfb..5614226fb 100644
--- a/ui/desktop/package-lock.json
+++ b/ui/desktop/package-lock.json
@@ -37,7 +37,8 @@
         "react-router-dom": "^6.28.0",
         "react-syntax-highlighter": "^15.6.1",
         "tailwind-merge": "^2.5.4",
-        "tailwindcss-animate": "^1.0.7"
+        "tailwindcss-animate": "^1.0.7",
+        "wavesurfer.js": "^7.8.11"
       },
       "devDependencies": {
         "@electron-forge/cli": "^7.5.0",
@@ -14605,6 +14606,12 @@
         }
       }
     },
+    "node_modules/wavesurfer.js": {
+      "version": "7.8.11",
+      "resolved": "https://registry.npmjs.org/wavesurfer.js/-/wavesurfer.js-7.8.11.tgz",
+      "integrity": "sha512-bZs7A0vtTVOhuPoDGOXVevAIm+KVYBGwddjL9AeOS7kp/oPcVH9hQWQyR2rBAAfN6s0BKI+EdPEalkNaOmkA6A==",
+      "license": "BSD-3-Clause"
+    },
     "node_modules/wcwidth": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/wcwidth/-/wcwidth-1.0.1.tgz",
diff --git a/ui/desktop/package.json b/ui/desktop/package.json
index 249c10234..9a6a17d3e 100644
--- a/ui/desktop/package.json
+++ b/ui/desktop/package.json
@@ -75,6 +75,7 @@
     "react-router-dom": "^6.28.0",
     "react-syntax-highlighter": "^15.6.1",
     "tailwind-merge": "^2.5.4",
-    "tailwindcss-animate": "^1.0.7"
+    "tailwindcss-animate": "^1.0.7",
+    "wavesurfer.js": "^7.8.11"
   }
-}
\ No newline at end of file
+}
diff --git a/ui/desktop/src/components/AudioRecorder.tsx b/ui/desktop/src/components/AudioRecorder.tsx
new file mode 100644
index 000000000..a07195498
--- /dev/null
+++ b/ui/desktop/src/components/AudioRecorder.tsx
@@ -0,0 +1,180 @@
+import React, { useState, useRef, useEffect, useCallback } from 'react';
+import { Button } from './ui/button';
+import { Mic, Square } from 'lucide-react';
+import { getApiUrl } from "../config";
+import WaveSurfer from 'wavesurfer.js';
+import RecordPlugin from 'wavesurfer.js/dist/plugins/record.esm.js';
+declare class Blob{}
+declare class FormData{}
+
+// Separate button component
+export const AudioButton = ({
+  isRecording,
+  onClick,
+}: {
+  isRecording: boolean;
+  onClick: () => void;
+}) => (
+  <Button onClick={onClick}>
+    {isRecording ? <Square /> : <Mic />}
+  </Button>
+);
+
+// Separate waveform component with its own state management
+export const AudioWaveform = React.forwardRef<
+  HTMLDivElement,
+  {
+    isRecording: boolean;
+    onRecordEnd?: (blob: Blob) => void;
+    className?: string;
+  }
+>(({ isRecording, onRecordEnd, className = '' }, ref) => {
+  const wavesurferRef = useRef<WaveSurfer | null>(null);
+  const recordPluginRef = useRef<RecordPlugin | null>(null);
+  const [progress, setProgress] = useState('00:00');
+
+  const handleRecordProgress = useCallback((time: number) => {
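+    // 'record-progress' reports elapsed time in milliseconds; format it as MM:SS.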
+    const minutes = Math.floor((time % 3600000) / 60000);
+    const seconds = Math.floor((time % 60000) / 1000);
+    const formattedTime = [minutes, seconds]
+      .map(v => v < 10 ? '0' + v : v)
+      .join(':');
+    setProgress(formattedTime);
+  }, []);
+
+  useEffect(() => {
+    const container = ref as React.RefObject<HTMLDivElement>;
+    if (!container.current) return;
+
+    const wavesurfer = WaveSurfer.create({
+      container: container.current,
+      waveColor: 'rgb(99, 102, 241)', // Indigo-600
+      progressColor: 'rgb(79, 70, 229)', // Indigo-700
+      height: 26,
+      barWidth: 2,
+      barGap: 1,
+      barRadius: 1,
+      normalize: true,
+      minPxPerSec: 50, // Increase this value to make the waveform wider
+    });
+
+    const recordPlugin = wavesurfer.registerPlugin(
+      RecordPlugin.create({
+        renderRecordedAudio: false,
+        scrollingWaveform: false,
+        continuousWaveform: true,
+        continuousWaveformDuration: 30,
+      })
+    );
+
+    if (onRecordEnd) {
+      recordPlugin.on('record-end', onRecordEnd);
+    }
+    recordPlugin.on('record-progress', handleRecordProgress);
+
+    wavesurferRef.current = wavesurfer;
+    recordPluginRef.current = recordPlugin;
+
+    return () => {
+      wavesurfer.destroy();
+      wavesurferRef.current = null;
+      recordPluginRef.current = null;
+    };
+  }, [ref, onRecordEnd, handleRecordProgress]);
+
+  useEffect(() => {
+    const recordPlugin = recordPluginRef.current;
+    if (!recordPlugin) return;
+
+    const handleRecording = async () => {
+      if (isRecording) {
+        try {
+          await recordPlugin.startRecording();
+        } catch (err) {
+          console.error('Failed to start recording:', err);
+        }
+      } else {
+        try {
+          if (recordPlugin.isRecording()) {
+            await recordPlugin.stopRecording();
+            setProgress('00:00');
+          }
+        } catch (err) {
+          console.error('Failed to stop recording:', err);
+        }
+      }
+    };
+
+    handleRecording();
+  }, [isRecording]);
+
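+  // The forwarded ref points at the container div below; WaveSurfer draws into it.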
+  return (
+    <div className={`flex items-center gap-2 ${className}`}>
+      <div ref={ref} className="flex-1" />
+      <span className="text-xs">{progress}</span>
+    </div>
+  );
+});
+
+AudioWaveform.displayName = 'AudioWaveform';
+
+// Main AudioRecorder component that combines both
+export function AudioRecorder({ onTranscription, containerClassName }: {
+  onTranscription: (text: string) => void;
+  containerClassName?: string;
+}) {
+  const [isRecording, setIsRecording] = useState(false);
+  const micContainerRef = useRef<HTMLDivElement>(null);
+
+  const handleRecordEnd = useCallback(async (blob: Blob) => {
+    try {
+      console.log('Recording completed, size:', blob.size, 'type:', blob.type);
+      const formData = new FormData();
+      formData.append('audio', blob, 'audio.webm');
+
+      const response = await fetch(getApiUrl('/transcribe'), {
+        method: 'POST',
+        body: formData,
+      });
+
+      if (!response.ok) {
+        throw new Error('Transcription failed');
+      }
+
+      const result = await response.json();
+      console.log('Received response:', result);
+      if (result.success) {
+        onTranscription(result.text);
+      } else {
+        console.error('Transcription error:', result.error);
+      }
+    } catch (err) {
+      console.error('Transcription error:', err);
+    }
+  }, [onTranscription]);
+
+  const handleToggleRecording = useCallback(() => {
+    setIsRecording(prev => !prev);
+  }, []);
+
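+  // Compose the two pieces; this component owns the recording state.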
+  return (
+    <div className={containerClassName}>
+      <AudioWaveform
+        ref={micContainerRef}
+        isRecording={isRecording}
+        onRecordEnd={handleRecordEnd}
+      />
+      <AudioButton isRecording={isRecording} onClick={handleToggleRecording} />
+    </div>
+  );
+}
diff --git a/ui/desktop/src/components/Input.tsx b/ui/desktop/src/components/Input.tsx
index c189059c1..f5621c04f 100644
--- a/ui/desktop/src/components/Input.tsx
+++ b/ui/desktop/src/components/Input.tsx
@@ -3,6 +3,10 @@
 import { Button } from './ui/button';
 import Send from './ui/Send';
 import Stop from './ui/Stop';
 import { Paperclip } from 'lucide-react';
+import { getApiUrl } from "../config";
+import { AudioButton, AudioWaveform } from './AudioRecorder';
+declare class Blob{}
+declare class FormData{}
 
 interface InputProps {
   handleSubmit: (e: React.FormEvent) => void;
@@ -26,7 +30,10 @@ export default function Input({
   onStop
 }: InputProps) {
   const [value, setValue] = useState('');
+  const [isRecording, setIsRecording] = useState(false);
+  const [isTranscribing, setIsTranscribing] = useState(false);
   const textAreaRef = useRef<HTMLTextAreaElement>(null);
+  const waveformRef = useRef<HTMLDivElement>(null);
 
   useEffect(() => {
     if (textAreaRef.current && !disabled) {
@@ -80,39 +87,85 @@
     }
   };
 
+  const handleRecordEnd = async (blob: Blob) => {
+    try {
+      setIsTranscribing(true);
+      console.log('Recording completed, size:', blob.size, 'type:', blob.type);
+      const formData = new FormData();
+      formData.append('audio', blob, 'audio.webm');
+
+      const response = await fetch(getApiUrl('/transcribe'), {
+        method: 'POST',
+        body: formData,
+      });
+
+      if (!response.ok) {
+        throw new Error('Transcription failed');
+      }
+
+      const result = await response.json();
+      console.log('Received response:', result);
+      if (result.success) {
+        setValue(result.text);
+        textAreaRef.current?.focus();
+      } else {
+        console.error('Transcription error:', result.error);
+      }
+    } catch (err) {
+      console.error('Transcription error:', err);
+    } finally {
+      setIsTranscribing(false);
+    }
+  };
+
   return (
-