Whisper-live-transcript (#1099)

* refactor whisper * more refactoring * more refactoring * reduce reused code * reduce reused code * make deepgram urls available throughout mod * refactor whisper processing * update stt * add realtime audio settings * cleanup errors... * create SSE endpoint * deepgram live transcription * fixes and deepgram api_key * live transcription with whisper * only no speech detected from decoder with debug * reduce logs from decoder unless debug
mediar-ai · Jan 8, 2025 · 480a3f0 · 480a3f0
1 parent ae1ab1e
commit 480a3f0
Show file tree

Hide file tree

Showing 31 changed files with 1,305 additions and 718 deletions.
diff --git a/screenpipe-actions/src/run.rs b/screenpipe-actions/src/run.rs
@@ -1,6 +1,5 @@
 use crate::type_and_animate::{delete_characters, type_slowly, EnigoCommand};
 use crate::{call_ai, run_keystroke_monitor, KeystrokeCommand};
-use reqwest;
 use std::path::Path;
 use std::string::ToString;
 use std::sync::atomic::{AtomicBool, Ordering};

diff --git a/screenpipe-actions/src/type_and_animate.rs b/screenpipe-actions/src/type_and_animate.rs
@@ -22,7 +22,7 @@ pub struct EnigoResponse {
 }
 
 thread_local! {
-    static ENIGO: RefCell<Option<Enigo>> = RefCell::new(None);
+    static ENIGO: RefCell<Option<Enigo>> = const { RefCell::new(None) };
 }
 
 fn with_enigo<F, R>(f: F) -> R

diff --git a/screenpipe-audio/Cargo.toml b/screenpipe-audio/Cargo.toml
@@ -76,6 +76,9 @@ ndarray = "0.16"
 ort = "=2.0.0-rc.6"
 knf-rs = { git = "https://github.com/Neptune650/knf-rs.git" }
 ort-sys = "=2.0.0-rc.8"
+futures = "0.3.31"
+deepgram = "0.6.4"
+bytes = { version = "1.9.0", features = ["serde"] }
 
 [target.'cfg(target_os = "windows")'.dependencies]
 ort = { version = "=2.0.0-rc.6", features = [

diff --git a/screenpipe-audio/examples/stt.rs b/screenpipe-audio/examples/stt.rs
@@ -95,7 +95,7 @@ async fn main() {
                 &segmentation_model_path,
                 embedding_manager,
                 embedding_extractor,
-                "default",
+                &audio_input.device.to_string(),
             )
             .await
             .unwrap();

diff --git a/screenpipe-audio/src/audio_processing.rs b/screenpipe-audio/src/audio_processing.rs
@@ -1,6 +1,14 @@
 use anyhow::Result;
+use chrono::Utc;
+use log::debug;
 use realfft::num_complex::{Complex32, ComplexFloat};
 use realfft::RealFftPlanner;
+use rubato::{
+    Resampler, SincFixedIn, SincInterpolationParameters, SincInterpolationType, WindowFunction,
+};
+use std::path::PathBuf;
+
+use crate::encode_single_audio;
 
 pub fn normalize_v2(audio: &[f32]) -> Vec<f32> {
     let rms = (audio.iter().map(|&x| x * x).sum::<f32>() / audio.len() as f32).sqrt();
@@ -101,3 +109,55 @@ pub fn audio_to_mono(audio: &[f32], channels: u16) -> Vec<f32> {
 
     mono_samples
 }
+
+/// Resamples a single-channel `f32` buffer from `from_sample_rate` to `to_sample_rate`.
+///
+/// The whole input is handed to a rubato `SincFixedIn` resampler as one chunk
+/// (chunk size = `input.len()`, one channel), and the first output channel is
+/// returned.
+///
+/// # Errors
+///
+/// Returns an error if the resampler cannot be constructed, if processing
+/// fails, or if the resampler yields no output channel.
+pub fn resample(input: &[f32], from_sample_rate: u32, to_sample_rate: u32) -> Result<Vec<f32>> {
+    debug!("Resampling audio");
+    let params = SincInterpolationParameters {
+        sinc_len: 256,
+        f_cutoff: 0.95,
+        interpolation: SincInterpolationType::Linear,
+        oversampling_factor: 256,
+        window: WindowFunction::BlackmanHarris2,
+    };
+
+    let resample_ratio = to_sample_rate as f64 / from_sample_rate as f64;
+    // NOTE(review): `2.0` is presumably rubato's maximum relative resample ratio —
+    // confirm against the pinned rubato version.
+    let mut resampler = SincFixedIn::<f32>::new(resample_ratio, 2.0, params, input.len(), 1)?;
+
+    // The resampler takes one Vec per channel; there is exactly one channel here.
+    let waves_in = vec![input.to_vec()];
+    debug!("Performing resampling");
+    let waves_out = resampler.process(&waves_in, None)?;
+    debug!("Resampling complete");
+
+    // Surface a missing output channel as an error instead of panicking.
+    waves_out
+        .into_iter()
+        .next()
+        .ok_or_else(|| anyhow::anyhow!("resampler returned no output channels"))
+}
+
+/// Builds a timestamped `.mp4` path for `device` under `output_path` and, unless
+/// `skip_encoding` is set, encodes `audio` to that path via `encode_single_audio`.
+///
+/// The file name is `<device>_<timestamp>.mp4`, where `/` and `\` in the device
+/// name are replaced with `_` and the timestamp is the current UTC time formatted
+/// as `%Y-%m-%d_%H-%M-%S`.
+///
+/// Returns the path as a `String`, whether or not the file was encoded.
+///
+/// # Errors
+///
+/// Returns an error if the resulting path is not valid UTF-8 or if
+/// `encode_single_audio` fails.
+pub fn write_audio_to_file(
+    audio: &[f32],
+    sample_rate: u32,
+    output_path: &PathBuf,
+    device: &str,
+    skip_encoding: bool,
+) -> Result<String> {
+    let new_file_name = Utc::now().format("%Y-%m-%d_%H-%M-%S").to_string();
+    let sanitized_device_name = device.replace(['/', '\\'], "_");
+    let file_path = output_path
+        .join(format!("{}_{}.mp4", sanitized_device_name, new_file_name))
+        .to_str()
+        // A non-UTF-8 output directory is an environment problem, not a bug:
+        // report it to the caller rather than panicking.
+        .ok_or_else(|| anyhow::anyhow!("Failed to create valid path"))?
+        .to_string();
+
+    // The encoder is called inline and its error is propagated to the caller.
+    if !skip_encoding {
+        encode_single_audio(
+            // The samples are handed over as their raw byte representation.
+            bytemuck::cast_slice(audio),
+            sample_rate,
+            1,
+            &file_path.clone().into(),
+        )?;
+    }
+    Ok(file_path)
+}
diff --git a/screenpipe-audio/src/bin/screenpipe-audio-forever.rs b/screenpipe-audio/src/bin/screenpipe-audio-forever.rs
@@ -130,6 +130,7 @@ async fn main() -> Result<()> {
                         }
                     }
                 })
+                .await
             }
         })
         .collect();

diff --git a/screenpipe-audio/src/core.rs b/screenpipe-audio/src/core.rs
@@ -1,10 +1,12 @@
 use crate::audio_processing::audio_to_mono;
+use crate::realtime::{realtime_stt, RealtimeTranscriptionEvent};
 use crate::AudioInput;
 use anyhow::{anyhow, Result};
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
 use cpal::StreamError;
 use lazy_static::lazy_static;
 use log::{debug, error, info, warn};
+use screenpipe_core::Language;
 use serde::{Deserialize, Serialize};
 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::mpsc;
@@ -200,6 +202,44 @@ pub async fn record_and_transcribe(
     Ok(())
 }
 
+/// Runs `realtime_stt` in a restart loop for as long as `is_running` is set.
+///
+/// Each iteration passes clones of the shared handles to `realtime_stt`. A
+/// successful return ends the loop. An error ends the loop only if `is_running`
+/// has been cleared in the meantime; otherwise the error is logged and the call
+/// is retried after a one-second pause.
+///
+/// Always returns `Ok(())`: errors from `realtime_stt` are logged, not propagated.
+pub async fn start_realtime_recording(
+    audio_stream: Arc<AudioStream>,
+    realtime_transcription_engine: Arc<AudioTranscriptionEngine>,
+    languages: Vec<Language>,
+    is_running: Arc<AtomicBool>,
+    realtime_transcription_sender: Arc<tokio::sync::broadcast::Sender<RealtimeTranscriptionEvent>>,
+    deepgram_api_key: Option<String>,
+) -> Result<()> {
+    loop {
+        if !is_running.load(Ordering::Relaxed) {
+            break;
+        }
+
+        let outcome = realtime_stt(
+            audio_stream.clone(),
+            realtime_transcription_engine.clone(),
+            languages.clone(),
+            realtime_transcription_sender.clone(),
+            is_running.clone(),
+            deepgram_api_key.clone(),
+        )
+        .await;
+
+        let err = match outcome {
+            // Normal shutdown
+            Ok(_) => break,
+            Err(err) => err,
+        };
+
+        // An error seen after the run flag was cleared counts as a normal shutdown.
+        if !is_running.load(Ordering::Relaxed) {
+            break;
+        }
+
+        error!("realtime_stt error, restarting: {}", err);
+        // Pause briefly so a persistent failure does not become a tight restart loop.
+        tokio::time::sleep(Duration::from_secs(1)).await;
+    }
+
+    Ok(())
+}
+
 async fn run_record_and_transcribe(
     audio_stream: Arc<AudioStream>,
     duration: Duration,

diff --git a/screenpipe-audio/src/deepgram/mod.rs b/screenpipe-audio/src/deepgram/mod.rs
@@ -0,0 +1,16 @@
+mod process_chunk;
+mod realtime;
+use lazy_static::lazy_static;
+pub use realtime::stream_transcription_deepgram;
+use std::env;
+
+lazy_static! {
+    /// HTTP endpoint used for chunk transcription; taken from the
+    /// `DEEPGRAM_API_URL` environment variable when it is set.
+    pub(crate) static ref DEEPGRAM_API_URL: String = env::var("DEEPGRAM_API_URL")
+        .unwrap_or_else(|_| "https://api.deepgram.com/v1/listen".to_string());
+    /// WebSocket endpoint; taken from the `DEEPGRAM_WEBSOCKET_URL` environment
+    /// variable when it is set.
+    pub(crate) static ref DEEPGRAM_WEBSOCKET_URL: String = env::var("DEEPGRAM_WEBSOCKET_URL")
+        .unwrap_or_else(|_| "wss://api.deepgram.com/v1/listen".to_string());
+    /// Token from the `CUSTOM_DEEPGRAM_API_TOKEN` environment variable; empty
+    /// when the variable cannot be read.
+    pub(crate) static ref CUSTOM_DEEPGRAM_API_TOKEN: String =
+        env::var("CUSTOM_DEEPGRAM_API_TOKEN").unwrap_or_default();
+}
+
+pub use process_chunk::transcribe_with_deepgram;
diff --git a/screenpipe-audio/src/deepgram/process_chunk.rs b/screenpipe-audio/src/deepgram/process_chunk.rs
@@ -0,0 +1,134 @@
+use anyhow::Result;
+use hound::{WavSpec, WavWriter};
+use log::{debug, error, info};
+use reqwest::Client;
+use screenpipe_core::Language;
+use serde_json::Value;
+use std::io::Cursor;
+
+use crate::deepgram::{CUSTOM_DEEPGRAM_API_TOKEN, DEEPGRAM_API_URL};
+
+/// Sends `audio_data` to the configured Deepgram HTTP endpoint as an in-memory
+/// WAV file and returns the first transcript found in the response.
+///
+/// * `api_key` - credential used (with the `Token` scheme) when
+///   `CUSTOM_DEEPGRAM_API_TOKEN` is empty; otherwise the custom token is sent
+///   with the `Bearer` scheme instead.
+/// * `audio_data` - samples written to the WAV as single-channel 32-bit float.
+/// * `device` - only used in log messages.
+/// * `sample_rate` - written to the WAV header (88200 is written as 16000).
+/// * `languages` - each entry adds a `detect_language=<code>` query parameter.
+///
+/// Returns an empty string when the response carries no transcript.
+///
+/// # Errors
+///
+/// Returns an error if the WAV cannot be written, the request fails, the
+/// response is not valid JSON, or the response contains an `err_code` field.
+pub async fn transcribe_with_deepgram(
+    api_key: &str,
+    audio_data: &[f32],
+    device: &str,
+    sample_rate: u32,
+    languages: Vec<Language>,
+) -> Result<String> {
+    debug!("starting deepgram transcription");
+    let client = Client::new();
+
+    // Use token from env var
+    let custom_api_key = CUSTOM_DEEPGRAM_API_TOKEN.as_str();
+
+    // Create a WAV file in memory
+    let mut cursor = Cursor::new(Vec::new());
+    {
+        let spec = WavSpec {
+            channels: 1,
+            sample_rate: match sample_rate {
+                88200 => 16000,   // Deepgram expects 16kHz for 88.2kHz
+                _ => sample_rate, // Fallback for other sample rates
+            },
+            bits_per_sample: 32,
+            sample_format: hound::SampleFormat::Float,
+        };
+        let mut writer = WavWriter::new(&mut cursor, spec)?;
+        for &sample in audio_data {
+            writer.write_sample(sample)?;
+        }
+        writer.finalize()?;
+    }
+
+    // Get the WAV data from the cursor
+    let wav_data = cursor.into_inner();
+
+    // NOTE(review): the query always states sample_rate=16000 while the WAV header
+    // carries the rate chosen above — confirm which one the endpoint honors.
+    let mut query_params = String::from("model=nova-2&smart_format=true&sample_rate=16000");
+    for lang in &languages {
+        query_params.push_str(&format!("&detect_language={}", lang.as_lang_code()));
+    }
+
+    // rationale: custom api key = custom AI proxy to use deepgram
+    // no custom api key = use deepgram api key for real deepgram endpoint
+    let is_custom_endpoint = !custom_api_key.is_empty();
+    // Use Bearer format when using custom endpoint/proxy
+    let authorization = if is_custom_endpoint {
+        format!("Bearer {}", custom_api_key)
+    } else {
+        format!("Token {}", api_key)
+    };
+
+    // Log only which credential is in use, never the secret itself.
+    debug!(
+        "deepgram auth scheme: {}",
+        if is_custom_endpoint {
+            "Bearer (custom token)"
+        } else {
+            "Token (api key)"
+        }
+    );
+
+    let resp = client
+        .post(format!("{}?{}", *DEEPGRAM_API_URL, query_params))
+        .header("Content-Type", "audio/wav")
+        .header("Authorization", authorization)
+        .body(wav_data)
+        .send()
+        .await
+        .map_err(|e| {
+            error!("Failed to send request to Deepgram API: {:?}", e);
+            anyhow::anyhow!("Failed to send request to Deepgram API: {:?}", e)
+        })?;
+    debug!("received response from deepgram api");
+
+    let result = resp.json::<Value>().await.map_err(|e| {
+        error!("Failed to parse JSON response: {:?}", e);
+        anyhow::anyhow!("Failed to parse JSON response: {:?}", e)
+    })?;
+    debug!("successfully parsed json response");
+
+    // An `err_code` field in the body is treated as an API-level failure.
+    if let Some(err_code) = result.get("err_code") {
+        error!(
+            "deepgram api error code: {:?}, result: {:?}",
+            err_code, result
+        );
+        return Err(anyhow::anyhow!("Deepgram API error: {:?}", result));
+    }
+
+    // Indexing a `Value` with a missing key yields `Null`, so an absent
+    // transcript falls through to the empty string.
+    let transcription = result["results"]["channels"][0]["alternatives"][0]["transcript"]
+        .as_str()
+        .unwrap_or("");
+
+    if transcription.is_empty() {
+        info!(
+            "device: {}, transcription is empty. full response: {:?}",
+            device, result
+        );
+    } else {
+        info!(
+            "device: {}, transcription successful. length: {} characters",
+            device,
+            transcription.len()
+        );
+    }
+
+    Ok(transcription.to_string())
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -130,6 +130,7 @@ async fn main() -> Result<()> { @@
                             }
                         }
                     })
+                    .await
                 }
             })
             .collect();
@@ Expand Down @@