Skip to content

Commit

Permalink
Whisper-live-transcript (#1099)
Browse files Browse the repository at this point in the history
* refactor whisper

* more refactoring

* more refactoring

* reduce reused code

* reduce reused code

* make deepgram urls available throughout mod

* refactor whisper processing

* update stt

* add realtime audio settings

* cleanup errors...

* create SSE endpoint

* deepgram live transcription

* fixes and deepgram api_key

* live transcription with whisper

* only no speech detected from decoder with debug

* reduce logs from decoder unless debug
  • Loading branch information
EzraEllette authored Jan 8, 2025
1 parent ae1ab1e commit 480a3f0
Show file tree
Hide file tree
Showing 31 changed files with 1,305 additions and 718 deletions.
1 change: 0 additions & 1 deletion screenpipe-actions/src/run.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use crate::type_and_animate::{delete_characters, type_slowly, EnigoCommand};
use crate::{call_ai, run_keystroke_monitor, KeystrokeCommand};
use reqwest;
use std::path::Path;
use std::string::ToString;
use std::sync::atomic::{AtomicBool, Ordering};
Expand Down
2 changes: 1 addition & 1 deletion screenpipe-actions/src/type_and_animate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ pub struct EnigoResponse {
}

thread_local! {
static ENIGO: RefCell<Option<Enigo>> = RefCell::new(None);
static ENIGO: RefCell<Option<Enigo>> = const { RefCell::new(None) };
}

fn with_enigo<F, R>(f: F) -> R
Expand Down
3 changes: 3 additions & 0 deletions screenpipe-audio/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ ndarray = "0.16"
ort = "=2.0.0-rc.6"
knf-rs = { git = "https://github.com/Neptune650/knf-rs.git" }
ort-sys = "=2.0.0-rc.8"
futures = "0.3.31"
deepgram = "0.6.4"
bytes = { version = "1.9.0", features = ["serde"] }

[target.'cfg(target_os = "windows")'.dependencies]
ort = { version = "=2.0.0-rc.6", features = [
Expand Down
2 changes: 1 addition & 1 deletion screenpipe-audio/examples/stt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ async fn main() {
&segmentation_model_path,
embedding_manager,
embedding_extractor,
"default",
&audio_input.device.to_string(),
)
.await
.unwrap();
Expand Down
60 changes: 60 additions & 0 deletions screenpipe-audio/src/audio_processing.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
use anyhow::Result;
use chrono::Utc;
use log::debug;
use realfft::num_complex::{Complex32, ComplexFloat};
use realfft::RealFftPlanner;
use rubato::{
Resampler, SincFixedIn, SincInterpolationParameters, SincInterpolationType, WindowFunction,
};
use std::path::PathBuf;

use crate::encode_single_audio;

pub fn normalize_v2(audio: &[f32]) -> Vec<f32> {
let rms = (audio.iter().map(|&x| x * x).sum::<f32>() / audio.len() as f32).sqrt();
Expand Down Expand Up @@ -101,3 +109,55 @@ pub fn audio_to_mono(audio: &[f32], channels: u16) -> Vec<f32> {

mono_samples
}

pub fn resample(input: &[f32], from_sample_rate: u32, to_sample_rate: u32) -> Result<Vec<f32>> {
debug!("Resampling audio");
let params = SincInterpolationParameters {
sinc_len: 256,
f_cutoff: 0.95,
interpolation: SincInterpolationType::Linear,
oversampling_factor: 256,
window: WindowFunction::BlackmanHarris2,
};

let mut resampler = SincFixedIn::<f32>::new(
to_sample_rate as f64 / from_sample_rate as f64,
2.0,
params,
input.len(),
1,
)?;

let waves_in = vec![input.to_vec()];
debug!("Performing resampling");
let waves_out = resampler.process(&waves_in, None)?;
debug!("Resampling complete");
Ok(waves_out.into_iter().next().unwrap())
}

pub fn write_audio_to_file(
audio: &[f32],
sample_rate: u32,
output_path: &PathBuf,
device: &str,
skip_encoding: bool,
) -> Result<String> {
let new_file_name = Utc::now().format("%Y-%m-%d_%H-%M-%S").to_string();
let sanitized_device_name = device.replace(['/', '\\'], "_");
let file_path = PathBuf::from(output_path)
.join(format!("{}_{}.mp4", sanitized_device_name, new_file_name))
.to_str()
.expect("Failed to create valid path")
.to_string();
let file_path_clone = file_path.clone();
// Run FFmpeg in a separate task
if !skip_encoding {
encode_single_audio(
bytemuck::cast_slice(audio),
sample_rate,
1,
&file_path.into(),
)?;
}
Ok(file_path_clone)
}
1 change: 1 addition & 0 deletions screenpipe-audio/src/bin/screenpipe-audio-forever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ async fn main() -> Result<()> {
}
}
})
.await
}
})
.collect();
Expand Down
40 changes: 40 additions & 0 deletions screenpipe-audio/src/core.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
use crate::audio_processing::audio_to_mono;
use crate::realtime::{realtime_stt, RealtimeTranscriptionEvent};
use crate::AudioInput;
use anyhow::{anyhow, Result};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use cpal::StreamError;
use lazy_static::lazy_static;
use log::{debug, error, info, warn};
use screenpipe_core::Language;
use serde::{Deserialize, Serialize};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::mpsc;
Expand Down Expand Up @@ -200,6 +202,44 @@ pub async fn record_and_transcribe(
Ok(())
}

pub async fn start_realtime_recording(
audio_stream: Arc<AudioStream>,
realtime_transcription_engine: Arc<AudioTranscriptionEngine>,
languages: Vec<Language>,
is_running: Arc<AtomicBool>,
realtime_transcription_sender: Arc<tokio::sync::broadcast::Sender<RealtimeTranscriptionEvent>>,
deepgram_api_key: Option<String>,
) -> Result<()> {
while is_running.load(Ordering::Relaxed) {
match realtime_stt(
audio_stream.clone(),
realtime_transcription_engine.clone(),
languages.clone(),
realtime_transcription_sender.clone(),
is_running.clone(),
deepgram_api_key.clone(),
)
.await
{
Ok(_) => {
// Normal shutdown
break;
}
Err(e) => {
if !is_running.load(Ordering::Relaxed) {
// Normal shutdown
break;
}

error!("realtime_stt error, restarting: {}", e);
// Add a small delay before restarting to prevent rapid restart loops
tokio::time::sleep(Duration::from_secs(1)).await;
}
}
}
Ok(())
}

async fn run_record_and_transcribe(
audio_stream: Arc<AudioStream>,
duration: Duration,
Expand Down
16 changes: 16 additions & 0 deletions screenpipe-audio/src/deepgram/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
mod process_chunk;
mod realtime;
use lazy_static::lazy_static;
pub use realtime::stream_transcription_deepgram;
use std::env;

lazy_static! {
    /// HTTP endpoint used for chunk transcription requests. Taken from the
    /// `DEEPGRAM_API_URL` environment variable, falling back to Deepgram's
    /// public `listen` endpoint when the variable cannot be read.
    pub(crate) static ref DEEPGRAM_API_URL: String = env::var("DEEPGRAM_API_URL")
        .unwrap_or_else(|_| String::from("https://api.deepgram.com/v1/listen"));

    /// WebSocket endpoint. Taken from the `DEEPGRAM_WEBSOCKET_URL` environment
    /// variable, falling back to Deepgram's public `listen` endpoint when the
    /// variable cannot be read.
    pub(crate) static ref DEEPGRAM_WEBSOCKET_URL: String = env::var("DEEPGRAM_WEBSOCKET_URL")
        .unwrap_or_else(|_| String::from("wss://api.deepgram.com/v1/listen"));

    /// Token from the `CUSTOM_DEEPGRAM_API_TOKEN` environment variable; an
    /// empty string when the variable cannot be read.
    pub(crate) static ref CUSTOM_DEEPGRAM_API_TOKEN: String =
        env::var("CUSTOM_DEEPGRAM_API_TOKEN").unwrap_or_default();
}

pub use process_chunk::transcribe_with_deepgram;
134 changes: 134 additions & 0 deletions screenpipe-audio/src/deepgram/process_chunk.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
use anyhow::Result;
use hound::{WavSpec, WavWriter};
use log::{debug, error, info};
use reqwest::Client;
use screenpipe_core::Language;
use serde_json::Value;
use std::io::Cursor;

use crate::deepgram::{CUSTOM_DEEPGRAM_API_TOKEN, DEEPGRAM_API_URL};

/// Sends `audio_data` to the Deepgram HTTP endpoint as an in-memory mono
/// 32-bit float WAV and returns the first transcript in the response.
///
/// * `api_key` - Deepgram key, used when `CUSTOM_DEEPGRAM_API_TOKEN` is empty.
/// * `audio_data` - mono samples written verbatim into the WAV body.
/// * `device` - device name, used only in log messages.
/// * `sample_rate` - rate recorded in the WAV header (see the note in the body
///   about 88200 Hz).
/// * `languages` - each entry is added to the query as a `detect_language`
///   parameter.
///
/// Returns the transcript text, or an empty string when the response contains
/// no transcript at the expected location.
///
/// # Errors
///
/// Returns an error if the WAV cannot be built, the request cannot be sent,
/// the response body is not valid JSON, or the JSON contains an `err_code`
/// field.
pub async fn transcribe_with_deepgram(
    api_key: &str,
    audio_data: &[f32],
    device: &str,
    sample_rate: u32,
    languages: Vec<Language>,
) -> Result<String> {
    debug!("starting deepgram transcription");
    let client = Client::new();

    // Use token from env var
    let custom_api_key = CUSTOM_DEEPGRAM_API_TOKEN.as_str();

    // Create a WAV file in memory
    let mut cursor = Cursor::new(Vec::new());
    {
        let spec = WavSpec {
            channels: 1,
            // NOTE(review): an 88200 Hz input is labelled as 16000 Hz in the
            // header while the samples are written unchanged, and the query
            // string below always says sample_rate=16000. Confirm this matches
            // what callers actually pass in.
            sample_rate: match sample_rate {
                88200 => 16000,
                _ => sample_rate,
            },
            bits_per_sample: 32,
            sample_format: hound::SampleFormat::Float,
        };
        let mut writer = WavWriter::new(&mut cursor, spec)?;
        for &sample in audio_data {
            writer.write_sample(sample)?;
        }
        writer.finalize()?;
    }

    // Get the WAV data from the cursor
    let wav_data = cursor.into_inner();

    let mut query_params = String::from("model=nova-2&smart_format=true&sample_rate=16000");
    for lang in &languages {
        query_params.push_str(&format!("&detect_language={}", lang.as_lang_code()));
    }

    // rationale: custom api key = custom AI proxy to use deepgram
    // no custom api key = use deepgram api key for real deepgram endpoint
    let is_custom_endpoint = !custom_api_key.is_empty();
    // Use Bearer format when using custom endpoint/proxy
    let authorization = if is_custom_endpoint {
        format!("Bearer {}", custom_api_key)
    } else {
        format!("Token {}", api_key)
    };

    // Never log the key itself: it is a credential and would end up in log files.
    debug!(
        "deepgram auth: using {} api key",
        if is_custom_endpoint { "custom" } else { "provided" }
    );

    let resp = client
        .post(format!("{}?{}", *DEEPGRAM_API_URL, query_params))
        .header("Content-Type", "audio/wav")
        .header("Authorization", authorization)
        .body(wav_data)
        .send()
        .await
        .map_err(|e| {
            error!("Failed to send request to Deepgram API: {:?}", e);
            anyhow::anyhow!("Failed to send request to Deepgram API: {:?}", e)
        })?;
    debug!("received response from deepgram api");

    let result = resp.json::<Value>().await.map_err(|e| {
        error!("Failed to parse JSON response: {:?}", e);
        anyhow::anyhow!("Failed to parse JSON response: {:?}", e)
    })?;
    debug!("successfully parsed json response");

    if let Some(err_code) = result.get("err_code") {
        error!(
            "deepgram api error code: {:?}, result: {:?}",
            err_code, result
        );
        return Err(anyhow::anyhow!("Deepgram API error: {:?}", result));
    }

    // Indexing a `Value` with a missing key/index yields `Null`, so an
    // unexpected response shape falls through to the empty-string default.
    let transcription = result["results"]["channels"][0]["alternatives"][0]["transcript"]
        .as_str()
        .unwrap_or("");

    if transcription.is_empty() {
        info!(
            "device: {}, transcription is empty. full response: {:?}",
            device, result
        );
    } else {
        info!(
            "device: {}, transcription successful. length: {} characters",
            device,
            transcription.len()
        );
    }

    Ok(transcription.to_string())
}
Loading

0 comments on commit 480a3f0

Please sign in to comment.