-
Notifications
You must be signed in to change notification settings - Fork 751
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* refactor whisper * more refactoring * more refactoring * reduce reused code * reduce reused code * make deepgram urls available throughout mod * refactor whisper processing * update stt * add realtime audio settings * cleanup errors... * create SSE endpoint * deepgram live transcription * fixes and deepgram api_key * live transcription with whisper * only no speech detected from decoder with debug * reduce logs from decoder unless debug
- Loading branch information
1 parent
ae1ab1e
commit 480a3f0
Showing
31 changed files
with
1,305 additions
and
718 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,6 +130,7 @@ async fn main() -> Result<()> { | |
} | ||
} | ||
}) | ||
.await | ||
} | ||
}) | ||
.collect(); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
mod process_chunk; | ||
mod realtime; | ||
use lazy_static::lazy_static; | ||
pub use realtime::stream_transcription_deepgram; | ||
use std::env; | ||
|
||
lazy_static! { | ||
pub(crate) static ref DEEPGRAM_API_URL: String = env::var("DEEPGRAM_API_URL") | ||
.unwrap_or_else(|_| "https://api.deepgram.com/v1/listen".to_string()); | ||
pub(crate) static ref DEEPGRAM_WEBSOCKET_URL: String = env::var("DEEPGRAM_WEBSOCKET_URL") | ||
.unwrap_or_else(|_| "wss://api.deepgram.com/v1/listen".to_string()); | ||
pub(crate) static ref CUSTOM_DEEPGRAM_API_TOKEN: String = | ||
env::var("CUSTOM_DEEPGRAM_API_TOKEN").unwrap_or_else(|_| String::new()); | ||
} | ||
|
||
pub use process_chunk::transcribe_with_deepgram; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
use anyhow::Result; | ||
use hound::{WavSpec, WavWriter}; | ||
use log::{debug, error, info}; | ||
use reqwest::Client; | ||
use screenpipe_core::Language; | ||
use serde_json::Value; | ||
use std::io::Cursor; | ||
|
||
use crate::deepgram::{CUSTOM_DEEPGRAM_API_TOKEN, DEEPGRAM_API_URL}; | ||
|
||
pub async fn transcribe_with_deepgram( | ||
api_key: &str, | ||
audio_data: &[f32], | ||
device: &str, | ||
sample_rate: u32, | ||
languages: Vec<Language>, | ||
) -> Result<String> { | ||
debug!("starting deepgram transcription"); | ||
let client = Client::new(); | ||
|
||
// Use token from env var | ||
let custom_api_key = CUSTOM_DEEPGRAM_API_TOKEN.as_str(); | ||
|
||
// Create a WAV file in memory | ||
let mut cursor = Cursor::new(Vec::new()); | ||
{ | ||
let spec = WavSpec { | ||
channels: 1, | ||
sample_rate: match sample_rate { | ||
88200 => 16000, // Deepgram expects 16kHz for 88.2kHz | ||
_ => sample_rate, // Fallback for other sample rates | ||
}, | ||
bits_per_sample: 32, | ||
sample_format: hound::SampleFormat::Float, | ||
}; | ||
let mut writer = WavWriter::new(&mut cursor, spec)?; | ||
for &sample in audio_data { | ||
writer.write_sample(sample)?; | ||
} | ||
writer.finalize()?; | ||
} | ||
|
||
// Get the WAV data from the cursor | ||
let wav_data = cursor.into_inner(); | ||
|
||
let mut query_params = String::from("model=nova-2&smart_format=true&sample_rate=16000"); | ||
|
||
if !languages.is_empty() { | ||
query_params = [ | ||
query_params, | ||
"&".into(), | ||
languages | ||
.iter() | ||
.map(|lang| format!("detect_language={}", lang.as_lang_code())) | ||
.collect::<Vec<String>>() | ||
.join("&"), | ||
] | ||
.concat(); | ||
} | ||
|
||
// rationale: custom api key = custom AI proxy to use deepgram | ||
// no custom api key = use deepgram api key for real deepgram endpoint | ||
let api_key_to_use = if custom_api_key.is_empty() { | ||
api_key | ||
} else { | ||
custom_api_key | ||
}; | ||
let is_custom_endpoint = !custom_api_key.is_empty(); | ||
|
||
debug!("deepgram api key: {}", api_key_to_use); | ||
|
||
let response = client | ||
.post(format!("{}?{}", *DEEPGRAM_API_URL, query_params)) | ||
.header("Content-Type", "audio/wav") | ||
// Use Bearer format when using custom endpoint/proxy | ||
.header( | ||
"Authorization", | ||
if is_custom_endpoint { | ||
format!("Bearer {}", api_key_to_use) | ||
} else { | ||
format!("Token {}", api_key_to_use) | ||
}, | ||
) | ||
.body(wav_data) | ||
.send(); | ||
|
||
match response.await { | ||
Ok(resp) => { | ||
debug!("received response from deepgram api"); | ||
match resp.json::<Value>().await { | ||
Ok(result) => { | ||
debug!("successfully parsed json response"); | ||
if let Some(err_code) = result.get("err_code") { | ||
error!( | ||
"deepgram api error code: {:?}, result: {:?}", | ||
err_code, result | ||
); | ||
return Err(anyhow::anyhow!("Deepgram API error: {:?}", result)); | ||
} | ||
let transcription = result["results"]["channels"][0]["alternatives"][0] | ||
["transcript"] | ||
.as_str() | ||
.unwrap_or(""); | ||
|
||
if transcription.is_empty() { | ||
info!( | ||
"device: {}, transcription is empty. full response: {:?}", | ||
device, result | ||
); | ||
} else { | ||
info!( | ||
"device: {}, transcription successful. length: {} characters", | ||
device, | ||
transcription.len() | ||
); | ||
} | ||
|
||
Ok(transcription.to_string()) | ||
} | ||
Err(e) => { | ||
error!("Failed to parse JSON response: {:?}", e); | ||
Err(anyhow::anyhow!("Failed to parse JSON response: {:?}", e)) | ||
} | ||
} | ||
} | ||
Err(e) => { | ||
error!("Failed to send request to Deepgram API: {:?}", e); | ||
Err(anyhow::anyhow!( | ||
"Failed to send request to Deepgram API: {:?}", | ||
e | ||
)) | ||
} | ||
} | ||
} |
Oops, something went wrong.