From 825087805ede165ab7e9e0ce4ad129803f62a4e5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 20 Nov 2024 10:25:29 +0100 Subject: [PATCH] WIP Signed-off-by: Ettore Di Giacinto --- core/http/endpoints/openai/realtime.go | 143 ++++++++++--------------- 1 file changed, 54 insertions(+), 89 deletions(-) diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 6634b3ce8e02..c36bad965821 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -462,12 +462,10 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo const ( minMicVolume = 450 sendToVADDelay = time.Second - maxWhisperSegmentDuration = time.Second * 25 + maxWhisperSegmentDuration = time.Second * 15 ) -// Placeholder function to handle VAD (Voice Activity Detection) -// https://github.com/snakers4/silero-vad/tree/master/examples/go -// XXX: use session.ModelInterface for VAD or hook directly VAD runtime here? +// handle VAD (Voice Activity Detection) func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, done chan struct{}) { vadContext, cancel := context.WithCancel(context.Background()) @@ -480,6 +478,7 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, audioDetected := false timeListening := time.Now() + // Implement VAD logic here // For brevity, this is a placeholder // When VAD detects end of speech, generate a response @@ -492,7 +491,54 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, // Check if there's audio data to process session.AudioBufferLock.Lock() - if len(session.InputAudioBuffer) > 16000 { + if len(session.InputAudioBuffer) > 0 { + + if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration { + log.Debug().Msgf("VAD detected speech, but still listening") + // audioDetected = false + // keep listening + session.AudioBufferLock.Unlock() + continue + } + + if audioDetected { + log.Debug().Msgf("VAD detected speech that we can process") + + // Commit the audio buffer as a conversation item + item := &Item{ + ID: generateItemID(), + Object: "realtime.item", + Type: "message", + Status: "completed", + Role: "user", + Content: []ConversationContent{ + { + Type: "input_audio", + Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer), + }, + }, + } + + // Add item to conversation + conversation.Lock.Lock() + conversation.Items = append(conversation.Items, item) + conversation.Lock.Unlock() + + // Reset InputAudioBuffer + session.InputAudioBuffer = nil + session.AudioBufferLock.Unlock() + + // Send item.created event + sendEvent(c, OutgoingMessage{ + Type: "conversation.item.created", + Item: item, + }) + + audioDetected = false + // Generate a response + generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage) + continue + } adata := sound.BytesToInt16sLE(session.InputAudioBuffer) @@ -522,24 +568,6 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, continue } - speechStart, speechEnd := float32(0), float32(0) - - /* - volume := sound.CalculateRMS16(adata) - if volume > minMicVolume { - startListening = time.Now() - } - - if time.Since(startListening) < sendToVADDelay && time.Since(startListening) < maxWhisperSegmentDuration { - log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) - - session.AudioBufferLock.Unlock() - log.Debug().Msg("speech is ongoing") - - continue - } - */ - if len(resp.Segments) == 0 { log.Debug().Msg("VAD detected no speech activity") log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) @@ -553,75 +581,12 @@ func handleVAD(session *Session, conversation *Conversation, c *websocket.Conn, continue } - timeListening = time.Now() - - log.Debug().Msgf("VAD detected %d segments", len(resp.Segments)) - log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) - - speechStart = resp.Segments[0].Start - log.Debug().Msgf("speech starts at %0.2fs", speechStart) - - audioDetected = true - - for _, s := range resp.Segments { - if s.End > 0 { - log.Debug().Msgf("speech ends at %0.2fs", s.End) - speechEnd = s.End - audioDetected = false - } - } - - if speechEnd == 0 { - log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer)) - - session.AudioBufferLock.Unlock() - log.Debug().Msg("speech is ongoing, no end found ?") - continue + if !audioDetected { + timeListening = time.Now() } + audioDetected = true - // Handle when input is too long without a voice activity (reset the buffer) - if speechStart == 0 && speechEnd == 0 { - // log.Debug().Msg("VAD detected no speech activity") - session.InputAudioBuffer = nil - session.AudioBufferLock.Unlock() - continue - } - - // TODO: Shall we cut the audio from speechStart and SpeechEnd? - log.Debug().Msgf("VAD detected Start speech at: %0.2fs, End speech at: %0.2fs", speechStart, speechEnd) - - // Commit the audio buffer as a conversation item - item := &Item{ - ID: generateItemID(), - Object: "realtime.item", - Type: "message", - Status: "completed", - Role: "user", - Content: []ConversationContent{ - { - Type: "input_audio", - Audio: base64.StdEncoding.EncodeToString(session.InputAudioBuffer), - }, - }, - } - - // Add item to conversation - conversation.Lock.Lock() - conversation.Items = append(conversation.Items, item) - conversation.Lock.Unlock() - - // Reset InputAudioBuffer - session.InputAudioBuffer = nil session.AudioBufferLock.Unlock() - - // Send item.created event - sendEvent(c, OutgoingMessage{ - Type: "conversation.item.created", - Item: item, - }) - - // Generate a response - generateResponse(session, conversation, ResponseCreate{}, c, websocket.TextMessage) } else { session.AudioBufferLock.Unlock() }