From 3d4508fd756a3755a8a3e1d3c1b82f45d930688c Mon Sep 17 00:00:00 2001 From: luxuhui Date: Fri, 4 Oct 2024 01:26:41 +0800 Subject: [PATCH] feat: fix chunk spliting and chat message --- realtime_agent/agent.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/realtime_agent/agent.py b/realtime_agent/agent.py index d500a0a..76e6391 100644 --- a/realtime_agent/agent.py +++ b/realtime_agent/agent.py @@ -11,7 +11,7 @@ from agora_realtime_ai_api.rtc import Channel, ChatMessage, RtcEngine, RtcOptions from .logger import setup_logger -from .realtime.struct import InputAudioBufferCommitted, InputAudioBufferSpeechStarted, InputAudioBufferSpeechStopped, ItemCreated, RateLimitsUpdated, ResponseAudioDelta, ResponseAudioDone, ResponseAudioTranscriptDelta, ResponseAudioTranscriptDone, ResponseContentPartAdded, ResponseContentPartDone, ResponseCreated, ResponseDone, ResponseOutputItemAdded, ResponseOutputItemDone, ServerVADUpdateParams, SessionUpdate, SessionUpdateParams, SessionUpdated, Voices, to_json +from .realtime.struct import InputAudioBufferCommitted, InputAudioBufferSpeechStarted, InputAudioBufferSpeechStopped, InputAudioTranscription, ItemCreated, ItemInputAudioTranscriptionCompleted, RateLimitsUpdated, ResponseAudioDelta, ResponseAudioDone, ResponseAudioTranscriptDelta, ResponseAudioTranscriptDone, ResponseContentPartAdded, ResponseContentPartDone, ResponseCreated, ResponseDone, ResponseOutputItemAdded, ResponseOutputItemDone, ServerVADUpdateParams, SessionUpdate, SessionUpdateParams, SessionUpdated, Voices, to_json from .realtime.connection import RealtimeApiConnection from .tools import ClientToolCallResponse, ToolContext from .utils import PCMWriter @@ -102,6 +102,7 @@ async def setup_and_run_agent( modalities=["text", "audio"], temperature=0.8, max_response_output_tokens="inf", + input_audio_transcription=InputAudioTranscription(model="whisper-1") ) ) ) @@ -242,7 +243,7 @@ async def _process_model_messages(self) -> None: # logger.info("Received audio message") self.audio_queue.put_nowait(base64.b64decode(message.delta)) # loop.call_soon_threadsafe(self.audio_queue.put_nowait, base64.b64decode(message.delta)) - logger.info(f"TMS:ResponseAudioDelta: response_id:{message.response_id},item_id: {message.item_id}") + logger.debug(f"TMS:ResponseAudioDelta: response_id:{message.response_id},item_id: {message.item_id}") case ResponseAudioTranscriptDelta(): # logger.info(f"Received text message {message=}") asyncio.create_task(self.channel.chat.send_message( @@ -267,6 +268,13 @@ async def _process_model_messages(self) -> None: case InputAudioBufferSpeechStopped(): logger.info(f"TMS:InputAudioBufferSpeechStopped: item_id: {message.item_id}") pass + case ItemInputAudioTranscriptionCompleted(): + logger.info(f"ItemInputAudioTranscriptionCompleted: {message=}") + asyncio.create_task(self.channel.chat.send_message( + ChatMessage( + message=to_json(message), msg_id=message.item_id + ) + )) # InputAudioBufferCommitted case InputAudioBufferCommitted(): pass