From 33f67381d9186cb58ee04ad6196a17c35ebc5cbd Mon Sep 17 00:00:00 2001 From: mint Date: Tue, 31 Oct 2023 09:38:27 +0900 Subject: [PATCH] =?UTF-8?q?=ED=95=9C=EA=B5=AD=EC=96=B4=20STT=20API=20?= =?UTF-8?q?=ED=85=8C=EC=8A=A4=ED=8A=B8,=20=ED=83=90=EC=83=89=20=EB=B0=8F?= =?UTF-8?q?=20server.py=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastapi_backend/server.py | 97 ++++++++++++++++------------ streamlit_frontend/speech_to_text.py | 19 +++--- 2 files changed, 67 insertions(+), 49 deletions(-) diff --git a/fastapi_backend/server.py b/fastapi_backend/server.py index 8564789..ba4b9c5 100644 --- a/fastapi_backend/server.py +++ b/fastapi_backend/server.py @@ -579,49 +579,66 @@ async def transcribe_api_endpoint(client_id: str = Form(...), file: UploadFile = File(...)): # Save temporary audio file - extension = os.path.splitext(file.filename)[1] # Get the file extension - temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension) - audio_file = await file.read() - temp_audio_file.write(audio_file) - temp_audio_file.close() - - wav_path = convert_audio_to_wav(temp_audio_file.name) - - resp_token = requests.post( - 'https://openapi.vito.ai/v1/authenticate', - data={'client_id': client_id, - 'client_secret': client_secret} - ) - resp_token.raise_for_status() - access_token = resp_token.json()['access_token'] - - time.sleep(5) - - config = { - "diarization": { - "use_verification": False - }, - "use_multi_channel": False - } + ################## VITO API ################################# + # Save temporary audio file + # extension = os.path.splitext(file.filename)[1] # Get the file extension + # temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension) + # audio_file = await file.read() + + # temp_audio_file.write(audio_file) + # temp_audio_file.close() + + # print(temp_audio_file) + # wav_path = convert_audio_to_wav(temp_audio_file.name) + # print(wav_path) + + # resp_token = requests.post( + # 'https://openapi.vito.ai/v1/authenticate', + # data={'client_id': client_id, + # 'client_secret': client_secret} + # ) + # resp_token.raise_for_status() + # access_token = resp_token.json()['access_token'] + + # time.sleep(5) + + # config = { + # "diarization": { + # "use_verification": False + # }, + # "use_multi_channel": False + # } + + # resp_api = requests.post( + # 'https://openapi.vito.ai/v1/transcribe', + # headers={'Authorization': 'bearer '+ access_token}, + # data={'config': json.dumps(config)}, + # files={'file': open(wav_path, 'rb')} + # ) + + # resp_api.raise_for_status() + # api_id = resp_api.json()['id'] + + # resp_msg = requests.get( + # 'https://openapi.vito.ai/v1/transcribe/'+ api_id, + # headers={'Authorization': 'bearer '+ access_token}, + # ) + + # transcription = resp_msg.json()['results']['utterances'][0]['msg'] + + ################## VITO API Code End ################################# + + # openai whisper api code 추가 + openai.api_key = api_key - resp_api = requests.post( - 'https://openapi.vito.ai/v1/transcribe', - headers={'Authorization': 'bearer '+ access_token}, - data={'config': json.dumps(config)}, - files={'file': open(wav_path, 'rb')} - ) - - resp_api.raise_for_status() - api_id = resp_api.json()['id'] + audio_file = await file.read() - time.sleep(20) + with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file: + temp_file.write(audio_file) + temp_file_path = temp_file.name - resp_msg = requests.get( - 'https://openapi.vito.ai/v1/transcribe/'+ api_id, - headers={'Authorization': 'bearer '+ access_token}, - ) - - transcription = resp_msg.json()['results']['utterances'][0]['msg'] + with open(temp_file_path, 'rb') as open_audio_file: + transcription = openai.Audio.transcribe(model="whisper-1", file=open_audio_file, response_format="text", language='ko') return {"transcription": transcription} ########### Speech2Text End ############# diff --git a/streamlit_frontend/speech_to_text.py b/streamlit_frontend/speech_to_text.py index 6201ab1..551898b 100644 --- a/streamlit_frontend/speech_to_text.py +++ b/streamlit_frontend/speech_to_text.py @@ -19,7 +19,7 @@ def main(): left_column, right_column = st.columns(2) left_column.caption(translate('warning', st.session_state.ko_en)) - uploaded_file = left_column.file_uploader(translate('choose_audio', st.session_state.ko_en), type=["wav", "mp3", "flac", "ogg"]) + uploaded_file = left_column.file_uploader(translate('choose_audio', st.session_state.ko_en), type=["wav", "mp3", "flac", "ogg", "wma", "aac"]) mode = st.radio(translate('radio', st.session_state.ko_en), (translate('korean', st.session_state.ko_en), translate('english', st.session_state.ko_en)), horizontal=True) @@ -33,36 +33,37 @@ def main(): unsafe_allow_html=True, ) - client_id_input, client_secret_input, _, _ = st.columns(4) + api_key_input, _, _, _ = st.columns(4) - client_id = client_id_input.text_input(translate('Client_ID', st.session_state.ko_en),value="") + api_key = api_key_input.text_input(translate('api_key', st.session_state.ko_en), value="", type='password') - client_secret = client_secret_input.text_input(translate('Client_Secret', st.session_state.ko_en),value="", type='password') + # client_secret = client_secret_input.text_input(translate('Client_Secret', st.session_state.ko_en),value="", type='password') _,_,_,l4_column,_,_,_,_ = st.columns(8) submit_button = l4_column.button(translate('transcribe_button', st.session_state.ko_en), use_container_width=True) if uploaded_file is not None : st.session_state.transcription = None - + right_column.markdown('#') right_column.audio(uploaded_file) - if len(client_id) > 0 and len(client_secret) > 0 : + if len(api_key) > 0 : wav_data = io.BytesIO(uploaded_file.read()) if submit_button: - response = requests.post(f"{BACKEND_URL}/speech_to_text_api", files={"file": wav_data}, data={'client_id': client_id, - 'client_secret': client_secret}) + response = requests.post(f"{BACKEND_URL}/speech_to_text_api", files={"file": wav_data}, data={'api_key': api_key}) if response.status_code != 200: st.error(response.status_code) else: transcription = response.json()["transcription"] + st.session_state.transcription = ''.join(transcription) st.text_area(label = "Transcription:", value = st.session_state.transcription, disabled=True) + _, right_column = st.columns(2) - right_column.caption('
Model Api: https://developers.vito.ai/
', unsafe_allow_html=True) + right_column.caption('
Model Api: https://platform.openai.com/docs/guides/speech-to-text
', unsafe_allow_html=True) elif mode == 'english' or mode == '영어': _,_,_,l4_column,_,_,_,_ = st.columns(8)