From 33f67381d9186cb58ee04ad6196a17c35ebc5cbd Mon Sep 17 00:00:00 2001
From: mint <mint@phidigital.co.kr>
Date: Tue, 31 Oct 2023 09:38:27 +0900
Subject: [PATCH] =?UTF-8?q?=ED=95=9C=EA=B5=AD=EC=96=B4=20STT=20API=20?=
 =?UTF-8?q?=ED=85=8C=EC=8A=A4=ED=8A=B8,=20=ED=83=90=EC=83=89=20=EB=B0=8F?=
 =?UTF-8?q?=20server.py=20=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastapi_backend/server.py            | 97 ++++++++++++++++------------
 streamlit_frontend/speech_to_text.py | 19 +++---
 2 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/fastapi_backend/server.py b/fastapi_backend/server.py
index 8564789..ba4b9c5 100644
--- a/fastapi_backend/server.py
+++ b/fastapi_backend/server.py
@@ -579,49 +579,66 @@ async def transcribe_api_endpoint(client_id: str = Form(...),
                                   file: UploadFile = File(...)):
     
     # Save temporary audio file
-    extension = os.path.splitext(file.filename)[1]  # Get the file extension
-    temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
-    audio_file = await file.read()
-    temp_audio_file.write(audio_file)
-    temp_audio_file.close()
-    
-    wav_path = convert_audio_to_wav(temp_audio_file.name)
-
-    resp_token = requests.post(
-    'https://openapi.vito.ai/v1/authenticate',
-    data={'client_id': client_id,
-          'client_secret': client_secret}
-    )
-    resp_token.raise_for_status()
-    access_token = resp_token.json()['access_token']
-    
-    time.sleep(5)
-
-    config = {
-    "diarization": {
-        "use_verification": False
-        },
-        "use_multi_channel": False
-    }
+    ################## VITO API #################################
+        # Save temporary audio file
+        # extension = os.path.splitext(file.filename)[1]  # Get the file extension
+        # temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
+        # audio_file = await file.read()
+
+        # temp_audio_file.write(audio_file)
+        # temp_audio_file.close()
+
+        # print(temp_audio_file)
+        # wav_path = convert_audio_to_wav(temp_audio_file.name)
+        # print(wav_path)
+
+        # resp_token = requests.post(
+        # 'https://openapi.vito.ai/v1/authenticate',
+        # data={'client_id': client_id,
+        #       'client_secret': client_secret}
+        # )
+        # resp_token.raise_for_status()
+        # access_token = resp_token.json()['access_token']
+
+        # time.sleep(5)
+
+        # config = {
+        # "diarization": {
+        #     "use_verification": False
+        #     },
+        #     "use_multi_channel": False
+        # }
+
+        # resp_api = requests.post(
+        #     'https://openapi.vito.ai/v1/transcribe',
+        #     headers={'Authorization': 'bearer '+ access_token},
+        #     data={'config': json.dumps(config)},
+        #     files={'file': open(wav_path, 'rb')}
+        # )
+
+        # resp_api.raise_for_status()
+        # api_id = resp_api.json()['id']
+
+        # resp_msg = requests.get(
+        # 'https://openapi.vito.ai/v1/transcribe/'+ api_id,
+        # headers={'Authorization': 'bearer '+ access_token},
+        # )
+
+        # transcription = resp_msg.json()['results']['utterances'][0]['msg']
+
+        ################## VITO API Code End #################################
+
+        # OpenAI Whisper API code (added to replace the VITO flow commented out above)
+    openai.api_key = api_key
 
-    resp_api = requests.post(
-        'https://openapi.vito.ai/v1/transcribe',
-        headers={'Authorization': 'bearer '+ access_token},
-        data={'config': json.dumps(config)},
-        files={'file': open(wav_path, 'rb')}
-    )
-
-    resp_api.raise_for_status()
-    api_id = resp_api.json()['id']
+    audio_file = await file.read()
 
-    time.sleep(20)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+        temp_file.write(audio_file)
+        temp_file_path = temp_file.name
 
-    resp_msg = requests.get(
-    'https://openapi.vito.ai/v1/transcribe/'+ api_id,
-    headers={'Authorization': 'bearer '+ access_token},
-    )
-    
-    transcription = resp_msg.json()['results']['utterances'][0]['msg']
+    with open(temp_file_path, 'rb') as open_audio_file:
+        transcription = openai.Audio.transcribe(model="whisper-1", file=open_audio_file, response_format="text", language='ko')
 
     return {"transcription": transcription}
 ########### Speech2Text End #############
diff --git a/streamlit_frontend/speech_to_text.py b/streamlit_frontend/speech_to_text.py
index 6201ab1..551898b 100644
--- a/streamlit_frontend/speech_to_text.py
+++ b/streamlit_frontend/speech_to_text.py
@@ -19,7 +19,7 @@ def main():
 
     left_column, right_column = st.columns(2)
     left_column.caption(translate('warning', st.session_state.ko_en))
-    uploaded_file = left_column.file_uploader(translate('choose_audio', st.session_state.ko_en), type=["wav", "mp3", "flac", "ogg"])
+    uploaded_file = left_column.file_uploader(translate('choose_audio', st.session_state.ko_en), type=["wav", "mp3", "flac", "ogg", "wma", "aac"])
     
     mode = st.radio(translate('radio', st.session_state.ko_en), (translate('korean', st.session_state.ko_en), 
                                                                  translate('english', st.session_state.ko_en)), horizontal=True)
@@ -33,36 +33,37 @@ def main():
         unsafe_allow_html=True,
         )
 
-        client_id_input, client_secret_input, _, _  = st.columns(4)
+        api_key_input, _, _, _  = st.columns(4)
 
-        client_id = client_id_input.text_input(translate('Client_ID', st.session_state.ko_en),value="")
+        api_key = api_key_input.text_input(translate('api_key', st.session_state.ko_en), value="", type='password')
 
-        client_secret = client_secret_input.text_input(translate('Client_Secret', st.session_state.ko_en),value="", type='password')
+        # client_secret = client_secret_input.text_input(translate('Client_Secret', st.session_state.ko_en),value="", type='password')
 
         _,_,_,l4_column,_,_,_,_   = st.columns(8)
         submit_button = l4_column.button(translate('transcribe_button', st.session_state.ko_en), use_container_width=True)
 
         if uploaded_file is not None :
             st.session_state.transcription = None
-            
+
             right_column.markdown('#')
             right_column.audio(uploaded_file)
 
-            if len(client_id) > 0 and len(client_secret) > 0 :
+            if len(api_key) > 0 :
 
                 wav_data = io.BytesIO(uploaded_file.read())
 
                 if submit_button:
-                    response = requests.post(f"{BACKEND_URL}/speech_to_text_api", files={"file": wav_data}, data={'client_id': client_id,
-                                                                                                                        'client_secret': client_secret})
+                    response = requests.post(f"{BACKEND_URL}/speech_to_text_api", files={"file": wav_data}, data={'api_key': api_key})
                     if response.status_code != 200:
                         st.error(response.status_code)
                     else:
                         transcription = response.json()["transcription"]
+
                         st.session_state.transcription = ''.join(transcription)
                         st.text_area(label = "Transcription:", value = st.session_state.transcription, disabled=True)
+
         _, right_column = st.columns(2)
-        right_column.caption('<div style="text-align: right;">Model Api: https://developers.vito.ai/</div>', unsafe_allow_html=True)
+        right_column.caption('<div style="text-align: right;">Model Api: https://platform.openai.com/docs/guides/speech-to-text</div>', unsafe_allow_html=True)
 
     elif mode == 'english' or mode == '영어':
         _,_,_,l4_column,_,_,_,_   = st.columns(8)