Dev wzw #26

Open · wants to merge 10 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -162,3 +162,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
records/
*.mp3
*.mp4
20 changes: 14 additions & 6 deletions README.md
@@ -4,17 +4,25 @@ Real-time video understanding and interaction through text, audio, image and video

A real-time video understanding and interaction framework built on multimodal large models, enabling question answering and conversation with the real world through text, speech, images and video.

## Start the backend service
## Start the frontend chat service
It currently implements the following two features:

- 1. A Streamlit chat UI
- 2. A GPT-4V request interface

Open `run.sh`, fill in your own API key, then start it:
```shell
sh run.sh
```

## Start the frontend UI
:construction: (under construction)
```
python demo.py
```
## TTS and ASR services
- ASR
  The service comes from team S (TODO: decide whether to host a service here)
- TTS
  See [tts.py](./real_gemini/tts.py); launch script (see the sketch below for installing its dependencies):
```shell
python tts.py
```
Running these services requires some extra dependencies and models: `torch`, `torchaudio`, `TTS`, all installable with `pip`; see the Python script for the model file paths.
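For reference, a minimal install sketch for these extra dependencies (package names are taken from the sentence above; versions are not pinned anywhere in this PR, and `TTS` is assumed to be the package imported by the script):
```shell
pip install torch torchaudio TTS
```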

## Acknowledgement
- [Fastapi](https://github.com/tiangolo/fastapi)
5 changes: 5 additions & 0 deletions install_pyaudio.sh
@@ -0,0 +1,5 @@
wget http://www.portaudio.com/archives/pa_stable_v190600_20161030.tgz
tar -zxvf pa_stable_v190600_20161030.tgz
cd portaudio/
./configure && make && sudo make install
pip install pyaudio
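A usage sketch for this script, assuming a Unix-like environment with `wget`, a C toolchain, and `sudo` available (none of which the PR states explicitly):
```shell
sh install_pyaudio.sh
```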
176 changes: 176 additions & 0 deletions real_gemini/chat.py
@@ -0,0 +1,176 @@
import uuid
import streamlit as st
from pathlib import Path
from utils_st.audio2text import audio2text_from_bytes
from utils_st.extracte_img import get_main_img
from utils_st.get_gpt4v_response import gpt4v_client
from utils_st.get_qwen_response import QwenVL_client
from utils_st.text2audio import text2audio, autoplay_audio
from utils_st.record_video import record
from queue import Queue
import time
import cv2
from threading import Thread, Event

img = {'assistant': './source/bot.png', 'user': None}
res_ = {'Qwen-vl': QwenVL_client, 'gpt4v': gpt4v_client}

# Event locks coordinating the recording thread and the chat loop
event_record = Event()
event_chat = Event()
event_record.set()  # recording is enabled at startup

with st.sidebar:
    with st.form('Settings'):
        max_chat_turn = st.slider('Maximum chat turns:', min_value=1, max_value=10000, value=10)
        response_name = st.selectbox('Model', ['Qwen-vl', 'gpt4v'], index=1)
        st.form_submit_button('Apply')
responser = res_[response_name]
max_record_round = 2 * max_chat_turn
q = Queue(max_record_round)

st.title("Gemini-like chat test")
######################### persist recorded input files #####################
# RECORD_DIR = Path("./records")
# RECORD_DIR.mkdir(exist_ok=True)
# if "prefix" not in st.session_state:
#     st.session_state["prefix"] = str(uuid.uuid4())
# prefix = st.session_state["prefix"]
# in_file_video = RECORD_DIR / f"{prefix}_input_video.mp4"
# in_file_audio = RECORD_DIR / f"{prefix}_input_audio.mp3"
######################### persist recorded input files #####################
# avatar for each chat role


if "messages" not in st.session_state:
    st.session_state.messages = []

def my_recorder():
    for i in range(max_record_round):
        # wait until recording is allowed; the event is set at startup
        print('holding to record')
        event_record.wait()
        print(f'record {i}')
        imgs, audio = record()
        input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
        # filter out meaningless transcriptions
        if input_text and len(input_text) > 5:
            q.put((imgs, audio, input_text))
        else:
            print(f'unexpected input: id--{request_id}, status--{code_status}, text--{input_text}')
            time.sleep(2)  # give the speaker 2 seconds to get ready again
            continue
        print(f'recording round {i} finished, queue size {q.qsize()}')
        # recording done: unblock the chat loop and block the next recording round
        event_record.clear()
        event_chat.set()
        print('released the chat lock, acquired the recording lock')
    print('input-recording service finished')

def show_chat_message_from_history(show_num_history=None):
    # Display chat messages from history on app rerun
    # show_num_history: should be a negative even number or a positive odd number;
    # a negative even number shows the last N messages, a positive number skips the first N
    if show_num_history is None:
        history = st.session_state.messages
    else:
        history = st.session_state.messages[show_num_history:]
    for message in history:
        with st.chat_message(message["role"], avatar=img[message['role']]):
            try:
                if message['audio'] is not None:
                    st.audio(message['audio'], sample_rate=24000)
            except KeyError:  # user messages carry no audio
                pass
            st.markdown(message["content"])
            try:
                if message['img'] is not None:
                    st.image(message['img'])
            except KeyError:  # most messages carry no image
                pass

def response(prompt=None, imgs=None, autoplay=True, audio_response=True):
    """
    prompt: input text
    imgs: input images
    autoplay: whether to autoplay the synthesized audio
    audio_response: whether to turn the text reply into an audio response
    """
    if prompt:
        sound = None
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display assistant response in chat message container
        with st.chat_message("assistant", avatar='./source/bot.png'):
            res = responser(query=prompt, imgs=imgs)
            print('res[text]:', res['text'])
            if audio_response:
                sound, rate, byte_sound_array = text2audio(res["text"])
            else:
                autoplay = False
            if autoplay:
                autoplay_audio(byte_sound_array)
            if not autoplay and audio_response:
                # no autoplay: render an audio player instead
                st.audio(sound, sample_rate=rate)
            st.markdown(res['text'])
            try:
                st.image(res['imgs'])
            except KeyError:  # the model reply may carry no images
                pass
            # when autoplaying, wait until the audio has finished playing
            if autoplay:
                time.sleep(int(len(sound) / rate) + 1)
        st.session_state.messages.append({"role": "assistant", "content": res['text'], 'audio': sound})


if __name__ == '__main__':
    max_round = max_chat_turn + 50  # safety margin instead of an unconditional while loop
    record_thread = Thread(target=my_recorder)
    # show the live image from the recording device
    video_show = st.container()
    video_show.camera_input('tt', label_visibility='hidden')
    # start capturing input
    if video_show.button('Start chatting'):
        st.info('Listening to the microphone...')
        record_thread.start()
    else:
        st.stop()
    # placeholder for input-processing status
    placeholder = st.empty()
    # placeholder for the chat itself
    chat_placeholder = st.empty()
    while max_round > 0:
        max_round -= 1  # count down so the loop really stops at the safety limit
        # wait for a chat turn to start; blocked until the first recording has finished
        print('waiting for the chat to start')
        event_chat.wait()
        print('chat started')
        if not q.empty():
            # while responding, pause recording so the played audio is not captured
            print('handling the response, recording paused')
            imgs, audio, input_text = q.get()
            with placeholder.status('Processing input...', state='running', expanded=True) as status:
                if len(imgs) > 0:
                    st.write('getMainFrame...')
                    imgs = get_main_img(imgs)
                    imgs = imgs[-3:]
                    cls = st.columns(min(3, len(imgs)))
                    for idx, cl in enumerate(cls):
                        cl.image(cv2.cvtColor(imgs[idx], cv2.COLOR_BGR2RGB))
                st.audio(audio.get_wav_data())
                st.text(f'Recognized text: {input_text}')
                status.update(label="Input processed", state="complete", expanded=False)
            with chat_placeholder.container(height=600):  # setting height requires streamlit >= 1.30
                # with st.container(height=600):
                # container height needs the streamlit 1.30 release: https://github.com/streamlit/streamlit/issues/2169
                show_chat_message_from_history()  # showing history; drop this call to display only the current turn
                response(prompt=input_text, imgs=imgs, autoplay=True, audio_response=True)
            print('turn finished: releasing the recording lock, closing the chat lock')
            # response finished, re-enable recording
            event_record.set()
            # block the chat until the next recorded input arrives
            event_chat.clear()
            # chat_placeholder.empty()
    print('Maximum number of chat turns reached, exiting!')
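A launch sketch for this page, assuming it is started as an ordinary Streamlit script from the repository root (the PR itself only references `run.sh`, whose contents are not shown in this diff):
```shell
streamlit run real_gemini/chat.py
```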
87 changes: 87 additions & 0 deletions real_gemini/pages/audio_test.py
@@ -0,0 +1,87 @@
import streamlit as st
import speech_recognition as sr
import base64
from utils_st.audio2text import audio2text_from_bytes
from moviepy.editor import AudioFileClip
from utils_st.record_video import VideoRecorder
import time
from utils_st.record_video import record
from queue import Queue

max_turn = 20
q = Queue(max_turn)

def audio_record():
    r = sr.Recognizer()
    r.energy_threshold = 500  # loudness threshold for detecting speech
    with sr.Microphone() as source:
        st.write('Please start speaking, listening now')
        # phrase_time_limit: maximum recording duration; timeout: how long to wait for speech
        for i in range(max_turn):
            print(f'turn {i} start')
            audio = r.listen(source, phrase_time_limit=15, timeout=None)
            q.put(audio)
            print(f'audio recording finished, queue size {q.qsize()}')
    print('done')

def my_recorder():
    for i in range(max_turn):
        imgs, audio = record()
        q.put((imgs, audio))
        print(f'recording finished, queue size {q.qsize()}')
    print('input-recording service finished')

def res():
    print('entering response loop')
    i = 20
    while i > 0:
        if q.empty():
            print('queue is empty, waiting')
            time.sleep(5)
            i -= 1
        else:
            print('request ok~')
            audio = q.get()
            st.audio(audio.get_wav_data())
            input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
            print(f'Recognized text: {input_text}')
            st.text(f'Recognized text: {input_text}')
            i -= 1
    print('response over~')

def show_chat_message_from_history():
    pass

if __name__ == '__main__':
    from threading import Thread
    t1 = Thread(target=my_recorder)
    t2 = Thread(target=res)
    # st.camera_input('tt', label_visibility='hidden')
    st.camera_input('tt', label_visibility='hidden')
    if st.button('Start chatting'):
        t1.start()
        # t2.start()
        # t1.join()
        # t2.join()
        i = 20
        placeholder = st.empty()
        while i > 0:
            if q.empty():
                print('queue is empty, waiting')
                time.sleep(5)
                i -= 1
            else:
                print('request ok~')
                imgs, audio = q.get()
                st.audio(audio.get_wav_data())
                input_text, code_status, request_id = audio2text_from_bytes(audio.get_wav_data())
                print(f'Recognized text: {input_text}')
                st.text(f'Recognized text: {input_text}')
                st.text('Captured frames below')
                for idx, cl in enumerate(st.columns(min(3, len(imgs)))):
                    cl.image(imgs[idx])
                st.divider()
                i -= 1
        print('response over~')


44 changes: 44 additions & 0 deletions real_gemini/pages/chat_with_gpt.py
@@ -0,0 +1,44 @@
import streamlit as st
from openai import OpenAI
import os
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

st.title("ChatGPT-like")

# Set OpenAI API key from Streamlit secrets
client = OpenAI(api_key=OPENAI_API_KEY)

# Set a default model
with st.sidebar:
    with st.form('Settings'):
        model = st.selectbox('Model version', ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'], index=0)
        st.form_submit_button('Apply')

# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if prompt := st.chat_input("Ask your question"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(prompt)
    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        full_response = ""
        responses = client.chat.completions.create(
            model=model,
            messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
            stream=True,
        )
        for response in responses:
            full_response += (response.choices[0].delta.content or "")
            message_placeholder.markdown(full_response + "▌")
        message_placeholder.markdown(full_response)
    st.session_state.messages.append({"role": "assistant", "content": full_response})
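A minimal run sketch for this page, assuming the `OPENAI_API_KEY` environment variable read at the top of the file is exported in the shell (the key value below is a placeholder):
```shell
export OPENAI_API_KEY="sk-..."  # placeholder, use your own key
streamlit run real_gemini/pages/chat_with_gpt.py
```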
38 changes: 38 additions & 0 deletions real_gemini/pages/empty_test.py
@@ -0,0 +1,38 @@
import streamlit as st
import time

# with st.empty():
# for seconds in range(10):
# st.write(f"⏳ {seconds} seconds have passed")
# time.sleep(1)
# st.write("✔️ 1 minute over!")


placeholder = st.empty()

# Replace the placeholder with some text:
# placeholder.text("Hello")
# time.sleep(5)
# # Replace the text with a chart:
# placeholder.line_chart({"data": [1, 5, 2, 6]})
# st.text('other')
# time.sleep(5)
# # Replace the chart with several elements:
# with placeholder.container():
# st.write("This is one element")
# time.sleep(5)
# st.write("This is another")
# time.sleep(5)
for i in range(10):
    with placeholder.status('doing', expanded=True, state='running') as status:
        st.text(f'This is test {i}')
        time.sleep(2)
        status.update(label="done", state="complete", expanded=False)
    if i % 2 == 0:
        with st.chat_message('user'):
            st.text(f'user: test text {i}')
    else:
        with st.chat_message('assistant'):
            st.text(f'bot: test text {i}')
# Clear all those elements:
# placeholder.empty()