-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvisionai.py
98 lines (82 loc) · 3.29 KB
/
visionai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import subprocess
import speech_recognition as sr
from gtts import gTTS
import firebase_admin
from firebase_admin import credentials, storage
import replicate
from time import sleep
cred = credentials.Certificate("glasses.json")#replace with your own
firebase_admin.initialize_app(cred, {"storageBucket": "glasses-68bfa.appspot.com"})
def upload_image_to_firebase(image_path, destination_path):
#for uploading image to firebase
bucket = storage.bucket()
blob = bucket.blob(destination_path)
blob.upload_from_filename(image_path)
def get_firebase_image_url(image_path):
bucket = storage.bucket()
blob = bucket.blob(image_path)
#for firebase url calling
return "https://firebasestorage.googleapis.com/v0/b/glasses-68bfa.appspot.com/o/images%2Fcaptured_image.png?alt=media&token=2f619167-527e-4964-971c-278df8e259bb"
def record_audio():
#for recording audio
r = sr.Recognizer()
print("Before Recording")
with sr.Microphone() as source:
print("Say something...")
r.adjust_for_ambient_noise(source)
audio = r.listen(source, timeout=5)
print("After Recording")
try:
text = r.recognize_google(audio)
print(text)
return text
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio.")
return None
except sr.RequestError as e:
print(f"Could not request results from Google Speech Recognition service; {e}")
return None
def text_to_speech(text):
#using google text to speech recognition (gtts)
tts = gTTS(text=text, lang='en')
tts.save("output.mp3")
sleep(2)
subprocess.run(["xdg-open","output.mp3"])
def call_api_with_firebase(prompt, image_url):
#calling llava model
#
output = replicate.run(
"yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358",
input={"image": image_url, "prompt": prompt},
)
stri = ""
for item in output:
stri+=item
text_to_speech(stri)
def capture_image(file_path='captured_image.png'):
try:
subprocess.run(['libcamera-still', '-e', 'png', '-o', file_path], check=True)
print(f"Image captured and saved to {file_path}")
except subprocess.CalledProcessError as e:
print(f"An error occurred: {e}")
if __name__ == "__main__":
while True:
command = record_audio()
if command and ("hey vision" in command.lower() or "hello vision" in command.lower() or "vision" in command.lower()):
print("Capturing image...")
subprocess.run(["xdg-open","CapturingImage.mp3"])
capture_image()
print("Recording prompt...")
subprocess.run(["xdg-open","RecordingPrompt.mp3"])
sleep(1)
prompt_text = record_audio()
print("Uploading image to Firebase...")
upload_image_to_firebase('captured_image.png', 'images/captured_image.png')
firebase_image_url = get_firebase_image_url('images/captured_image.png')
print("Deleting captured image...")
os.remove('captured_image.png')
print("Calling API with Firebase image URL...")
call_api_with_firebase(prompt_text, firebase_image_url)
print("Converting API output to speech...")
sleep(2)