v1.py
import whisper
from moviepy import VideoFileClip
import pytesseract
from PIL import Image
import cv2
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel

# Initialize the embedding models
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
print("Embedding models loaded successfully!")

# Step 1: Transcribe Audio
def transcribe_audio(video_path, model_name="base"):
    # Extract the audio track from the video
    clip = VideoFileClip(video_path)
    audio_path = "temp_audio.wav"
    clip.audio.write_audiofile(audio_path)
    clip.close()

    # Transcribe the extracted audio with Whisper
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path)
    return result["text"]

# Step 2: Extract Frames and Visual Features
def extract_frames(video_path, interval=30):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % interval == 0:
            frames.append(frame)
        frame_count += 1
    cap.release()
    return frames

def extract_visual_context(frames):
    # Encode each sampled frame into a CLIP image embedding
    visual_embeddings = []
    for frame in frames:
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = clip_processor(images=pil_image, return_tensors="pt")
        outputs = clip_model.get_image_features(**inputs)
        visual_embeddings.append(outputs.detach().numpy())
    return np.vstack(visual_embeddings)

# Step 3: Run OCR for Text Extraction
def run_ocr(frame):
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    pil_image = Image.fromarray(gray_frame)
    text = pytesseract.image_to_string(pil_image)
    return text

# Step 4: Generate Text Embeddings
def generate_embeddings(texts):
    embeddings = text_model.encode(texts)
    return embeddings

# Step 5: Store Embeddings in a Faiss Index
def store_in_faiss(embeddings):
    # Faiss expects contiguous float32 inputs
    embeddings = np.asarray(embeddings, dtype="float32")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Step 6: Query the Video
def query_video(query, index, texts, k=5):
    query_embedding = text_model.encode([query])
    # Clamp k to the index size and drop any -1 placeholders Faiss returns
    k = min(k, index.ntotal)
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), k)
    results = [texts[i] for i in indices[0] if i != -1]
    return results, distances
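
# Sketch (not in the original file): the CLIP image embeddings live in their
# own 512-d space, so a natural-language query can be matched against them by
# encoding the query with CLIP's text encoder rather than MiniLM. The helper
# name `query_video_frames` and the separate frame index passed in here are
# assumptions for illustration, not part of the original pipeline.
def query_video_frames(query, frame_index, frames, k=5):
    # Encode the query text with CLIP so it is comparable to the frame embeddings
    inputs = clip_processor(text=[query], return_tensors="pt", padding=True)
    query_embedding = clip_model.get_text_features(**inputs).detach().numpy()
    k = min(k, frame_index.ntotal)
    distances, indices = frame_index.search(np.asarray(query_embedding, dtype="float32"), k)
    matched_frames = [frames[i] for i in indices[0] if i != -1]
    return matched_frames, distances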

# Main Pipeline
video_path = "demo.mp4"
# Step 1: Transcribe Audio
transcription = transcribe_audio(video_path)
print("Transcription:", transcription)
# Step 2: Extract Frames
frames = extract_frames(video_path)
print(f"Extracted {len(frames)} frames.")
# Step 3: Extract Visual Embeddings
visual_embeddings = extract_visual_context(frames)
# Step 4: Extract OCR Text
frame_texts = [run_ocr(frame) for frame in frames]
print("Extracted Text from Frames:", frame_texts)
# Step 5: Generate Embeddings and Store in Faiss
texts = [transcription] + frame_texts
text_embeddings = generate_embeddings(texts)
# The MiniLM text embeddings (384-d) and the CLIP image embeddings (512-d)
# have different dimensions, so they cannot be stacked into a single L2 index.
# This index covers the text side (transcription + OCR text); the CLIP frame
# embeddings are searched separately (see the sketch at the end of the file).
index = store_in_faiss(text_embeddings)
# Step 6: Query and Retrieve
query = "What is the video about?"
results, distances = query_video(query, index, texts)
print("Query Results:", results)