-
Notifications
You must be signed in to change notification settings - Fork 0
/
07_exact_timestamps.py
79 lines (63 loc) · 3.44 KB
/
07_exact_timestamps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import zipfile
import re
import shutil
# ----------------------------------------------------------------------
# User configuration -- adjust the values below for your own data set
# ----------------------------------------------------------------------

# Folder in which the script and the zip archive live.
base_directory = "C:\\Users\\clean\\video_to_spr\\data_for_knowledge_base"

# Sub-folder (relative to base_directory) holding the input/output files.
measurements_subdirectory = "Basic_physical_quantities"

# Zip archive with the packed transcripts, relative to base_directory.
zip_file_name = os.path.join(
    measurements_subdirectory,
    "transcripts_packed_Basic_physical_quantities.zip",
)

# Text file listing the matched quotes, relative to base_directory.
matched_quotes_file_name = os.path.join(
    measurements_subdirectory,
    "Matched_quotes_Basic_physical_quantities.txt",
)

# Result file that receives the timestamps, relative to base_directory.
output_file_name = os.path.join(
    measurements_subdirectory,
    "Timestamps_Basic_physical_quantities.txt",
)

# Scratch folder (relative to base_directory) the archive is unpacked into.
temp_extract_dir = "temp_extracted"

# ----------------------------------------------------------------------
# End of user configuration
# ----------------------------------------------------------------------

# Absolute locations of the archive, the quotes file, the output file and
# the scratch folder: each relative name above is anchored at base_directory.
zip_file_path, matched_quotes_path, output_file_path, temp_extract_path = (
    os.path.join(base_directory, relative_name)
    for relative_name in (
        zip_file_name,
        matched_quotes_file_name,
        output_file_name,
        temp_extract_dir,
    )
)
def extract_zip_file(zip_path, extract_to):
    """Unpack every member of the archive at *zip_path* into *extract_to*.

    A short confirmation naming both paths is printed once extraction
    has finished.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)

    print(f"Extracted files from {zip_path} to {extract_to}")
def _read_transcript_line(file_path, line_number):
    """Return line *line_number* (1-based) of *file_path*, or None if the file is shorter."""
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, start=1):
            if i == line_number:
                # Stop at the requested line instead of reading the rest of the file.
                return line
    return None


def process_matched_quotes(quotes_path, extract_dir, output_path):
    """Look up the timestamp of every matched quote and write a report.

    The file at *quotes_path* is scanned for entries of the form::

        1. Q1: "quote text"
           - found in 20240101 - Title.txt, line(s) 12-14

    (the leading ``1. `` is optional). For each entry the first line of the
    given range is read from the named transcript inside *extract_dir*, the
    timestamp that follows a ``[`` on that line is extracted, and a record
    with the quote, its location and the timestamp is appended to
    *output_path* (which is overwritten on every call).

    Entries whose transcript file is missing, whose line number lies beyond
    the end of the transcript, or whose line carries no timestamp are
    reported on stdout and skipped.

    Args:
        quotes_path: Path of the UTF-8 text file with the matched quotes.
        extract_dir: Directory containing the extracted ``.txt`` transcripts.
        output_path: Path of the UTF-8 report file to create.
    """
    with open(quotes_path, 'r', encoding='utf-8') as quotes_file:
        quotes_content = quotes_file.read()

    # Groups: quote number, quote text, transcript name (without ".txt"),
    # first line number of the range.
    # NOTE(review): in `\(s\)?` only the closing parenthesis is optional;
    # kept exactly as-is so the set of accepted entries does not change.
    pattern = re.compile(r'(?:\d+\. )?Q(\d+): "(.+?)"\s+- found in (\d{8} - .+?)\.txt, line\(s\)? (\d+)-\d+')

    # HH:MM:SS with an optional .mmm suffix. A single pattern is required:
    # trying plain HH:MM:SS first always succeeds on HH:MM:SS.mmm input as
    # well, which would silently drop the milliseconds.
    timestamp_pattern = re.compile(r'\[(\d{2}:\d{2}:\d{2}(?:\.\d{3})?)')

    with open(output_path, 'w', encoding='utf-8') as output_file:
        for match in pattern.finditer(quotes_content):
            quote_number, quote_text, file_name, line_number = match.groups()
            print(f"Processing Quote {quote_number}")
            file_path = os.path.join(extract_dir, file_name + '.txt')
            line_number = int(line_number)

            try:
                line = _read_transcript_line(file_path, line_number)
            except FileNotFoundError:
                print(f"File not found for Quote {quote_number}: {file_path}")
                continue

            if line is None:
                # The transcript has fewer lines than the quote entry claims.
                print(f"Line {line_number} not found for Quote {quote_number}: {file_path}")
                continue

            timestamp_match = timestamp_pattern.search(line)
            if timestamp_match is None:
                print(f"Timestamp not found for Quote {quote_number}")
                continue

            timestamp = timestamp_match.group(1)
            output_file.write(f"{quote_number}. Q{quote_number}: \"{quote_text}\"\n - found in {file_name}.txt, line(s) {line_number}\n - exact timestamp: {timestamp}\n\n")
def clean_up_temp_directory(path):
    """Delete the directory tree at *path* if it exists.

    Nothing happens (and nothing is printed) when *path* does not exist;
    otherwise the tree is removed and a confirmation is printed.
    """
    if not os.path.exists(path):
        return

    shutil.rmtree(path)
    print(f"Cleaned up temporary directory: {path}")
if __name__ == '__main__':
    # Unpack the transcripts, resolve the timestamps, then always remove the
    # scratch directory -- even when extraction or processing raises -- so a
    # failed run does not leave stale transcripts behind for the next one.
    try:
        extract_zip_file(zip_file_path, temp_extract_path)
        process_matched_quotes(matched_quotes_path, temp_extract_path, output_file_path)
    finally:
        clean_up_temp_directory(temp_extract_path)
    # Only reached when both steps above succeeded.
    print(f"Processed matched quotes and stored timestamps in {output_file_path}")