-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset-creation.py
81 lines (63 loc) · 2.46 KB
/
dataset-creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import glob
import re
import os
import requests
from bs4 import BeautifulSoup
def get_image_urls(query, num_images):
    """Collect up to ``num_images`` image URLs from a Google Images result page.

    Args:
        query: Search text. It is sent as the ``q`` query parameter so that
            spaces and reserved characters are URL-encoded by ``requests``.
        num_images: Maximum number of URLs to return.

    Returns:
        A list of at most ``num_images`` absolute ``http``/``https`` URLs taken
        from the ``src`` attribute of the page's ``<img>`` tags, in document
        order. The list is empty when ``num_images`` is not positive or when
        the search page does not come back with a success status.
    """
    # Nothing to collect: skip the network round trip entirely.
    if num_images <= 0:
        return []

    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "tbm": "isch", "hl": "en"},
        timeout=10,  # never hang forever on a stalled connection
    )
    if not response.ok:
        # An error page's <img> tags are not search results.
        print(f"Error fetching search results for {query!r}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    image_urls = []
    for img in soup.find_all('img'):
        src = img.get('src')
        # Relative paths and data: URIs cannot be fetched by requests.get(),
        # so they must not use up one of the requested slots.
        if src and src.startswith(('http://', 'https://')):
            image_urls.append(src)
            if len(image_urls) >= num_images:
                break
    return image_urls
def download_images(image_urls, save_dir):
    """Download every URL in ``image_urls`` into ``save_dir``.

    Files are named ``image_<index>.jpg``, where ``<index>`` is the position
    of the URL in ``image_urls``. A URL that fails to download is reported on
    stdout and skipped; the remaining URLs are still attempted.

    Args:
        image_urls: Iterable of image URLs to fetch.
        save_dir: Target directory; created (with parents) if missing.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    for i, url in enumerate(image_urls):
        file_path = os.path.join(save_dir, f"image_{i}.jpg")
        try:
            response = requests.get(url, timeout=10)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading image: {e}")
        else:
            print(f"Downloaded: {file_path}")
def get_images(search_text, directory, num_images):
    """Look up images for ``search_text`` and store them in ``directory``.

    The URL list produced by ``get_image_urls`` (bounded by ``num_images``)
    is handed straight to ``download_images``.
    """
    download_images(get_image_urls(search_text, num_images), directory)
def _read_terms(file_path, max_lines=2):
    """Return the distinct search terms from the first ``max_lines`` data lines."""
    terms = []
    lines_read = 0
    with open(file_path, 'r') as file:
        for line in file:
            # Lines starting with '#' are comments.
            if line.startswith('#'):
                continue
            parts = line.split()
            # A data line needs a leading token plus at least one text token.
            if len(parts) < 2:
                continue
            # Drop the leading token; ':' separates terms and quotes are noise.
            text = ' '.join(parts[1:]).replace(':', '_').replace('"', '')
            for term in text.split('_'):
                term = term.strip()
                # Empty pieces would turn into an empty search query and a
                # download straight into the dataset root.
                if term:
                    terms.append(term)
            lines_read += 1
            if lines_read >= max_lines:
                break
    # De-duplicate while keeping first-seen order, so a repeated term is not
    # fetched again into the same folder.
    return list(dict.fromkeys(terms))


def process_file(file_path, folder_path, num_images=50, dataset_dir='Dataset'):
    """Download images for the terms listed at the top of a ratings file.

    Only the first two non-comment lines that contain at least two
    whitespace-separated tokens are used. On each of them the first token is
    discarded, double quotes are removed, and the remainder is split on ``:``
    and ``_`` into terms. Every distinct, non-empty term is passed to
    ``get_images`` and its images are stored in ``<dataset_dir>/<term>``.

    Args:
        file_path: Path of the text file to read.
        folder_path: Unused; kept so existing callers keep working.
        num_images: Number of images requested per term.
        dataset_dir: Directory under which one folder per term is created.

    Returns:
        None.
    """
    for word in _read_terms(file_path):
        get_images(word, os.path.join(dataset_dir, word), num_images)
def process_folder(folder_path):
    """Run ``process_file`` on each ``*.txt`` file under ``folder_path``, recursively.

    The text files directly inside ``folder_path`` are handled first; every
    sub-directory is then visited in the same way.
    """
    txt_pattern = os.path.join(folder_path, '*.txt')
    for txt_path in glob.glob(txt_pattern):
        process_file(txt_path, folder_path)

    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if not os.path.isdir(candidate):
            continue
        process_folder(candidate)
# Top-level data directory; its 'Training' and 'Testing' subfolders are
# processed below, in that order.
data_folder = 'SemEval-2012-Gold-Ratings'

for sub_folder in ('Training', 'Testing'):
    sub_folder_path = os.path.join(data_folder, sub_folder)
    process_folder(sub_folder_path)