-
Notifications
You must be signed in to change notification settings - Fork 3
/
preprocessing.py
143 lines (125 loc) · 6.52 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Author: Sitong Ye [email protected]
# Date: 05.02.2020
import yaml
from matplotlib import pyplot as plt
import cv2
import numpy as np
import os
import json
import pandas as pd
import random
from shutil import copyfile
from optparse import OptionParser
class Datagenerator:
    """Turn a JSON annotation dump into train/test folders and annotation files.

    The behaviour is driven by a YAML configuration file. The keys read by
    this class are ``TRAIN_TEST_SPLIT``, ``DETECT_CLASSES``,
    ``ANNOTATION_NAME``, ``DATA_PATH``, ``IMAGE_FOLDER``,
    ``JSON_CONFIG_FOLDER``, ``JSON_CONFIG_FILE``, ``OUTPUT_DATA_PATH``,
    ``OUTPUT_TRAIN_FOLDER_NAME`` and ``OUTPUT_TEST_FOLDER_NAME``.

    Call :meth:`generate` to run the whole pipeline.
    """

    # Column layout of the annotation table built from the JSON file.
    _COLUMNS = ['file_name', 'xmin', 'ymin', 'xmax', 'ymax', 'label']
    # Fixed seed so the train/test split is the same on every run.
    _SPLIT_SEED = 21102019

    def __init__(self, configfile):
        """Load the YAML configuration found at *configfile*.

        Args:
            configfile: Path to the YAML configuration file.
        """
        with open(configfile) as file:
            # NOTE(review): yaml.load can construct more than plain data
            # types; if this file may come from an untrusted source, prefer
            # yaml.safe_load.
            self.dataconfig = yaml.load(file, Loader=yaml.FullLoader)
        print(self.dataconfig)
        # Fraction of the images that is sampled into the test set.
        self.traintestsplit = self.dataconfig["TRAIN_TEST_SPLIT"]
        # Labels to keep; annotations with any other label are dropped.
        self.classes = self.dataconfig["DETECT_CLASSES"]
        self.dataframe = pd.DataFrame()
        # Suffix of the annotation files ("train_" / "test_" is prepended).
        self.annotationname = self.dataconfig["ANNOTATION_NAME"]

    def __create_dataframe(self):
        """Read the JSON annotation file into ``self.dataframe``.

        Every annotation becomes one row holding the image base name, the
        integer box corners (``xmin``, ``ymin``, ``xmax``, ``ymax``) and the
        label. Rows whose label is not in ``self.classes`` are discarded.

        Returns:
            The filtered ``pandas.DataFrame`` (also stored on the instance).
        """
        json_path = os.path.join(self.dataconfig['DATA_PATH'],
                                 self.dataconfig["JSON_CONFIG_FOLDER"],
                                 self.dataconfig['JSON_CONFIG_FILE'])
        with open(json_path) as json_cfg_file:
            cfg = json.load(json_cfg_file)

        # Collect plain dicts and build the frame once: appending to a
        # DataFrame row by row is quadratic, and DataFrame.append is no
        # longer available in pandas 2.x.
        rows = []
        for image in cfg:
            # Only the base name is kept; the images are looked up by name.
            filename = image['filename'].split('/')[-1]
            for obj in image['annotations']:
                xmin = obj['x']
                ymin = obj['y']
                rows.append({'file_name': filename,
                             'xmin': int(xmin),
                             'ymin': int(ymin),
                             'xmax': int(xmin + obj['width']),
                             'ymax': int(ymin + obj['height']),
                             'label': obj['class']})

        frame = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = frame[frame['label'].isin(self.classes)]
        print('number of image:', len(self.dataframe['file_name'].unique()))
        print('all classes of labels', list(self.dataframe['label'].unique()))
        return self.dataframe

    def __train_test_split(self):
        """Split the images, copy them and write both annotation files.

        A fixed-seed sample of ``int(n_images * TRAIN_TEST_SPLIT)`` images
        forms the test set; the remaining images form the train set. Images
        are copied into ``OUTPUT_DATA_PATH/<train or test folder>``, the same
        location the annotation lines refer to, and
        ``train_<ANNOTATION_NAME>`` / ``test_<ANNOTATION_NAME>`` are written
        into ``OUTPUT_DATA_PATH``.
        """
        all_image = list(self.dataframe['file_name'].unique())
        # A private generator yields the same sample as seeding the global
        # one, without reseeding ``random`` for the rest of the process.
        rng = random.Random(self._SPLIT_SEED)
        test_image = rng.sample(all_image,
                                int(len(all_image) * self.traintestsplit))
        held_out = set(test_image)
        # Keep the original image order so the output is reproducible
        # (a set difference would reorder the names between runs).
        train_image = [img for img in all_image if img not in held_out]

        output_root = self.dataconfig["OUTPUT_DATA_PATH"]
        train_path = os.path.join(
            output_root, self.dataconfig["OUTPUT_TRAIN_FOLDER_NAME"])
        test_path = os.path.join(
            output_root, self.dataconfig["OUTPUT_TEST_FOLDER_NAME"])
        os.makedirs(train_path, exist_ok=True)
        os.makedirs(test_path, exist_ok=True)

        # Copy every annotated image into the folder of its split.
        destination = {img: train_path for img in train_image}
        destination.update((img, test_path) for img in test_image)
        image_root = os.path.join(self.dataconfig["DATA_PATH"],
                                  self.dataconfig["IMAGE_FOLDER"])
        for root, _dirs, files in os.walk(image_root):
            for file in files:
                target_dir = destination.get(file)
                if target_dir is None:
                    # Not part of either split: nothing to copy.
                    continue
                # Join with the walked directory so that images stored in
                # sub-folders are found as well.
                copyfile(os.path.join(root, file),
                         os.path.join(target_dir, file))

        self._write_annotations(self._rows_for(train_image), train_path,
                                "train_" + self.annotationname)
        self._write_annotations(self._rows_for(test_image), test_path,
                                "test_" + self.annotationname)

    def _rows_for(self, images):
        """Return the annotation rows of *images*, grouped in that order."""
        if not images:
            return pd.DataFrame(columns=self.dataframe.columns)
        by_image = {name: group for name, group
                    in self.dataframe.groupby('file_name', sort=False)}
        return pd.concat([by_image[img] for img in images],
                         ignore_index=True)

    def _write_annotations(self, frame, image_dir, annotation_file):
        """Write one ``path,xmin,ymin,xmax,ymax,label`` line per row of *frame*."""
        target = os.path.join(self.dataconfig["OUTPUT_DATA_PATH"],
                              annotation_file)
        with open(target, "w") as out:
            for _, row in frame.iterrows():
                fields = (os.path.join(image_dir, row['file_name']),
                          int(row['xmin']), int(row['ymin']),
                          int(row['xmax']), int(row['ymax']),
                          row['label'])
                out.write(','.join(str(field) for field in fields) + '\n')

    def generate(self):
        """Build the annotation table, then split and export the data set."""
        self.__create_dataframe()
        self.__train_test_split()
if __name__ == "__main__":
    # Command-line entry point: read the YAML config path and run the
    # whole preprocessing pipeline.
    parser = OptionParser()
    parser.add_option("-c", "--config", help="path to pass config yaml file")
    (options, args) = parser.parse_args()
    print(options)
    if not options.config:
        # Fail with a usage message instead of the TypeError that
        # open(None) would raise inside Datagenerator.
        parser.error("a config yaml file is required (use -c/--config)")
    Datagenerator(options.config).generate()