forked from cernvm/busybox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
collect_data.py
executable file
·183 lines (145 loc) · 5.17 KB
/
collect_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/python
'''
Python script that loads the time stamps of all commands executed while
a virtual machine is booting and analyzes them to get the duration of
each command, so that the most time-consuming parts can be identified
as the best candidates for optimization.
'''
import os
import numpy as numpy
import fnmatch
from operator import itemgetter
# The directory that holds the BusyBox ash score files is taken from the
# ASH_SCORES environment variable; every function below reads and writes
# files relative to it.
if "ASH_SCORES" in os.environ:
    directoryScores = os.environ["ASH_SCORES"]
else:
    print("Chosen directory with data from BusyBox ash does not exist.")
    # Leave with a failure status so that a calling script can detect the
    # problem. The bare exit() helper is injected by the site module for
    # interactive use and would report success (status 0).
    raise SystemExit(1)
def loadScores():
    '''
    Method that merges every "scores.csv.*" file found in the scores
    directory into a single text file named "all".

    The merged file starts with the header "command!start!pid!end".
    A line of a score file is copied unless:
      - it is a repetition of the header,
      - one of its '!'-separated fields is the placeholder "-1" (a time
        stamp that was used just as an orientation),
      - findByPid() reports that a line already written from the same
        score file matches it, or
      - exactly the same line was already written from a previously
        processed score file.

    NOTE(review): an exact repetition of a line inside the same score
    file is not filtered, because findByPid() only matches lines that
    differ in exactly one field - confirm whether this is intended.
    '''
    header = "command!start!pid!end\n"
    mergedFiles = directoryScores + "/all"
    # Lines written while processing earlier score files; a set keeps the
    # exact-match test cheap.
    allWrittenLines = set()
    # The context managers close the files even if reading or writing fails.
    with open(mergedFiles, "w") as fAll:
        fAll.write(header)
        for scorePid in fnmatch.filter(os.listdir(directoryScores), 'scores.csv.*'):
            scoreFile = directoryScores + "/" + scorePid
            with open(scoreFile, "r") as f:
                lines = f.readlines()
            # Lines written from the current score file only.
            listWritten = []
            for line in lines:
                if line == header:
                    continue
                # A field equal to "-1", or a last field (the one ending
                # with the newline) that contains "-1", marks a line that
                # has to be dropped.
                hasPlaceholder = any(
                    "-1" in item and (len(item) == 2 or item.endswith('\n'))
                    for item in line.split('!'))
                if hasPlaceholder:
                    continue
                if line in allWrittenLines or findByPid(line, listWritten):
                    continue
                fAll.write(line)
                listWritten.append(line)
            # Lines of this file take part in the exact-match test only
            # for the score files processed after it.
            allWrittenLines.update(listWritten)
def findByPid(line, writtenLines):
    '''
    Method that tells whether a near-duplicate of the given line was
    already written.

    Returns True if some entry of writtenLines has the same number of
    '!'-separated fields as line and agrees with it in all fields except
    exactly one; otherwise returns False. An entry that is identical to
    line in every field does not count as a match.
    '''
    candidate = line.split('!')
    width = len(candidate)
    for previous in writtenLines:
        fields = previous.split('!')
        if len(fields) != width:
            continue
        # Count the positions at which both lines carry the same value.
        matching = sum(1 for old, new in zip(fields, candidate) if old == new)
        if matching == width - 1:
            return True
    return False
def hasSamePid(first, second):
    '''
    Method that tells whether two '!'-separated lines refer to the same
    pid, so that the duration can be calculated for that specific pid.

    The pid is the third field of each line. A single trailing newline
    is removed from the pid of the first line only, before the two
    values are compared. Returns True if they are equal, False otherwise.
    '''
    pid = first.split('!')[2]
    if pid[-1:] == '\n':
        pid = pid[:-1]
    return pid == second.split('!')[2]
def _parseScoreLine(line):
    '''
    Helper that turns one line of the "all" file into a tuple
    (isStart, command, pid, timeStamp).

    A line with three '!'-separated fields is a start line (time stamp in
    the second field); a line with more fields is an end line (time stamp
    in the fourth field). Returns None for the header, for lines with
    fewer than three fields and for lines without a numeric time stamp.
    '''
    if line == "command!start!pid!end\n":
        return None
    fields = line.split('!')
    if len(fields) < 3:
        return None
    isStart = len(fields) == 3
    stampField = fields[1] if isStart else fields[3]
    # The time stamp is the first purely numeric token of its field.
    # int() is used instead of long(): it exists on Python 2 and 3 and
    # grows to arbitrary size on both.
    timeStamp = None
    for token in stampField.split():
        if token.isdigit():
            timeStamp = int(token)
            break
    if timeStamp is None:
        return None
    pid = fields[2]
    # On a start line the pid is the last field and carries the newline.
    if isStart and pid.endswith('\n'):
        pid = pid[:-1]
    return (isStart, fields[0], pid, timeStamp)


def calculateDuration():
    '''
    Given all scores from the file "all", this method pairs every start
    line with the first following end line that has the same pid and
    calculates the duration of each executed process. The durations are
    accumulated per command, sorted by average duration (ascending) and
    saved to a new file named "sortedAll".

    The result file contains the header
    "command ! total ! nbOfTimes ! avgDuration", a line with the total
    duration (latest end time stamp minus earliest start time stamp, or 0
    if the input holds no start line) and one line per command with its
    total duration, number of executions and integer average duration.

    Lines of "all" that cannot be interpreted are skipped.
    '''
    mergedFiles = directoryScores + "/all"
    newSorted = directoryScores + "/sortedAll"
    with open(mergedFiles, "r") as fAll:
        records = [_parseScoreLine(line) for line in fAll]

    startStamps = [rec[3] for rec in records if rec is not None and rec[0]]
    endStamps = [rec[3] for rec in records if rec is not None and not rec[0]]
    if startStamps:
        # The latest end never drops below 0, even without any end line.
        totalDuration = max([0] + endStamps) - min(startStamps)
    else:
        # Without a start line there is nothing to measure; this also
        # avoids formatting an infinite value.
        totalDuration = 0

    # Walk the file backwards and remember, per pid, the nearest end time
    # stamp seen so far. When a start line is reached, that value is the
    # first end line following it. Index 0 is the header row and is never
    # treated as a start line.
    nextEnd = {}
    durations = []
    for index in range(len(records) - 1, 0, -1):
        record = records[index]
        if record is None:
            continue
        isStart, command, pid, timeStamp = record
        if not isStart:
            nextEnd[pid] = timeStamp
        elif pid in nextEnd:
            durations.append((command, nextEnd[pid] - timeStamp))
    # Restore file order so that commands are accumulated in the order in
    # which they appear in the input.
    durations.reverse()

    # command -> [total duration, number of executions, average duration]
    hashAll = {}
    for command, duration in durations:
        entry = hashAll.get(command)
        if entry is None:
            hashAll[command] = [duration, 1, duration]
        else:
            entry[0] += duration
            entry[1] += 1
            # Floor division keeps the average an integer on Python 3 too.
            entry[2] = entry[0] // entry[1]

    ranked = sorted(hashAll.items(), key=lambda item: item[1][2])
    with open(newSorted, "w") as fSort:
        fSort.write("command ! total ! nbOfTimes ! avgDuration\n")
        fSort.write("total duration: %d\n" % totalDuration)
        for command, (total, count, average) in ranked:
            fSort.write("\n%s! %d ! %d ! %d " % (command, total, count, average))