-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
210 lines (179 loc) · 6.72 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os
import numpy as np
from collections import Counter
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
# ---- Run configuration ----
# N: number of data points. It is not referenced by the code visible in this file.
N = 200 #Number of data points
# M: number of feature columns. readdataset() reads M + 1 fields per line
# (class label + M features) and the search routines iterate over features 1..M.
# The alternative value below is presumably for the SMALL data sets - confirm
# against the chosen file before switching.
#M = 10 #Number of features for each data point
M = 100 #Number of features for each data point LARGE data set only
# Path to the data set text file; readdataset() joins it onto a base directory.
# Exactly one `file = ...` line should be active, and M must match that file.
#file = 'data/CS170_SMALLtestdata__96.txt'
#file = 'data/CS170_SMALLtestdata__108.txt'
#file = 'data/CS170_SMALLtestdata__109.txt'
#file = 'data/CS170_SMALLtestdata__110.txt'
file = 'data/CS170_LARGEtestdata__58.txt'
#file = 'data/CS170_SMALLtestdata__SAMPLE.txt'
def readdataset():
    """Load the data set named by the module-level ``file`` into a NumPy array.

    Every non-blank line is split on whitespace and its first ``M + 1`` fields
    are converted to ``float``. Fields beyond ``M + 1`` are ignored. The rest
    of this file treats column 0 as the class label and columns 1..M as the
    features.

    Returns:
        numpy.ndarray: one row per non-blank line, ``M + 1`` columns each.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a line has fewer than ``M + 1`` fields, or a field is
            not a valid float.
    """
    # NOTE: '__file__' is a quoted literal, so realpath() resolves it against
    # the current working directory, not this script's location. Kept as-is so
    # existing invocations keep finding the data in the same place.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    path = os.path.join(base_dir, file)

    width = M + 1
    rows = []
    with open(path, "r") as infile:
        for lineno, line in enumerate(infile, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if len(fields) < width:
                raise ValueError(
                    "line %d of %s has %d fields, expected at least %d"
                    % (lineno, path, len(fields), width)
                )
            rows.append([float(value) for value in fields[:width]])
    # A NumPy array makes the column slicing done elsewhere straightforward.
    return np.array(rows)
def createtestsets(data, currentfeatures, entry):
    """Build the leave-one-out split for one row, restricted to some features.

    Args:
        data: 2-D data set; column 0 holds the class label.
        currentfeatures: column indices of the features to keep.
        entry: index of the row that becomes the held-out test point.

    Returns:
        tuple: ``(traindata, testdata)`` where ``testdata`` is the 1-D array
        ``[label, feature values...]`` for row ``entry`` and ``traindata`` is
        the same column selection for every other row.
    """
    # The label column always comes first, followed by the requested features.
    columns = [0]
    columns.extend(currentfeatures)
    selected = np.asarray(data)[:, columns]

    testdata = selected[entry].copy()
    # Drop the held-out row so it cannot be its own nearest neighbour.
    traindata = np.delete(selected, entry, 0)
    return traindata, testdata
def defaultrate(data):
    """Return the default rate: the share of rows in the most common class.

    Args:
        data: 2-D array whose column 0 holds the class label.

    Returns:
        float: percentage (0-100) of rows belonging to the majority class.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        raise ValueError("cannot compute the default rate of an empty data set")
    counts = Counter(data[:, 0])
    majority_count = counts.most_common(1)[0][1]
    # Divide by the actual row count rather than a fixed 200 so the rate is
    # correct for data sets of any size.
    return (majority_count / total) * 100
def nearestneighbor(traindata, testdata):
    """Find the training row closest to ``testdata`` by Euclidean distance.

    Column 0 of both inputs is skipped (the callers in this file store the
    class label there); the distance is computed over the remaining columns.

    Args:
        traindata: 2-D NumPy array of training rows.
        testdata: 1-D NumPy array for the point being classified.

    Returns:
        list: ``[distance, index]`` for the nearest training row. Ties on
        distance are resolved in favour of the lowest index.

    Raises:
        ValueError: if ``traindata`` has no rows.
    """
    if len(traindata) == 0:
        raise ValueError("nearestneighbor needs at least one training row")
    query = testdata[1:]
    distances = [
        [np.sqrt(np.sum(np.square(query - traindata[i, 1:]))), i]
        for i in range(len(traindata))
    ]
    # Only the smallest pair is needed, so take the minimum instead of sorting
    # the whole list; lists compare element-wise (distance first, then index).
    return min(distances)
def leave_one_out_crossvalidation(data, currentfeatures, j, choice, prevnumwrong=0):
    """Estimate nearest-neighbour accuracy with leave-one-out cross validation.

    The feature set under test is ``currentfeatures`` with feature ``j`` added
    (``choice == 1``) or removed (``choice == 2``); any other ``choice`` tests
    ``currentfeatures`` unchanged. ``currentfeatures`` itself is not modified.
    Each row in turn is held out, classified by its nearest neighbour among the
    remaining rows, and compared with its own label in column 0.

    Args:
        data: 2-D array; column 0 is the class label.
        currentfeatures: feature column indices currently selected.
        j: the feature being considered for addition or removal.
        choice: 1 to add ``j``, 2 to remove ``j``.
        prevnumwrong: pruning bound. When positive, evaluation stops as soon as
            the number of misclassifications reaches this value, because the
            candidate can no longer beat the best one seen so far. ``0`` (the
            default) disables pruning.

    Returns:
        float: accuracy as a percentage of ``len(data)``. If evaluation was
        pruned early this is only a lower bound, since rows not yet visited
        are not counted as correct. Returns ``0.0`` for an empty data set.

    Raises:
        ValueError: if ``choice == 2`` and ``j`` is not in ``currentfeatures``.
    """
    # Work on a copy so the caller's feature list is left untouched.
    features = list(currentfeatures)
    if choice == 1:
        features.append(j)
    elif choice == 2:
        features.remove(j)

    total = len(data)
    if total == 0:
        return 0.0

    prune = prevnumwrong > 0
    numcorrect = 0
    numwrong = 0
    for i in range(total):
        traindata, testdata = createtestsets(data, features, i)
        closest = nearestneighbor(traindata, testdata)
        if testdata[0] == traindata[closest[1]][0]:
            numcorrect += 1
        else:
            numwrong += 1
            # ">=" rather than "==": the bound may arrive as a float carrying
            # rounding error, and an exact match would then never fire.
            if prune and numwrong >= prevnumwrong:
                break
    return (float(numcorrect) / total) * 100
def backwardsselection(data, choice):
    """Greedy backward elimination over features 1..M, printing a trace.

    Starts from the full feature set. At each level every remaining feature is
    tentatively removed, the resulting set is scored with
    ``leave_one_out_crossvalidation``, and the removal that gives the highest
    accuracy is made permanent. Runs until no features remain.

    Args:
        data: 2-D array; column 0 is the class label.
        choice: forwarded to ``leave_one_out_crossvalidation``; it must be 2
            for that function to evaluate the set with the feature removed.

    Returns:
        None. Progress and results are written to standard output.
    """
    print()
    print("Backward selection")
    print()
    currentfeatures = list(range(1, M + 1))
    for i in range(1, M + 1):
        print("On the ", i, "th level of the search tree", sep='')
        feature = None
        # Start below any real accuracy so a candidate is always chosen, even
        # when every candidate scores 0%. Starting at 0 left `feature` unset
        # in that case and made the remove() below fail.
        bestaccuracy = -1.0
        # currentfeatures stays in ascending order, so iterating a snapshot
        # visits candidates in the same order as scanning 1..M.
        for j in tuple(currentfeatures):
            accuracy = leave_one_out_crossvalidation(data, currentfeatures, j, choice)
            print("Considering removing feature", j, "with accuracy", accuracy)
            if accuracy > bestaccuracy:
                bestaccuracy = accuracy
                feature = j
        currentfeatures.remove(feature)
        print('Level ', i, ': removed feature ', feature, ' from current set with accuracy ', bestaccuracy, '%', sep='')
        print("Feature list: ", currentfeatures)
        print()
def forwardselection(data, choice):
    """Greedy forward selection over features 1..M, printing a trace.

    Starts from an empty feature set. At each level every feature not yet
    selected is tentatively added, the resulting set is scored with
    ``leave_one_out_crossvalidation``, and the addition that gives the highest
    accuracy is made permanent. Runs until all M features have been added.

    Args:
        data: 2-D array; column 0 is the class label.
        choice: forwarded to ``leave_one_out_crossvalidation``; it must be 1
            for that function to evaluate the set with the feature added.

    Returns:
        None. Progress and results are written to standard output.
    """
    print()
    print("Forward selection")
    print()
    currentfeatures = []
    for i in range(1, M + 1):
        print("On the ", i, "th level of the search tree", sep='')
        feature = None
        # Start below any real accuracy so a candidate is always chosen, even
        # when every candidate scores 0%. Starting at 0 left the placeholder
        # in `feature` and appended it to the feature list.
        bestaccuracy = -1.0
        for j in range(1, M + 1):
            if j in currentfeatures:
                continue
            accuracy = leave_one_out_crossvalidation(data, currentfeatures, j, choice)
            print("Considering adding feature", j, "with accuracy", accuracy)
            if accuracy > bestaccuracy:
                bestaccuracy = accuracy
                feature = j
        currentfeatures.append(feature)
        print('Level ', i, ': added feature ', feature, ' to current set with accuracy ', bestaccuracy, '%', sep='')
        print("Feature list: ", currentfeatures)
        print()
def dereksalgorithm(data, choice):
    """Forward selection with early abandoning of losing candidates.

    Same greedy search as forward selection, except that the number of
    misclassifications made by the best candidate of the current level is
    passed to ``leave_one_out_crossvalidation`` as a pruning bound. A later
    candidate is abandoned once it has made that many mistakes, since it can
    no longer be strictly better. The accuracy printed for an abandoned
    candidate is therefore only a partial, lower-bound figure.

    Args:
        data: 2-D array; column 0 is the class label.
        choice: forwarded to ``leave_one_out_crossvalidation``; it must be 1
            for that function to evaluate the set with the feature added.

    Returns:
        None. Progress and results are written to standard output.
    """
    print()
    print("Derek's Algorithm is Feature selection with pruning")
    print()
    total = len(data)
    currentfeatures = []
    for i in range(1, M + 1):
        print("On the ", i, "th level of the search tree", sep='')
        feature = None
        # Start below any real accuracy so a candidate is always chosen, even
        # when every candidate scores 0%.
        bestaccuracy = -1.0
        # 0 disables pruning, so the first candidate of a level is always
        # evaluated in full.
        numwrong = 0
        for j in range(1, M + 1):
            if j in currentfeatures:
                continue
            accuracy = leave_one_out_crossvalidation(data, currentfeatures, j, choice, numwrong)
            print("Considering adding feature", j, "with accuracy", accuracy)
            if accuracy > bestaccuracy:
                bestaccuracy = accuracy
                feature = j
                # Convert the best accuracy back into a whole number of
                # mistakes, using the real row count instead of a fixed 200.
                # Rounding removes float error so the bound compares cleanly
                # with the integer mistake counter.
                numwrong = int(round(total * (1 - bestaccuracy / 100)))
        currentfeatures.append(feature)
        print('Level ', i, ': added feature ', feature, ' to current set with accuracy ', bestaccuracy, '%', sep='')
        print("Feature list: ", currentfeatures)
        print()
def main():
    """Interactive entry point: load the data, ask for an algorithm, run it.

    Reads the data set, prints the menu, reads the user's choice from standard
    input, prints the default rate, and dispatches to the chosen search.
    Anything other than 1, 2 or 3 (including non-numeric input) prints an
    error message.
    """
    data = readdataset()
    print("Welcome to Derek Sayler's nearest neighbor classifier with feature selection program")
    print("1. Forward Selection")
    print("2. Backwards Selection")
    print("3. Derek's Custom Algorithm")
    print()
    print("Enter which algorithm you want")
    try:
        choice = int(input())
    except ValueError:
        # Non-numeric input is handled as an invalid option instead of crashing.
        choice = None
    print("Default rate is ", defaultrate(data), "%", sep='')
    # Compare by value: "is" tests object identity, which only happens to work
    # for small ints in CPython and triggers a SyntaxWarning on Python 3.8+.
    if choice == 1:
        forwardselection(data, choice)
    elif choice == 2:
        backwardsselection(data, choice)
    elif choice == 3:
        # The custom algorithm is a forward search, so it evaluates candidates
        # in "add feature" mode (1).
        dereksalgorithm(data, 1)
    else:
        print('ERROR: Invalid option')
# Script entry point: main() is called unconditionally at module level, so the
# interactive menu also starts if this file is imported rather than run.
main()