# logisticRegression.py
from pyexcel_ods import get_data
import json
import ast
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
import nltk
import string
import numpy
import math

global_essay_list = []
global_marks_list = []
number_classes = 0
global_word_list = []
all_vector_list = []
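
# Overall pipeline: read essays and marks from an ODS sheet, reduce each essay
# to a binary bag-of-important-words vector (nouns/adjectives/verbs, stemmed),
# train one-vs-all regularized logistic regression by batch gradient descent,
# and report exact-match accuracy on a held-out split.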

# Drop tokens containing "@", "'" or "/" (e.g. anonymized placeholders such
# as @CAPS1) and lowercase everything else.
def removeAt(text):
    a = text.split()
    b = []
    for each in a:
        if "@" not in each and "'" not in each and "/" not in each:
            b.append(each.lower())
    return " ".join(b)

# Remove punctuation tokens and stop words.
def cleanText(text):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    words = [w for w in words if w not in string.punctuation]
    cleaned = [w for w in words if w not in stop_words]
    return " ".join(cleaned)
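
# For example (hypothetical input): cleanText("this is , frankly , a test")
# drops the punctuation tokens and the stop words "this", "is" and "a",
# returning "frankly test".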

# Read the ODS sheet and populate the global essay and marks lists.
def readODS(filename):
    data = get_data(filename)
    a = json.dumps(data)
    a = ast.literal_eval(a)
    lists = a['Sheet1']
    # Row 0 is assumed to be a header; column 2 holds the essay text and
    # column 6 the mark.
    for i in range(1, len(lists)):
        clean_essay = cleanText(removeAt(lists[i][2]))
        global_essay_list.append(clean_essay)
        global_marks_list.append(lists[i][6])
    global number_classes
    # Marks are assumed to run 0..max, so max+1 classifiers are needed
    # (the original max alone left the top mark without a classifier).
    number_classes = max(global_marks_list) + 1

# Stem a word with the Snowball stemmer.
def wordStemmingSnowball(word):
    stemmer = SnowballStemmer("english")
    return str(stemmer.stem(word))

# Stem a word with the Porter stemmer.
def wordStemmingPorter(word):
    stemmer = PorterStemmer()
    return str(stemmer.stem(word))

# Keep only nouns, adjectives and verbs. Note that POS tagging runs on the
# stemmed tokens, so some tags may be unreliable.
def keepImportant(sentence):
    text = nltk.word_tokenize(sentence)
    stemmed_list = [wordStemmingPorter(each) for each in text]
    important_words = []
    pos_list = nltk.pos_tag(stemmed_list)
    for every in pos_list:
        if every[1] in ('NN', 'JJ', 'VB', 'VBP', 'VBD'):
            important_words.append(every[0])
    return important_words

# Generate the global vocabulary of important words across all essays.
def generateGlobalList(global_essay_list):
    for each in global_essay_list:
        imp_words = keepImportant(each)
        for every in imp_words:
            if every not in global_word_list:
                global_word_list.append(every)

# Encode a sentence as a binary bag-of-words vector over the global vocabulary.
def generateVector(sentence):
    temp_vector = []
    # Insert 1 at the beginning of each vector as the bias term x0.
    temp_vector.append(1)
    imp_words = keepImportant(sentence)
    for each in global_word_list:
        if each in imp_words:
            temp_vector.append(1)
        else:
            temp_vector.append(0)
    return temp_vector
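
# Sketch of the encoding (with a toy vocabulary, not the real one): if
# global_word_list were ['comput', 'essay', 'write'], an essay whose important
# words are ['essay'] would encode as [1, 0, 1, 0] -- a leading 1 for x0,
# then one 0/1 per vocabulary word.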

# Build the design matrix: one feature vector per essay.
def generateAllVectors(global_essay_list):
    for each in global_essay_list:
        temp_vector = generateVector(each)
        all_vector_list.append(temp_vector)

# Helper: return a list of n zeros.
def zerolistmaker(n):
    listofzeros = [0] * n
    return listofzeros

# Elementwise sigmoid 1 / (1 + exp(-x)). Takes a numpy matrix and returns a
# matrix of the same shape.
def sigmoid(x):
    k = numpy.exp(numpy.multiply(x, -1))
    k = k + 1
    k = numpy.divide(1, k)
    return k
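
# Quick sanity check (doctest-style sketch):
# >>> sigmoid(numpy.matrix([[0.0, 2.0, -2.0]]))
# matrix([[0.5       , 0.88079708, 0.11920292]])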

# Regularized logistic-regression cost. theta may be a flat list or a 1 x n
# matrix; X is m x n, y is a length-m label vector (or 1 x m matrix).
def lrCostFunction(theta, X, y, lambdas):
    theta = numpy.asarray(theta).ravel().tolist()
    theta_matrix = numpy.matrix(theta)
    X_matrix = numpy.matrix(X)
    y_matrix = numpy.matrix(y)
    m = y_matrix.shape[1]
    # Hypothesis h = sigmoid(X * theta^T), an m x 1 column of probabilities.
    mult_matrix = sigmoid(X_matrix * theta_matrix.T)
    J = (y_matrix * numpy.log(mult_matrix)) + ((1 - y_matrix) * numpy.log(1 - mult_matrix))
    J = (float(J[0, 0]) * -1) / m
    # Regularize every parameter except the bias theta[0].
    p = 0
    for i in range(1, len(theta)):
        p = p + math.pow(theta[i], 2)
    J = J + (lambdas / (2 * m)) * p
    return J
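
# The quantity computed above is the standard regularized cross-entropy loss:
#   J(theta) = -(1/m) * sum_i [ y_i*log(h_i) + (1 - y_i)*log(1 - h_i) ]
#              + (lambda/(2m)) * sum_{j>=1} theta_j^2
# with h = sigmoid(X * theta^T); the bias theta_0 is left unregularized.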

# Batch gradient descent on the regularized logistic cost.
def gradientCalculation(theta, X, y, lambdas, num_iter, alpha):
    theta_matrix = numpy.matrix(theta)    # 1 x n
    X_matrix = numpy.matrix(X)            # m x n
    y_matrix = numpy.matrix(y)            # 1 x m
    m = y_matrix.shape[1]
    grad = zerolistmaker(theta_matrix.shape[1])
    for j in range(0, num_iter):
        print("iteration", j)
        mult_matrix = sigmoid(X_matrix * theta_matrix.T)    # m x 1
        error = mult_matrix - y_matrix.T                    # m x 1
        # The bias gradient is unregularized; every other parameter adds
        # (lambda/m) * theta_i, taken from the *current* theta_matrix
        # (the original read the stale initial theta here).
        grad[0] = float((error.T * X_matrix[:, 0])[0, 0]) / m
        for i in range(1, len(grad)):
            grad[i] = (float((error.T * X_matrix[:, i])[0, 0]) / m
                       + (lambdas / m) * theta_matrix[0, i])
        theta_matrix = theta_matrix - (numpy.matrix(grad) * alpha)
        print(lrCostFunction(theta_matrix, X, y, lambdas))
    return numpy.array(theta_matrix)[0].tolist()
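
# Each pass above is the usual simultaneous update
#   theta := theta - alpha * grad(J)
# and printing lrCostFunction every iteration gives a cheap check that the
# cost is actually decreasing.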

# Train one binary classifier per class (one-vs-all).
def oneVsAll(X, y, num_labels, lambdas, initial_theta):
    y_matrix = numpy.matrix(y)
    final_matrix = []
    for i in range(0, num_labels):
        print("current class is", i)
        # Recode labels as 1 for class i, 0 otherwise.
        a = gradientCalculation(initial_theta, X, y_matrix == i, lambdas, 200, 0.01)
        final_matrix.append(a)
    return final_matrix
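
# The result is a num_labels x (n+1) parameter table: row i scores membership
# in class i, so a new essay can be scored against every class at once.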

# Predict the class of a single feature vector: pick the class whose
# classifier assigns the highest score.
def predictValue(final_theta, X):
    final_theta_matrix = numpy.matrix(final_theta)
    X_matrix = numpy.matrix(X)
    mult = sigmoid(X_matrix * final_theta_matrix.T)
    mult_list = numpy.array(mult)[0].tolist()
    return mult_list.index(max(mult_list))
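
# Usage sketch (hypothetical essay text):
#   predictValue(final_theta, generateVector("some essay text"))
# returns the index of the winning class, i.e. the predicted mark.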

# Report exact-match ("point to point") accuracy on a held-out set.
def checkAccuracy(final_theta, essay_list, marks_list):
    count = 0
    for i in range(0, len(essay_list)):
        pred = predictValue(final_theta, generateVector(essay_list[i]))
        print("predicted value is:", pred, "and actual value is", marks_list[i])
        if pred == marks_list[i]:
            count = count + 1
    print("accurate:", count)
    print("total:", len(marks_list))
    print("point to point accuracy is:", (count / len(marks_list)) * 100, "%")

def main():
    readODS("set1_ml.ods")
    print("Reading Complete")
    # Train/test split: the first 1550 essays train, the rest test.
    global_train_essay = global_essay_list[:1550]
    global_train_marks = global_marks_list[:1550]
    global_test_essay = global_essay_list[1550:]
    global_test_marks = global_marks_list[1550:]
    # Note: the vocabulary is built over all essays, including the test set.
    generateGlobalList(global_essay_list)
    print("Generated Global List with size", len(global_word_list))
    generateAllVectors(global_train_essay)
    print("Generated all Vectors")
    # One parameter per vocabulary word, plus the bias theta_0.
    theta_vector = zerolistmaker(len(global_word_list) + 1)
    global number_classes
    final_theta = oneVsAll(all_vector_list, global_train_marks, number_classes, 0.1, theta_vector)
    print("Generated Final Theta")
    checkAccuracy(final_theta, global_test_essay, global_test_marks)

if __name__ == "__main__":
    main()