-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcel_exp.py
96 lines (82 loc) · 2.74 KB
/
cel_exp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
OA
from gensim.modeOAls import Word2Vec
from sys import argv
from gensim.models import Doc2Vec
import logging
from sys import argv
from gensim.models import Word2Vec
from numpy import array, float32 as REAL,dot
import numpy as np
from sklearn.neighbors import KDTree
from gensim import utils, matutils
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics import log_loss as loss
from sklearn.metrics import roc_auc_score as auc
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings("ignore")
EPOCHS = 21
#MODEL_DIR = "/lustre/amar/Word2VecModels"
MODEL_DIR = "/lustre/amar/office_models"
def infer_bug_vector(words, dimensions, model, algorithm):
if algorithm in ['pvdm', 'dbow']:
return model.infer_vector(words)
result = array([0.0]*int(dimensions))
count = 0
for word in words:
if word in model:
result+=model[word]
count+=1
return result/count
def val(algorithm, dimensions, test_filename, epoch):
test_file = file(test_filename)
for line in test_file:
break #metaline
model_filename = MODEL_DIR + "/" + algorithm + "_model_dimensions_"+str(dimensions)+"_epoch_%s.word2vec"%epoch
model = Word2Vec.load(model_filename)
slave_vectors = []
master_vectors = []
total_vectors = []
actual_labels = []
pred_labels = []
c = 0
s = 0
m = 0
for line in test_file:
c+=1
#try:
line = line.strip().split("\t")
#slave = matutils.unitvec(array(infer_bug_vector(line[1].split(" ") + line[2].split(" "), int(dimensions)))).astype(REAL)
#master = matutils.unitvec(array(infer_bug_vector(line[5].split(" ") + line[6].split(" "), int(dimensions)))).astype(REAL)
slave_vec = list(infer_bug_vector(line[1].split(" ") + line[2].split(" "), int(dimensions), model, algorithm))
master_vec = list(infer_bug_vector(line[3].split(" ") + line[4].split(" "), int(dimensions), model, algorithm))
if (np.isnan(slave_vec).any()):
s+=1
elif(np.isnan(master_vec).any()):
m+=1
else:
label = abs(cs(slave_vec, master_vec)[0][0])
actual_labels.append(int(line[5]))
pred_labels.append(label)
#print label
#except ValueError:
# print "Val E"
#except AttributeError:
# print "Att Error"
print c,s,m
print len(actual_labels), len(pred_labels)
result = auc(actual_labels, pred_labels)
result_file = file("OfficeValAucResults_"+ algorithm +".txt","a+")
result = algorithm + "\t" +str(dimensions) + "\t" + str(epoch)+ "\t" + str(result)+"\n"
result_file.write(result)
result_file.close()
test_file.close()
print s, m
def __main__():
algorithm = argv[1]
dimensions = [50, 100, 150, 200, 250]
test_file = argv[2]
for d in dimensions:
for i in xrange(1, EPOCHS, 1):
val(algorithm, str(d), test_file, str(i))
__main__()