-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata2cv.py
104 lines (86 loc) · 2.65 KB
/
data2cv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from dataset import *
import time
import cPickle
def get_idx(snum, filter_h=5, max_l=100):
    """Return a fixed-length, zero-padded copy of one sentence's values.

    The sequence is truncated to its first ``max_l`` entries, preceded by
    ``pad = int(filter_h / 2)`` zeros and then right-padded with zeros until
    the result holds ``max_l + 2 * pad`` entries.

    Args:
        snum: Sequence of per-token values; must support ``len`` and slicing.
        filter_h: Filter height; only used to derive the padding width.
        max_l: Maximum number of entries of ``snum`` that are kept.

    Returns:
        A new list ``[0] * pad + snum[:max_l] + [0, ...]`` of length
        ``max_l + 2 * pad`` (for non-negative ``pad`` and ``max_l``).
    """
    pad = int(filter_h / 2)
    x = [0] * pad
    # One slice covers both the "short sentence" and the "truncate" case and
    # avoids the Python-2-only ``xrange``.  ``max(max_l, 0)`` keeps a
    # non-positive ``max_l`` from being read as a from-the-end slice bound.
    x.extend(snum[:max(max_l, 0)])
    # Padding the end of the sentence; a non-positive count adds nothing.
    x.extend([0] * (max_l + 2 * pad - len(x)))
    return x
def get_pf(sentLen, allIndice, filter_h=5, max_l=100):
    """Build two zero-padded position-feature sequences for one sentence.

    For every kept token index ``i`` (``0 <= i < min(sentLen, max_l)``) the
    raw feature is ``i - allIndice[k] + 1 + 51``, clamped to ``[1, 101]``,
    once for ``k = 0`` and once for ``k = 1``.  Each sequence is preceded by
    ``pad = int(filter_h / 2)`` zeros and right-padded with zeros up to
    ``max_l + 2 * pad`` entries, mirroring the layout produced by
    ``get_idx``.

    Args:
        sentLen: Number of tokens in the sentence.
        allIndice: Indexable holding two reference positions
            (presumably the two entity positions -- confirm with caller).
        filter_h: Filter height; only used to derive the padding width.
        max_l: Maximum number of token positions that are kept.

    Returns:
        ``[x1, x2]``: two lists of equal length holding the padded features
        relative to ``allIndice[0]`` and ``allIndice[1]`` respectively.
    """
    # Constants that were hard-coded in several places; values unchanged.
    offset = 1 + 51
    lowest, highest = 1, 101

    index = np.arange(min(sentLen, max_l))
    # np.clip performs the same clamp the per-element loop did, in one call.
    pf1 = np.clip(index - allIndice[0] + offset, lowest, highest)
    pf2 = np.clip(index - allIndice[1] + offset, lowest, highest)

    pad = int(filter_h / 2)
    x1 = [0] * pad
    x2 = [0] * pad
    x1.extend(pf1)
    x2.extend(pf2)

    # Padding the end of the sentence; both lists always share one length,
    # and a non-positive count adds nothing.
    tail = [0] * (max_l + 2 * pad - len(x1))
    x1.extend(tail)
    x2.extend(tail)
    return [x1, x2]
def make_idx_data_cv(data, filter_h, max_l):
    """Convert every bag in ``data`` to its padded, fixed-length form.

    For each bag, every sentence is passed through ``get_idx`` and its
    matching entry of ``positions`` through ``get_pf``; the results are
    wrapped in a new ``InstanceBag`` together with the bag's unchanged
    ``entities``, ``rel``, ``num`` and ``entitiesPos``.

    Args:
        data: Iterable of bag objects exposing ``entities``, ``rel``,
            ``num``, ``sentences``, ``positions`` and ``entitiesPos``.
        filter_h: Filter height forwarded to ``get_idx`` / ``get_pf``.
        max_l: Maximum sentence length forwarded to ``get_idx`` / ``get_pf``.

    Returns:
        A list with one new ``InstanceBag`` per input bag, in input order.
    """
    converted = []
    for bag in data:
        entities, rel, num = bag.entities, bag.rel, bag.num
        sentences, positions = bag.sentences, bag.positions
        entities_pos = bag.entitiesPos

        padded_sents = []
        pos_feats = []
        # positions[k] is looked up by index so a too-short ``positions``
        # still fails loudly instead of being silently truncated.
        for k, sent in enumerate(sentences):
            padded_sents.append(get_idx(sent, filter_h, max_l))
            pos_feats.append(get_pf(len(sent), positions[k], filter_h, max_l))

        converted.append(
            InstanceBag(entities, rel, num, padded_sents, pos_feats,
                        entities_pos)
        )
    return converted
if __name__ == "__main__":
    # Script entry point: load the raw pickled bags, convert them to padded
    # fixed-length form and dump the results, printing a timestamp around
    # each conversion step.
    print("load test and train raw data...")
    # Pickle data is binary: open with 'rb'/'wb', and use ``with`` so every
    # handle is closed even when loading or dumping raises.
    # NOTE: unpickling executes arbitrary code -- only load trusted files.
    with open('test_len_60_gap_40.p', 'rb') as f:
        testData = cPickle.load(f)
    # NOTE(review): the training set is read from the *test* pickle, so
    # 'train_3_60.p' ends up holding converted test data.  This looks like a
    # copy-paste slip -- confirm the intended training file name and fix.
    with open('test_len_60_gap_40.p', 'rb') as f:
        trainData = cPickle.load(f)

    sentence_len = 60
    max_filter_len = 3

    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 0 time: ' + '\t\t' + str(now))
    test = make_idx_data_cv(testData, max_filter_len, sentence_len)

    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 1 time: ' + '\t\t' + str(now))
    train = make_idx_data_cv(trainData, max_filter_len, sentence_len)

    now = time.strftime("%Y-%m-%d %H:%M:%S")
    print('point 2 time: ' + '\t\t' + str(now))

    # Protocol -1 selects the highest (binary) pickle protocol, which
    # requires a file opened in binary mode.
    with open('test_3_60.p', 'wb') as f:
        cPickle.dump(test, f, -1)
    with open('train_3_60.p', 'wb') as f:
        cPickle.dump(train, f, -1)