-
Notifications
You must be signed in to change notification settings - Fork 2
/
gold2db.py
211 lines (190 loc) · 7.98 KB
/
gold2db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#export PYTHONPATH=~/svn/pydelphin
# python3 gold2db.py
##
## takes two parameters -- directory with the xml and database
##
## Actually does the lexicon too :-)
##
## ToDo:
## * add mrs in error log
##
import sqlite3, sys, re, os
from collections import defaultdict as dd
from delphin import itsdb
import delphin.mrs
import delphin.derivation
import delphin.mrs.xmrs
import delphin.mrs.simplemrs
import json
# Command line: gold2db.py <grammar directory> <LTDB database file>.
# Require exactly two arguments: the old `< 3` test let extra arguments
# through, and the three-way unpack below then died with a ValueError
# instead of showing the usage message.
if len(sys.argv) != 3:
    # usage errors go to stderr
    print('You need to give two arguments, '
          'grammar directory and LTDB', file=sys.stderr)
    sys.exit(1)
script, grmdir, dbfile = sys.argv
# Open the LTDB database; `c` is the cursor every later statement goes through.
conn = sqlite3.connect(dbfile)
c = conn.cursor()

# Lookup tables built from the `lex` table.
ltypes = dd(str)   # ltypes[lexid] = typ
lorth = dd(str)    # lorth[lexid] = orth
lfreq = dd(int)    # lfreq[typ] = number of `lex` rows with that typ
lex = dd(set)      # lex[typ] = {lexid, ...}

for entry_id, entry_type, entry_orth in c.execute("select lexid, typ, orth FROM lex"):
    ltypes[entry_id] = entry_type
    lorth[entry_id] = entry_orth
    lfreq[entry_type] += 1
    lex[entry_type].add(entry_id)
# Patterns applied to the derivation string of each result row.
#
# mroot: the name that opens the derivation, i.e. "(<name> (" at the very
#        start.  The class used to read `A-z`, which also admits the
#        punctuation between 'Z' and 'a' ([ \ ] ^ `); it is now `A-Z`.
mroot = re.compile(r'^\(([-a-zA-Z0-9_+]+?)\s+\(')
# mrule: every node "(<id> <name> <score> <from> <to> " -> (name, from, to).
#        This also matches lexical nodes, so callers filter those out.
mrule = re.compile(r'\([0-9]+ ([^ ]+) [-0-9.]+ ([0-9]+) ([0-9]+) ')
# mlex: nodes immediately followed by a quoted form -> (name, surface form).
mlex = re.compile(r'\([0-9]+ ([^ ]+) [-0-9.]+ [0-9]+ [0-9]+ \("(.*?)" ')
### make a log in the same directory as the database
# The log receives raw corpus sentences and MRS strings, so pin the encoding
# to UTF-8 rather than relying on the locale default (which can raise
# UnicodeEncodeError on non-ASCII text).
log = open(os.path.join(os.path.dirname(dbfile), "gold.log"), 'w',
           encoding='utf-8')
# Gold profiles are searched for under <grammar directory>/tsdb/gold.
golddir = '%s/tsdb/gold' % grmdir
# Corpus statistics gathered while walking the gold profiles.
typefreq = dd(int)                 # typefreq[type] = freq
lexfreq = dd(lambda: dd(int))      # lexfreq[lexid][surf] = freq
lxidfreq = dd(lambda: dd(int))     # lxidfreq[typ][lexid] = freq
typind = dd(lambda: dd(set))       # typind[type][sid] = {(frm, to), ...}
sent = dd(list)                    # sent[sid] = [(surf, lexid), ...]
pname = {}                         # pname[sid] = profile
roots = dd(lambda: 'rootless')     # roots[sid] = root name, 'rootless' if none was found
allroots = set()                   # every root name seen
# One row of the `gold` table per result row of a profile.
_GOLD_INSERT = """INSERT INTO gold (sid, sent, comment,
deriv, deriv_json, pst,
mrs, mrs_json, dmrs_json, flags)
VALUES (?,?,?,?,?,?,?,?,?,?)"""

# Walk every directory below golddir; each one holding a `result` table is
# read as a profile, stored in `gold`, and tallied into the accumulators.
for root, dirs, files in os.walk(golddir):
    ### find valid profiles
    if 'result' not in files and 'result.gz' not in files:
        continue
    print("Processing %s" % root, file=sys.stderr)
    profile = itsdb.ItsdbProfile(root)
    profname = os.path.basename(root)

    # items[i-id] = (input sentence, comment)
    # NOTE(review): items is keyed by i-id but looked up below with the
    # result row's parse-id -- this assumes the two coincide; confirm.
    items = {item['i-id']: (item['i-input'], item['i-comment'])
             for item in profile.read_table('item')}

    for row in profile.read_table('result'):
        pid = row['parse-id']
        pname[pid] = profname
        deriv = row['derivation']  # DERIVATION TREE
        tree = delphin.derivation.Derivation.from_string(deriv)
        deriv_json = tree.to_dict(
            fields=['id', 'entity', 'score', 'form', 'tokens'])
        mrs_string = row['mrs']

        # Convert the MRS; on any failure log it and store empty JSON instead.
        try:
            mrs_obj = delphin.mrs.simplemrs.loads(
                mrs_string, single=True, version=1.1, errors='strict')
            mrs_json = delphin.mrs.xmrs.Mrs.to_dict(mrs_obj)
            dmrs_json = delphin.mrs.xmrs.Dmrs.to_dict(mrs_obj)
        except Exception as e:
            log.write("\n\nMRS failed to convert in pydelphin:\n")
            log.write("{}: {}\n".format(root, pid))
            log.write(items[pid][0])
            log.write("\n\n")
            log.write(str(mrs_string))
            log.write("\n\n")
            log.write(e.message if hasattr(e, 'message') else str(e))
            log.write("\n\n")
            mrs_json = {}
            dmrs_json = {}

        # STORE gold info IN DB
        gold_row = (pid, items[pid][0], items[pid][1],
                    deriv, json.dumps(deriv_json), None,
                    mrs_string, json.dumps(mrs_json),
                    json.dumps(dmrs_json), None)
        try:
            c.execute(_GOLD_INSERT, gold_row)
        except sqlite3.Error as e:
            log.write('ERROR: ({}) of type ({}), {}: {}\n'.format(
                e, type(e).__name__, root, pid))
            # a row that could not be stored is left out of the statistics
            continue

        ### ToDo use pydelphin to walk down tree
        ### leaves
        lexids = set()
        # wid is the position of the leaf in the sentence; it advances for
        # every leaf, whether or not the lexid has a known type.
        for wid, (lexid, surf) in enumerate(mlex.findall(deriv)):
            lexids.add(lexid)
            lexfreq[lexid][surf] += 1
            sent[pid].append((surf, lexid))
            ltype = ltypes[lexid]
            if ltype:
                typefreq[ltype] += 1
                lxidfreq[ltype][lexid] += 1
                typind[ltype][pid].add((wid, wid + 1))

        ### rules (store as type)
        for typ, frm, to in mrule.findall(deriv):
            if typ in lexids:  ## counted these!
                continue
            typefreq[typ] += 1
            typind[typ][pid].add((frm, to))

        ### Root (treat as another type)
        root_match = mroot.search(deriv)
        if root_match:
            roots[pid] = root_match.group(1)
### each sentence should have a root
# Sentences for which no root was matched fall back to the default of
# `roots`.  The root is indexed as a type spanning the whole sentence.
for sid, tokens in sent.items():
    root_name = roots[sid]
    allroots.add(root_name)
    typind[root_name][sid].add((0, len(tokens)))
    typefreq[root_name] += 1
### calculate the lexical type frequencies
# One `ltypes` row per lexical type seen in the corpus: up to three example
# entries, the number of lexicon entries of that type, and its corpus count.
_LTYPES_INSERT = """INSERT INTO ltypes
(typ, words, lfreq, cfreq)
VALUES (?,?,?,?)"""

for typ in lxidfreq:
    ## get three most frequent words in corpus
    by_corpus_freq = sorted(lxidfreq[typ], key=lxidfreq[typ].get,
                            reverse=True)
    words = []
    for lexid in by_corpus_freq:
        if lorth[lexid]:
            words.append(lexid)
        if len(words) > 2:
            break

    if len(words) < 3:
        ### if less than three examples in the corpus
        ### add more from the lexicon
        # lex[typ] is a set; iterate it sorted so the chosen examples are
        # the same on every run instead of following string-hash order.
        for lexid in sorted(lex[typ]):
            if lorth[lexid] and lexid not in words:
                words.append(lexid)
            if len(words) > 2:
                break

    ### lexid<TAB>freq<TAB>orthography, one example per line
    wrds = '\n'.join("%s\t%d\t%s" % (lexid,
                                     lxidfreq[typ][lexid],
                                     lorth[lexid])
                     for lexid in words)
    c.execute(_LTYPES_INSERT, (typ, wrds, lfreq[typ], typefreq[typ]))
### Wack these into a database
# Flush the in-memory accumulators to their tables, commit once, then
# release the connection and the log file (neither was closed before).

# typfreq: corpus frequency of every type (lexical types, rules, roots)
c.executemany("""INSERT INTO typfreq (typ, freq)
VALUES (?,?)""", typefreq.items())

# lexfreq: frequency of each surface form per lexical entry
c.executemany("""INSERT INTO lexfreq (lexid, word, freq)
VALUES (?,?,?)""",
              ((lexid, word, freq)
               for lexid, by_word in lexfreq.items()
               for word, freq in by_word.items()))

# sent: the words of each sentence, in order, with their lexical entry
c.executemany("""INSERT INTO sent (profile, sid, wid, word, lexid)
VALUES (?,?,?,?,?)""",
              ((pname[sid], sid, wid, word, lexid)
               for sid, tokens in sent.items()
               for wid, (word, lexid) in enumerate(tokens)))

# typind: for each type, the spans it covers in each sentence; the span
# start and end go into the kara and made columns
c.executemany("""INSERT INTO typind (typ, sid, kara, made)
VALUES (?,?,?,?)""",
              ((typ, sid, start, end)
               for typ, by_sentence in typind.items()
               for sid, spans in by_sentence.items()
               for start, end in spans))

conn.commit()
conn.close()
log.close()