-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathfix_freebase_literal_format.py
47 lines (42 loc) · 1.31 KB
/
fix_freebase_literal_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import gzip
# Rewrites a gzip-compressed Freebase RDF dump so that the object of every
# triple whose predicate is listed in numeric_properties.txt carries an
# explicit XML Schema datatype suffix ("..."^^<datatype>).

in_file = "freebase-rdf-latest.gz"
out_file = "freebase-rdf-latest-literal_fixed.gz"

# Literal type name (as it appears in numeric_properties.txt) -> datatype IRI
# appended after "^^".
datatype_string = {
    "type.int": "<http://www.w3.org/2001/XMLSchema#integer>",
    "type.float": "<http://www.w3.org/2001/XMLSchema#float>",
    "type.boolean": "<http://www.w3.org/2001/XMLSchema#boolean>",
}


def load_type_map(path="numeric_properties.txt"):
    """Read the predicate -> datatype IRI mapping from a two-column file.

    Each non-blank line of *path* must be ``<predicate>\\t<type name>``,
    where the type name is a key of ``datatype_string``.

    Returns:
        dict mapping each predicate name to its datatype IRI string.

    Raises:
        ValueError: a non-blank line does not have exactly two
            tab-separated columns.
        KeyError: the type name is not present in ``datatype_string``.
    """
    type_map = {}
    with open(path, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing the unpack.
                continue
            pred, type_name = line.split("\t")
            type_map[pred] = datatype_string[type_name]
    return type_map


def fix_literal_line(line, type_map):
    """Return *line* with a datatype suffix added to its object if needed.

    *line* is one raw line of the dump: four tab-separated fields (subject,
    predicate, object, and a trailing remainder that is written back
    untouched). The predicate's local name is the text after its last "/"
    with the final character (presumably the closing ">") dropped; it is
    looked up in *type_map*.

    The line is returned unchanged when it does not have exactly four
    fields, when the predicate is not in *type_map*, or when the object
    already contains "^^". Otherwise the object is wrapped in double quotes
    (unless it already contains one) and "^^<datatype>" is appended.
    """
    fields = line.split("\t")
    if len(fields) != 4:
        # Not a recognisable triple line (e.g. blank): copy it through as is.
        return line
    subj, pred, obj, rest = fields

    pred_name = pred[pred.rfind("/") + 1:-1]
    datatype = type_map.get(pred_name)
    if datatype is None or "^^" in obj:
        return line

    if '"' not in obj:
        obj = '"' + obj + '"'
    return "\t".join([subj, pred, obj + "^^" + datatype, rest])


def main():
    """Stream ``in_file`` to ``out_file``, fixing literals line by line."""
    type_map = load_type_map()
    # Text mode is required on Python 3: the default gzip modes yield bytes,
    # which cannot be split on a str tab. newline="\n" disables newline
    # translation and surrogateescape round-trips undecodable bytes, so
    # untouched lines are copied through byte-for-byte.
    text_opts = {"encoding": "utf-8", "errors": "surrogateescape", "newline": "\n"}
    with gzip.open(in_file, "rt", **text_opts) as f_in, \
            gzip.open(out_file, "wt", **text_opts) as f_out:
        for line_num, line in enumerate(f_in, 1):
            f_out.write(fix_literal_line(line, type_map))
            # Progress indicator for the very large dump.
            if line_num % 1000000 == 0:
                print(line_num)


if __name__ == "__main__":
    main()