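"""Preprocess pickled WikiText-103 splits into section-level segments.

For each document, the text is split at level-2 headings ('= = Heading = =')
and four parallel sidecar files are written next to each .pkl: .txt (segment
text), .txt.docid (Wikidata id of the source document), .txt.sec (matched
heading), and .txt.dom (';'-joined P31/P279 domain entity ids looked up in a
SLING knowledge base).
"""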
import os
import pickle
import re
import time
from collections import defaultdict
from typing import List, Tuple, Optional

import numpy as np
import tqdm

import sling

class SlingExtractor(object):

    def load_kb(self, root_dir: str = 'local/data/e/wiki'):
        """Load the SLING knowledge base and its English phrase table."""
        print('loading and indexing kb ...')
        start = time.time()
        self.kb = sling.Store()
        self.kb.load(os.path.join(root_dir, 'kb.sling'))
        self.phrase = sling.PhraseTable(self.kb, os.path.join(root_dir, 'en', 'phrase-table.repo'))
        self.kb.freeze()
        self.extract_property_names()
        print('loading took', (time.time() - start), 'secs')
    def extract_property_names(self):
        print('storing property names ...')
        start = time.time()
        self.property_names = defaultdict(list)
        for frame in self.kb:
            if 'id' in frame and frame.id.startswith('P'):
                self.property_names[frame.id].append(frame.name)
        print('found', len(self.property_names), 'properties')
        print('took', (time.time() - start), 'sec')
    @staticmethod
    def get_frame_id(frame):
        if 'id' in frame:
            return frame.id
        if 'is' in frame:
            if not isinstance(frame['is'], sling.Frame):
                return None
            if 'id' in frame['is']:
                return frame['is'].id
        return None
    @staticmethod
    def get_date_property(prop, tail):
        # Only properties whose declared target type is /w/time qualify.
        if 'target' not in prop:
            return None
        if prop.target.id != '/w/time':
            return None
        prop_id = SlingExtractor.get_frame_id(prop)
        if isinstance(tail, int):
            return (prop_id, tail)
        if isinstance(tail, sling.Frame) and 'is' in tail and isinstance(tail['is'], int):
            return (prop_id, tail['is'])
        return None
    @staticmethod
    def get_canonical_property(prop, tail):
        if not isinstance(prop, sling.Frame) or not isinstance(tail, sling.Frame):
            return None
        prop_id = SlingExtractor.get_frame_id(prop)
        tail_id = SlingExtractor.get_frame_id(tail)
        if prop_id is None or tail_id is None:
            return None
        if not prop_id.startswith('P') or not tail_id.startswith('Q'):
            return None
        return (prop_id, tail_id)
    def get_type(self, wid) -> Optional[str]:
        # Try instance-of (P31) first, then subclass-of (P279).
        for type_prop in ['P31', 'P279']:
            try:
                return self.kb[self.kb[wid][type_prop].id].name
            except Exception:
                pass
        return None

    def get_name(self, wid) -> str:
        return self.kb[wid].name
    def iter_property(self, wid: str, prop_type: str = 'can', shuffle: bool = False):
        """Collect (property id, [tails]) pairs for an entity.

        prop_type 'can' keeps entity-valued statements, 'time' keeps
        date-valued ones; an optional self.filter set restricts properties.
        """
        tup_li: List[Tuple] = []
        if self.kb[wid] is None:
            return []
        for prop, tail in self.kb[wid]:
            tup = self.get_canonical_property(prop, tail)
            if tup is not None and prop_type == 'can':
                if not hasattr(self, 'filter') or tup[0] in self.filter:
                    tup_li.append(tup)
                continue
            if tup is None:
                tup = self.get_date_property(prop, tail)
                if tup is not None and prop_type == 'time':
                    if not hasattr(self, 'filter') or tup[0] in self.filter:
                        tup_li.append(tup)
                    continue
        # Group tails by property id.
        group = defaultdict(list)
        for k, v in tup_li:
            group[k].append(v)
        result = list(group.items())
        if shuffle:
            np.random.shuffle(result)
        return result
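
# A minimal usage sketch (illustrative; '/path/to/sling/wiki' is a
# hypothetical KB location). iter_property returns (property id, tail list)
# pairs grouped by property; e.g. for Barack Obama (Q76), instance-of (P31)
# would map to human (Q5):
#
#   se = SlingExtractor()
#   se.load_kb(root_dir='/path/to/sling/wiki')
#   se.get_name('Q76')                        # 'Barack Obama'
#   se.iter_property('Q76', prop_type='can')  # e.g. [('P31', ['Q5']), ...]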

def load_dbpedia_mapping():
    mapping = {}
    with open('examples/language_model/wikitext103_seg/instance-types_specific.ttl', encoding='utf-8') as mapping_f:
        for line in mapping_f:
            ls = line.strip().split()
            if len(ls) == 4:
                qid = ls[0].rstrip('>').split('/')[-1]
                category = ls[2].rstrip('>').split('/')[-1]
                mapping[qid] = category
    return mapping
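
# The .ttl file is expected to hold N-Triples-style lines with four
# whitespace-separated fields (subject, predicate, object, '.'); an
# illustrative DBpedia instance-types line (not taken from the repo):
#
#   <http://dbpedia.org/resource/Autism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Disease> .
#
# from which the loop above keeps the trailing path components,
# i.e. mapping['Autism'] = 'Disease'.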

# mapping = load_dbpedia_mapping()

# Module-level KB handle (note: root_dir below is a machine-specific path).
se = SlingExtractor()
se.load_kb(root_dir='/home/fangzhex/sling_data')

def process_splits(pkl_path):
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    # WikiText-103 level-2 section headings look like '= = Heading = ='.
    SECTION_PATTERN = re.compile(r"= = [^=]+ = =")
    segmented_texts = []
    corresponding_doc_ids = []
    section_names = []
    domain_types = []
    domain_not_found_count = 0
    for doc in tqdm.tqdm(data):
        text = ' '.join(doc[0])
        doc_id = doc[1][0][1]  # Wikidata id of the source document
        starts = [0]
        for m in re.finditer(SECTION_PATTERN, text):
            section_names.append(m.group(0))
            # Domains = tails of instance-of (P31) / subclass-of (P279).
            domains = []
            for p in se.iter_property(doc_id, prop_type='can'):
                if p[0] in {'P31', 'P279'}:
                    domains.extend(p[1])
            if len(domains) == 0:
                domain_not_found_count += 1
            domain_types.append(';'.join(domains))
            starts.append(m.start())
            # Each emitted segment is the text *preceding* the matched
            # heading; text after the last heading is not emitted.
            segmented_texts.append(text[starts[-2]:starts[-1]])
            corresponding_doc_ids.append(doc_id)
    print('domain not found:', domain_not_found_count)
    # Write one line per segment; all four files are parallel.
    # (os.path.splitext replaces the original rstrip('.pkl'), which strips a
    # character set rather than a suffix and can over-trim filenames.)
    out_prefix = os.path.splitext(pkl_path)[0]
    with open(out_prefix + '.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(segmented_texts))
    with open(out_prefix + '.txt.docid', 'w', encoding='utf-8') as f:
        f.write('\n'.join(corresponding_doc_ids))
    with open(out_prefix + '.txt.sec', 'w', encoding='utf-8') as f:
        f.write('\n'.join(section_names))
    with open(out_prefix + '.txt.dom', 'w', encoding='utf-8') as f:
        f.write('\n'.join(domain_types))

if __name__ == '__main__':
    process_splits('examples/language_model/wikitext103_seg/test.pkl')
    process_splits('examples/language_model/wikitext103_seg/valid.pkl')
    process_splits('examples/language_model/wikitext103_seg/train.pkl')