-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdump.py
75 lines (56 loc) · 1.72 KB
/
dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import bz2
import cdb
import lxml.etree as etree
def get_pages_at(dump, offset):
    """Decompress and parse the bz2 stream starting at ``offset`` in ``dump``.

    The decompressed text is wrapped in a synthetic ``<mediawiki>`` element
    so that the content of a single stream parses as one XML document.

    Args:
        dump: Seekable file-like object positioned over the compressed dump.
        offset: Byte position in ``dump`` where a bz2 stream begins.

    Returns:
        The element returned by the tree builder, i.e. the synthetic
        ``mediawiki`` root holding whatever the stream contained.

    Raises:
        IOError: If ``dump`` runs out of data before the bz2 stream is
            complete (truncated file or an offset at/past the end).  The
            decompressor itself may also raise on data that is not valid
            bz2.
    """
    decompressor = bz2.BZ2Decompressor()
    dump.seek(offset)
    builder = etree.TreeBuilder()
    parser = etree.XMLParser(target=builder)
    # Byte literals: identical to plain strings on Python 2, and the same
    # type as the data produced by the decompressor.
    parser.feed(b'<mediawiki>')
    while True:
        chunk = dump.read(1024)
        try:
            # Once the end of the stream has been seen, the next call to
            # decompress() raises EOFError; that is the normal exit.
            data = decompressor.decompress(chunk)
        except EOFError:
            break
        if not chunk:
            # End of file without reaching the end of the bz2 stream.
            # Without this check the loop would spin forever, because
            # decompressing an empty string never raises.
            raise IOError(
                'bz2 stream at offset %d ended unexpectedly' % offset)
        if data:
            parser.feed(data)
    parser.feed(b'</mediawiki>')
    parser.close()
    return builder.close()
class Dump(object):
    """Look up individual pages in a bz2-compressed XML dump.

    Besides the dump itself, three cdb index files are opened next to it:
    ``<path>.ids``, ``<path>.titles`` and ``<path>.offsets``.  The title
    index maps a title to a page id and the offset index maps a page id to
    the byte offset of the bz2 stream that holds the page.  The id index is
    opened but not used by the lookups in this class.

    Instances can be used as context managers; leaving the ``with`` block
    closes the dump file.
    """

    def __init__(self, path):
        """Open the dump at ``path`` together with its cdb indexes."""
        # Binary mode: the bytes are handed straight to the bz2
        # decompressor and must not go through newline translation.
        self.dump = open(path, 'rb')
        try:
            self.id_db = cdb.init(path + '.ids')
            self.title_db = cdb.init(path + '.titles')
            self.offset_db = cdb.init(path + '.offsets')
        except Exception:
            # Do not leak the dump handle when an index cannot be opened.
            self.dump.close()
            raise

    def close(self):
        """Close the underlying dump file."""
        self.dump.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_by_id(self, id):
        """Return the ``page`` element whose ``id`` text equals ``id``.

        Args:
            id: Page id, in the same form as the keys of the offset index
                and the text of the ``<id>`` element.

        Raises:
            KeyError: If the id is not in the offset index, if the stream
                at the recorded offset cannot be read, or if that stream
                holds no page with this id.
        """
        offset = self.offset_db.get(id)
        if offset is None:
            raise KeyError(id)
        offset = int(offset)
        # NOTE: an earlier workaround retried with offset + 2 ** 32 because
        # the offset index appeared to suffer from integer overflow; the
        # index was reported as fixed, so the offset is used as stored.
        try:
            tree = get_pages_at(self.dump, offset)
        except IOError as e:
            print(e)
            raise KeyError(id)
        for page in tree.findall('page'):
            if page.findtext('id') == id:
                return page
        # The index pointed at a stream that does not contain the page;
        # this shouldn't happen with a consistent index.
        raise KeyError(id)

    def get_by_title(self, title):
        """Return the ``page`` element for the article named ``title``.

        Raises:
            KeyError: If the title is not in the title index, or if the
                lookup by the resulting id fails.
        """
        page_id = self.title_db.get(title)
        if page_id is None:
            # Article not found.
            raise KeyError(title)
        return self.get_by_id(page_id)