forked from jukujala/tulos_scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_results.py
79 lines (69 loc) · 1.74 KB
/
parse_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# parse results scraped using pull_results.py
# output is a dictionary in JSON or pickle
import json
import pickle
import sys
import random
import time
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
from urlparse import urljoin
# Markers delimiting the numeric payload of an HTML table cell:
# everything between the first '>' and the first '</td>' on the line.
num_header = ">"
num_end = "</td>"


def parse_num(line):
    """Return the text between the first '>' and the first '</td>' on *line*."""
    start = line.find(num_header) + len(num_header)
    stop = line.find(num_end)
    return line[start:stop]
# Opening-tag prefix that marks a name cell in the scraped result HTML,
# and the closing tag that terminates the cell's text.
header = '<td valign="top" class="leipateksti" align="left" bgcolor="#E6E6E6">'
end = '</td>'


def parse_page(page):
    # Parse one scraped result page (raw HTML as a Python 2 byte string)
    # into a dict mapping name (UTF-8 encoded bytes) -> the 5th numeric
    # cell of the row that follows the name cell.
    i = 0
    lines = page.split("\n")
    d = {}
    while i < len(lines):
        line = lines[i]
        if line.find(header) != -1:
            # The name is the text between the header tag and the next </td>.
            begpos = line.find(header) + len(header)
            endpos = line.find(end)
            name = line[begpos:endpos]
            # Source pages are ISO 8859-10 encoded; `unicode` is the
            # Python 2 builtin (this module is Python 2 only).
            name = unicode(name,encoding="iso8859_10",errors="replace")
            # Skip rows whose name starts with "M" or with markup —
            # presumably header/summary rows; TODO confirm against the HTML.
            if name[0] == "M" or name[0] == "<":
                i = i+1
                continue
            # The six lines following the name cell each hold one numeric cell.
            l = [parse_num(lines[i+j]) for j in range(1,7)]
            assert len(l) == 6
            #print name, l
            # Keep only the 5th number (index 4) for this name.
            d[name.encode("utf8")] = l[4]
            # Jump past the consumed numeric lines (plus one more below).
            i = i+6
        i = i+1
    # Every page is expected to carry exactly 8 result rows.
    assert len(d) == 8
    return d
def parse_pages(d):
dd = {}
for alue in d:
print "processing", alue
page = d[alue]
dd[alue] = parse_page(page)
return dd
def usage():
    # Print a short command-line usage message (Python 2 print statement).
    print "transforms scraped election results to more convenient dictionary"
    print "%s matched_pages.pickle parsed_results.output" %sys.argv[0]
def parse_cp():
    """Return (input_path, output_path) from argv; print usage and exit otherwise."""
    try:
        _prog, infn, outfn = sys.argv
    except ValueError:
        # Wrong number of arguments: argv did not unpack into exactly three.
        usage()
        sys.exit(-1)
    return (infn, outfn)
def main():
    # Load the tuple scraped by pull_results.py, parse every page, and
    # write the resulting {area: {name: value}} dict as JSON.
    (infn, outfn) = parse_cp()
    # Fix: use `with` so the input handle is closed (the original leaked
    # the file object returned by open()). NOTE(review): the pickle is
    # trusted output of pull_results.py — never unpickle untrusted data.
    with open(infn) as f:
        (d, dd) = pickle.load(f)
    pages = parse_pages(dd)
    # Fix: `with` closes the output even on error; the redundant local
    # `import json` is dropped — json is already imported at module top.
    with open(outfn, "w") as f:
        # To emit pickle instead of JSON: pickle.dump(pages, f)
        json.dump(pages, f)


if __name__ == "__main__":
    main()