-
Notifications
You must be signed in to change notification settings - Fork 0
/
korpusreader.py
36 lines (28 loc) · 913 Bytes
/
korpusreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#-*- coding: utf-8 -*-
import re
def filter(inputfile, outputfile):
    """Extract the teaser text of each input line into a tab-separated file.

    Every line of *inputfile* is searched for a JSON fragment of the form
    ``{"teaser": ["..."]}``. For each line that contains one, a line is
    written to *outputfile* consisting of a running index (starting at 0),
    two tab characters, the teaser text passed through ``html_decode``,
    and a newline. Lines without a teaser fragment are skipped and do not
    consume an index.

    Note: this function shadows the builtin ``filter``; the name is kept
    so existing callers keep working.

    Args:
        inputfile: path of the file to read, one JSON item per line.
        outputfile: path of the file to write; it is overwritten.
    """
    # Raw string so the regex escapes reach `re` unchanged, and compiled
    # once here instead of being rebuilt for every input line.
    teaser_pattern = re.compile(r'\{"teaser": \["(.*)"\]\}')

    with open(inputfile, 'r') as inputs, open(outputfile, 'w') as output:
        count = 0
        for line in inputs:
            match = teaser_pattern.search(line)
            if match is None:
                # No teaser on this line (e.g. a blank line). Calling
                # .group() on None used to abort the whole run.
                continue
            output.write(
                str(count) + "\t" + "\t" + html_decode(match.group(1)) + "\n"
            )
            count += 1
def html_decode(s):
    """Replace a fixed set of JSON-style escape sequences in *s*.

    Despite the name, no HTML entities are handled. The function rewrites
    the backslash-u escapes of a handful of German and French letters to
    the letters themselves, turns an escaped double quote into a plain
    double quote, and removes escaped carriage-return/line-feed pairs.
    Anything else is left untouched.

    Args:
        s: the raw text containing escape sequences.

    Returns:
        The text with the known escape sequences replaced.
    """
    # Each pair is (replacement, escaped form as found in the raw text).
    # The backslash is doubled on purpose: a bare '\u00e4' inside a
    # Python 3 string literal is already the decoded character, which
    # would turn every replacement below into a no-op.
    htmlCodes = (
        ('ä', '\\u00e4'),
        ('Ä', '\\u00c4'),
        ('ö', '\\u00f6'),
        ('Ö', '\\u00d6'),
        ('ü', '\\u00fc'),
        ('Ü', '\\u00dc'),
        ('ß', '\\u00df'),
        ('"', '\\"'),
        ('é', '\\u00e9'),
        # U+00E0 is a-grave and U+00E1 is a-acute; the two were conflated.
        ('à', '\\u00e0'),
        ('á', '\\u00e1'),
        # Escaped CR/LF pairs are dropped entirely.
        ('', '\\r\\n'),
    )
    for replacement, escaped in htmlCodes:
        s = s.replace(escaped, replacement)
    return s
if __name__ == "__main__":
    # Script entry point: read the items from item.json and write the
    # extracted teaser corpus to korpus.txt in the working directory.
    filter(inputfile="item.json", outputfile="korpus.txt")