-
Notifications
You must be signed in to change notification settings - Fork 28
/
cleaner.py
36 lines (31 loc) · 932 Bytes
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import csv
import re
import unittest
def fillKamus(filename):
    """Load a two-column lookup table from a comma-separated text file.

    The first line of the file is treated as a header and discarded. For
    every following line, all double-quote characters are removed, the line
    is stripped of surrounding whitespace and split on commas; the first
    field becomes the key and the second field becomes the value.

    Lines that do not yield at least two fields (for example blank lines)
    are skipped instead of raising ``IndexError``.

    Args:
        filename: Path of the file to read.

    Returns:
        A dict mapping the first column to the second column. When a key
        occurs more than once, the last occurrence wins.
    """
    kamus = {}
    # The context manager guarantees the handle is closed even on error.
    with open(filename) as file_data:
        file_data.readline()  # discard the header row
        for line in file_data:
            fields = line.replace('"', '').strip().split(",")
            if len(fields) < 2:
                # Blank or malformed line: nothing to map.
                continue
            kamus[fields[0]] = fields[1]
    return kamus
def writeFile(file_in, column, file_out, kamus):
    """Translate one column of a CSV file and write the results to a file.

    Each row of ``file_in`` is parsed with ``csv.reader``; the cell at index
    ``column`` is passed through ``translate`` and written to ``file_out`` as
    one double-quoted line. Every row is processed, including the first one.

    Args:
        file_in: Path of the input file; must be in CSV format.
        column: Zero-based index of the column to translate.
        file_out: Path of the output file; it is overwritten if it exists.
        kamus: Mapping of token -> replacement handed to ``translate``.

    Raises:
        IndexError: If a row has no cell at index ``column``.
    """
    # Both handles live in one ``with`` so the output is flushed and closed
    # even if a row fails part-way through.
    with open(file_in) as fi, open(file_out, 'w') as out:
        for row in csv.reader(fi):
            out.write('"%s"\n' % translate(row[column], kamus))
def tokenize(line):
    """Split ``line`` into lower-cased word tokens.

    The text is lower-cased and split on every character that is not a word
    character (``\\w``); empty fragments produced by consecutive or leading /
    trailing separators are dropped.

    Args:
        line: The text to tokenize.

    Returns:
        A list of non-empty tokens in their original order. A real list is
        returned (rather than a ``filter`` object) so callers can index and
        modify it on Python 3 as well as Python 2.
    """
    # Raw string avoids the invalid-escape warning for ``\w``.
    return [token for token in re.split(r'[^\w]', line.lower()) if token != '']
def translate(line, kamus):
    """Replace every token of ``line`` that has an entry in ``kamus``.

    ``line`` is split with ``tokenize``; each token found as a key in
    ``kamus`` is replaced by its mapped value, other tokens are kept as they
    are. The tokens are then joined with single spaces.

    Args:
        line: The text to translate.
        kamus: Mapping of token -> replacement string.

    Returns:
        The translated tokens joined by a single space.
    """
    # Build the result in one pass instead of assigning into the token
    # sequence by index: that also works when ``tokenize`` yields a lazy
    # iterable (e.g. a Python 3 ``filter`` object), which is not indexable.
    return ' '.join(kamus.get(word, word) for word in tokenize(line))
if __name__ == '__main__':
    # Script entry point: load the lookup table, then translate column 2 of
    # the sample CSV into the output file.
    lookup = fillKamus('singkatan-lib.csv')
    writeFile(
        file_in='sample/capres2014-2.1.csv',
        column=2,
        file_out='sample/output.csv',
        kamus=lookup,
    )