-
Notifications
You must be signed in to change notification settings - Fork 1
/
liwc.py
80 lines (73 loc) · 2.68 KB
/
liwc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from __future__ import unicode_literals
import numpy as np
import codecs
import re
class LIWC:
    """Count LIWC category hits in texts using a ``.dic`` lexicon file.

    The lexicon is read as latin1 text. Its first line is skipped, and the
    lines up to the next ``%`` line form the header: the first tab-separated
    field of each header line is a category code. Every later line is an
    entry whose first field is a word (or a prefix when it ends with ``*``)
    and whose remaining fields are category codes.

    Attributes:
        words: Entry targets that are plain words.
        prefixes: Entry targets that ended with ``*``, stored without it.
        dim_map: Category code -> column index in the feature matrix.
        dic: Word or prefix stem -> list of column indices.
        path: Lexicon file that was loaded.
        start_line: 0-based index of the first entry line in ``path``.
    """

    def __init__(self, lang='pt', path=None):
        """Load the lexicon and build all lookup tables.

        Args:
            lang: ``'pt'`` selects the bundled Portuguese lexicon; any other
                value selects ``lexicons/LIWC2007.dic``.
            path: Optional explicit lexicon file. When given it overrides
                the file chosen by ``lang``.
        """
        self.words = []
        self.prefixes = []
        self.dim_map = {}
        self.dic = {}
        if path is not None:
            self.path = path
        elif lang == 'pt':
            self.path = 'lexicons/LIWC2007_Portugues_win.dic'
        else:
            self.path = 'lexicons/LIWC2007.dic'
        # Fallback offset; replaced by the real one as soon as the header's
        # closing '%' line is found in create_LIWC_dim_maps().
        self.start_line = 66
        self.create_LIWC_dim_maps()
        self.compute_prefixes_and_words()
        self.create_dictionary()

    def create_LIWC_dim_maps(self):
        """Parse the header into ``dim_map`` and locate the first entry line."""
        self.dim_map = {}
        with codecs.open(self.path, encoding='latin1') as f:
            for i, line in enumerate(f):
                if i == 0:
                    # The first line is never a category line.
                    continue
                line = line.strip()
                if line == '%':
                    # Entries begin right after the closing marker, so the
                    # offset no longer depends on the header's length.
                    self.start_line = i + 1
                    break
                code = line.split('\t')[0]
                # setdefault keeps indices dense (always < len(dim_map))
                # even if a code is repeated in the header.
                self.dim_map.setdefault(code, len(self.dim_map))

    def _iter_entries(self):
        """Yield the tab-separated fields of each non-blank entry line."""
        with codecs.open(self.path, encoding='latin1') as f:
            for i, line in enumerate(f):
                if i < self.start_line:
                    continue
                line = line.strip()
                if line:
                    yield line.split('\t')

    def compute_prefixes_and_words(self):
        """Rebuild ``words`` and ``prefixes`` from the entry lines."""
        # Build fresh lists so that calling this twice does not duplicate
        # prefixes (which would double-count matches in build_features).
        words = []
        prefixes = []
        for fields in self._iter_entries():
            target = fields[0]
            if target.endswith('*'):
                prefixes.append(target[:-1])
            else:
                words.append(target)
        self.words = words
        self.prefixes = prefixes

    def create_dictionary(self):
        """Rebuild ``dic``: word or prefix stem -> list of column indices.

        Category codes that are missing from ``dim_map`` are reported on
        stdout and skipped; the entry keeps its remaining categories.
        """
        dic = {}
        for fields in self._iter_entries():
            word, codes = fields[0], fields[1:]
            if word.endswith('*'):
                word = word[:-1]
            actual_dims = []
            for code in codes:
                try:
                    actual_dims.append(self.dim_map[code])
                except KeyError:
                    # Best effort: show the offending entry and carry on.
                    print(fields)
                    print(word)
                    print(codes)
            dic[word] = actual_dims
        self.dic = dic

    def build_features(self, texts):
        """Return a matrix of per-text category counts.

        Each text is tokenized with ``\\w+``. A token that is a key of
        ``dic`` adds one to each of its categories. Any other token adds
        one to the categories of every prefix it starts with.

        Args:
            texts: Sequence of strings.

        Returns:
            Float ``numpy`` array of shape ``(len(texts), len(dim_map))``.
        """
        liwc_ret = np.zeros((len(texts), len(self.dim_map)))
        # str.startswith needs a tuple; build it once, not once per token.
        prefixes = tuple(self.prefixes)
        for idx, review in enumerate(texts):
            row = liwc_ret[idx]  # view: in-place updates reach liwc_ret
            for word in re.findall(r'\w+', review):
                if word in self.dic:
                    for d in self.dic[word]:
                        row[d] += 1
                elif word.startswith(prefixes):
                    # Cheap pre-check passed; credit every matching prefix.
                    for item in prefixes:
                        if word.startswith(item):
                            for d in self.dic[item]:
                                row[d] += 1
        return liwc_ret