-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_features.py
245 lines (199 loc) · 7.09 KB
/
get_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""
Script to retrieve the lexicon features per sentence and run sentiment analysis.
Each corpus is analysed separately.
Procedure
- Getting data ready
- Loading dictionaries and retrieving values
- Sentiment Analysis (RoBERTa)
- Saving to json
"""
# %%
#
from utils import *
from functions import *
# %%
# set input path for data
input_path = 'data/emobank_data.json'
# The corpus title is the part of the file name before the first underscore.
# Take the last path component first, so the title does not depend on how many
# directories precede the file name (a fixed index into the split path would
# pick a directory for deeper paths and fail for a bare file name).
title = input_path.rsplit('/', 1)[-1].split('_')[0]
print('data treated:', title.upper())
# texts should contain sentences and SA annotated scores
# JSON is UTF-8 by specification; do not rely on the platform default encoding
with open(input_path, 'r', encoding='utf-8') as f:
    all_data = json.load(f)
df = pd.DataFrame.from_dict(all_data)
print('len data:', len(df))
df.head()
# %%
# PART 1: loading dicts, getting feature values
print('# PART 1: loading dicts, getting feature values')


def _load_lexicon(path):
    """Read one lexicon JSON file and return the parsed object."""
    # JSON is UTF-8 by specification; be explicit instead of depending on the
    # platform default encoding.
    with open(path, 'r', encoding='utf-8') as lexicon_file:
        return json.load(lexicon_file)


# Loading concreteness lexicon
# the json is structured so that the word is the key, the value the concreteness score
diconc = _load_lexicon("resources/concreteness_brysbaert.json")
print('loaded concreteness lexicon, len:', len(diconc))
# loading VAD
# same here, where values are the valence, arousal and dominance scores (in that order)
dico = _load_lexicon("resources/VAD_lexicon.json")
print('loaded VAD lexicon, len:', len(dico))
# reopen saved dict of sensorimotor values
sensori_dict = _load_lexicon('resources/sensorimotor_norms_dict.json')
print('loaded sensorimotor lexicon, len:', len(sensori_dict))
# and get the imageability dict from MRC psycholinguistics database
dict_mrc = _load_lexicon('resources/mrc_psychol_dict.json')
print('loaded imageability lexicon, len:', len(dict_mrc))
# %%
# TEST cell
# Quick sanity check: print the imageability entry of a few common words.
# A word missing from the lexicon raises KeyError here.
probe_words = ('dog', 'feeling', 'stomach', 'outside', 'tree', 'heart', 'stone')
print('dict values test')
for probe in probe_words:
    print(probe, dict_mrc[probe]['imag'])
# %%
# retrieving values
# Keys read from each sensorimotor lexicon entry, in the order they are stored.
SENSORIMOTOR_KEYS = (
    'Auditory.mean',
    'Gustatory.mean',
    'Haptic.mean',
    'Interoceptive.mean',
    'Olfactory.mean',
    'Visual.mean',
)


def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN entries of ``values``, or NaN if none.

    ``np.nanmean`` returns NaN as well for empty or all-NaN input, but emits a
    RuntimeWarning each time; sentences without any lexicon hit are expected
    here, so that case is answered directly and silently.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


# per-sentence results, one entry per row of df
concretenesses_avg, all_concretenesses = [], []
valences_avg, arousals_avg, dominances_avg = [], [], []
auditory_list = []
gustatory_list = []
haptic_list = []
interoceptive_list = []
olfactory_list = []
visual_list = []
imageability_avg = []
# maps each sensorimotor key to the result list it feeds
sensorimotor_results = dict(zip(
    SENSORIMOTOR_KEYS,
    (auditory_list, gustatory_list, haptic_list,
     interoceptive_list, olfactory_list, visual_list),
))

datasets_english = ['emobank', 'emotales', 'FB']
if title in datasets_english:
    use_col = 'SENTENCE'
else:
    print("Using col 'SENTENCE_ENGLISH' for calculating the dictionary values")
    use_col = 'SENTENCE_ENGLISH'

# loop through the sentences
# make sure we're using the english sentence (also for Danish texts)
for sent in df[use_col]:
    toks = nltk.wordpunct_tokenize(sent.lower())
    words = [lmtzr.lemmatize(tok) for tok in toks]

    # lists to store values for the current sentence; every word contributes
    # exactly one entry per feature (NaN when the word is not in the lexicon)
    valences, arousals, dominances, concreteness = [], [], [], []
    sensorimotor = {key: [] for key in SENSORIMOTOR_KEYS}
    imageabilities = []

    for word in words:
        # get the VAD values (stored as valence, arousal, dominance)
        if word in dico:
            vad = dico[word]
            valences.append(convert_to_float(vad[0]))
            arousals.append(convert_to_float(vad[1]))
            dominances.append(convert_to_float(vad[2]))
        else:
            valences.append(np.nan)
            arousals.append(np.nan)
            dominances.append(np.nan)

        # get concreteness
        # NOTE(review): a mean is taken over the lexicon value, presumably
        # because it may be a list of scores rather than a scalar - confirm.
        if word in diconc:
            concreteness.append(_mean_ignoring_nan(diconc[word]))
        else:
            concreteness.append(np.nan)

        # get the sensorimotor values
        norms = sensori_dict.get(word) if word in sensori_dict else None
        for key in SENSORIMOTOR_KEYS:
            sensorimotor[key].append(np.nan if norms is None else norms[key])

        # get imageability
        if word in dict_mrc:
            imageabilities.append(dict_mrc[word]['imag'])
        else:
            imageabilities.append(np.nan)

    # save everything and get the means per sentence
    valences_avg.append(_mean_ignoring_nan(valences))
    arousals_avg.append(_mean_ignoring_nan(arousals))
    dominances_avg.append(_mean_ignoring_nan(dominances))
    concretenesses_avg.append(_mean_ignoring_nan(concreteness))
    # the per-word concreteness values are kept as well
    all_concretenesses.append(concreteness)
    for key, result_list in sensorimotor_results.items():
        result_list.append(_mean_ignoring_nan(sensorimotor[key]))
    imageability_avg.append(_mean_ignoring_nan(imageabilities))
# %%
# Make columns in the df
# Column name -> per-sentence values; insertion order fixes the column order.
feature_columns = {
    'avg_concreteness': concretenesses_avg,
    'concreteness': all_concretenesses,
    'avg_valence': valences_avg,
    'avg_arousal': arousals_avg,
    'avg_dominance': dominances_avg,
    'Auditory.mean': auditory_list,
    'Gustatory.mean': gustatory_list,
    'Haptic.mean': haptic_list,
    'Interoceptive.mean': interoceptive_list,
    'Olfactory.mean': olfactory_list,
    'Visual.mean': visual_list,
    'avg_imageability': imageability_avg,
}
for column_name, column_values in feature_columns.items():
    df[column_name] = column_values
df.head()
# %%
# CHECKUP
# Work on a fresh copy with a clean 0..n-1 index, then show its size.
df = df.copy()
df.reset_index(drop=True, inplace=True)
print(len(df))
df.head()
# %%
# PART 2: sentiment analysis
print('# PART 2: sentiment analysis')
# Load the sentiment pipeline used to score the texts below
# (only the XLM-RoBERTa model is loaded in this part of the script).
xlm_model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
xlm_model = pipeline(model=xlm_model_name)
# %%
# SA scoring
# make sure we're using the english sentence (also for Danish texts)
if title in datasets_english:
    use_col = 'SENTENCE'
else:
    print('check that you use the right col for the mixed language dataset, set it manually')
    use_col = 'SENTENCE_ENGLISH'
    print(f'using col {use_col}')

# Ensure text is strings.
# Token lists are joined BEFORE the cast to str: casting first turns a list
# into its repr ("['a', 'b']"), after which a list check can never match.
df[use_col] = df[use_col].map(
    lambda s: " ".join(s) if isinstance(s, list) else s
).astype(str)

xlm_labels = []
xlm_scores = []
for s in df[use_col]:
    # the first entry of the pipeline output holds the label and its confidence,
    # which are needed to transform the result to a continuous score
    top_prediction = xlm_model(s)[0]
    xlm_labels.append(top_prediction.get("label"))
    xlm_scores.append(top_prediction.get("score"))

# function defined in functions to transform score to continuous
xlm_converted_scores = conv_scores(xlm_labels, xlm_scores, ["positive", "neutral", "negative"])
df["tr_xlm_roberta"] = xlm_converted_scores
# %%
# Check df for nan values
# First an overview of missing values per column ...
nan_counts = df.isna().sum()
print("NaN counts per column:")
print(nan_counts)
# ... then the rows where either sentiment column is missing.
sa_columns = ['HUMAN', 'tr_xlm_roberta']
missing_sa_mask = df[sa_columns].isna().any(axis=1)
nan_rows_annotators = df[missing_sa_mask]
print("Rows with NaN values in SA columns:")
print(nan_rows_annotators)
df.head()
# %%
# dump to json
# The results overwrite the input file. Serialise to a string first: opening
# with 'w' truncates the file immediately, so an error raised while encoding
# the data would otherwise leave input_path empty or half-written.
serialised = json.dumps(df.to_dict())
with open(input_path, 'w', encoding='utf-8') as f:
    f.write(serialised)
# %%
print(f'treated {title.upper()}: \n VAD, concreteness, sensorimotor and imageability calculated \n -- json updated!')
# %%