-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_analysis.R
174 lines (149 loc) · 5 KB
/
sentiment_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Transform the text to a tidy data structure with one token per row
tokens <- trilogy %>%
group_by(episode) %>%
mutate(linenumber=row_number()) %>%
ungroup() %>%
unnest_tokens(word, dialogue)
# Positive and negative words -bing lexicon
tokens %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
acast(word ~ sentiment, value.var="n", fill=0) %>%
comparison.cloud(colors=c("#F8766D", "#00BFC4"),match.colors = TRUE, max.words=100)
#NRC Lexicon
nrc_all<-get_sentiments("nrc")
nrc_trust <-get_sentiments("nrc") %>%
filter(sentiment =="trust")
#Trust words for C-3PO
tokens %>%
filter(character=="THREEPIO") %>%
inner_join(nrc_trust) %>%
count(word,sort=TRUE)
#Trust words for C-3PO
nrc_fear<-get_sentiments("nrc") %>%
filter(sentiment =="fear")
#Fear words for Luke
tokens %>%
filter(character=="LUKE") %>%
inner_join(nrc_fear) %>%
count(word,sort=TRUE)
#Fear words for Luke
tokens %>%
filter(character=="VADER") %>%
inner_join(nrc_fear) %>%
count(word,sort=TRUE)
#Sections 10
trilogy_sentiments_10 <-tokens %>%
inner_join(get_sentiments("bing")) %>%
count(episode,index=linenumber %/% 10, sentiment) %>%
spread(sentiment,n,fill=0) %>%
mutate(sentiment = positive - negative)
ggplot(trilogy_sentiments_10, aes(index,sentiment,fill=episode)) +
geom_col(show.legend = FALSE) +
facet_wrap(~episode,ncol=1,scales="free_x")
#Afinn 20 -tokens
afinn_20<-tokens %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index=linenumber %/% 20) %>%
summarise(sentiment=sum(value)) %>%
mutate(method="AFINN")
#Graphs
bind_rows(afinn_20) %>%
ggplot(aes(index,sentiment,fill=method)) +
geom_col(show.legend=FALSE) +
facet_wrap(~method,ncol=2,scales="free_y")
#Word freq contribution
bing_word_counts <-tokens %>%
inner_join(get_sentiments("bing")) %>%
count(word,sentiment,sort=TRUE) %>%
ungroup()
bing_word_counts
#contribution to negative/positive sentiment
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word=reorder(word,n)) %>%
ggplot(aes(word,n,fill=sentiment)) +
geom_col(show.legend=FALSE) +
facet_wrap(~sentiment,scales="free_y") +
labs(y="Contribution to sentiment",
x=NULL) + coord_flip()
#Wordclouds
#Ordered
wordcloud_test<-tokens %>%
anti_join(stop_words) %>%
count(word)
wordcloud2(wordcloud_test, size=0.4)
# Sentiments and frequency associated with each word
sentiments <- tokens %>%
inner_join(nrc_all, "word") %>%
count(word, sentiment, sort=TRUE)
# Frequency of each sentiment
ggplot(data=sentiments, aes(x=reorder(sentiment, -n, sum), y=n)) +
geom_bar(stat="identity", aes(fill=sentiment), show.legend=FALSE) +
labs(x="Sentiment", y="Frequency") +
theme_bw()
# Top 10 terms for each sentiment
sentiments %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x=reorder(word, n), y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Frequency", x="Terms") +
coord_flip() +
theme_bw()
# Sentiment analysis for the Top 10 characters with more dialogues
tokens %>%
filter(character %in% c("LUKE","HAN","THREEPIO","LEIA","VADER",
"BEN","LANDO","YODA","EMPEROR","RED LEADER")) %>%
inner_join(nrc_all, "word") %>%
count(character, sentiment, sort=TRUE) %>%
ggplot(aes(x=sentiment, y=n)) +
geom_col(aes(fill=sentiment), show.legend=FALSE) +
facet_wrap(~character, scales="free_x") +
labs(x="Sentiment", y="Frequency") +
coord_flip() +
theme_bw()
# Tokens without stopwords
top_chars_tokens <- trilogy %>%
mutate(dialogue=as.character(trilogy$dialogue)) %>%
filter(character %in% c("LUKE","HAN","THREEPIO","LEIA","VADER",
"BEN","LANDO","YODA","EMPEROR","RED LEADER")) %>%
unnest_tokens(word, dialogue) %>%
anti_join(mystopwords, by="word")
# Most frequent words for each character - before tf_idf
top_chars_tokens %>%
count(character, word) %>%
group_by(character) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=n)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
labs(x="Sentiment", y="Frequency") +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip() +
theme_bw()
# Most relevant words for each character - tf_idf
top_chars_tokens %>%
count(character, word) %>%
bind_tf_idf(word, character, n) %>%
group_by(character) %>%
arrange(desc(tf_idf)) %>%
slice(1:10) %>%
ungroup() %>%
mutate(word2=factor(paste(word, character, sep="__"),
levels=rev(paste(word, character, sep="__"))))%>%
ggplot(aes(x=word2, y=tf_idf)) +
geom_col(aes(fill=character), show.legend=FALSE) +
facet_wrap(~character, scales="free_y") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_x_discrete(labels=function(x) gsub("__.+$", "", x)) +
coord_flip() +
theme_bw()