-
Notifications
You must be signed in to change notification settings - Fork 0
/
n_grams.R
61 lines (50 loc) · 1.48 KB
/
n_grams.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Bigram summary for the trilogy data, produced by the custom
# bigrams_summary() helper (defined outside this section of the script).
bigrams_summary(trilogy)
# Trigram summary for the trilogy data, produced by the custom
# trigrams_summary() helper (defined outside this section of the script).
trigrams_summary(trilogy)
# Sections ----
# Assign every row of `trilogy` to a section by integer-dividing its row
# number by 10, drop section 0 (the rows numbered below 10), then tokenise the
# `dialogue` column into one word per row and discard the stop words listed in
# `en_stopwords`.
trilogy_sections_words <- trilogy %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, dialogue) %>%
  filter(!(word %in% en_stopwords))
# Count how often each pair of words occurs within the same `section` of
# `trilogy_sections_words`, most frequent pairs first.
word_pairs <- trilogy_sections_words %>%
  pairwise_count(word, section, sort = TRUE)

# Words often found within the same section as "yoda".
# Recorded result: jedi, time, master, force, training
word_pairs %>%
  filter(item1 == "yoda")

# Words often found within the same section as "vader".
# Recorded result: lord, luke, ship
# NOTE(review): this query used to filter on "dark" although the recorded
# result was labelled "vader"; the filter now matches the label. If "dark" was
# the intended word, restore it and re-record the result above.
word_pairs %>%
  filter(item1 == "vader")
# Phi coefficient ----
# Pairwise correlation between words based on the sections they appear in.
# Only words with at least 20 rows in `trilogy_sections_words` are kept, and
# the result is sorted by `sort = TRUE`.
word_cors <- trilogy_sections_words %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

# Print the full correlation table.
word_cors

# Correlations for the word "force" only.
word_cors %>%
  filter(item1 == "force")
# View correlations ----
# For each focus word keep its 6 most correlated partner words (ties are
# kept by top_n) and draw one horizontal bar chart per focus word.
word_cors %>%
  filter(item1 %in% c("dark", "princess", "jedi", "master")) %>%
  group_by(item1) %>%
  # Rank by `correlation` explicitly rather than relying on top_n()'s
  # "last column" default, which also prints a "Selecting by ..." message.
  top_n(6, correlation) %>%
  ungroup() %>%
  # Order the bars by correlation instead of alphabetically.
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_col() +
  facet_wrap(~item1, scales = "free") +
  coord_flip()
# Nodes ----
# Network graph of word pairs whose correlation exceeds 0.15.
# The seed is fixed so the "fr" layout comes out the same on every run.
set.seed(123)

# NOTE: a former `par(bg = 'blue')` call was removed here. par() only affects
# base graphics, so it never changed this ggplot2/ggraph plot, and it left the
# global graphics state modified. To colour the background, add
# `theme(plot.background = element_rect(fill = "blue"))` after theme_void().
word_cors %>%
  filter(correlation > .15) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  # Edge transparency follows the strength of the correlation.
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "orange", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()