-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathterm_frequency_word_Cloud.R
62 lines (47 loc) · 2.39 KB
/
term_frequency_word_Cloud.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Tidying ----
# Split every tweet into one lower-cased word per row, drop stop words and
# noise tokens, then count how often each remaining word occurs.
# Assumes `immigrant_immigration_tweets` (a data frame with a character
# column `text`) has already been created elsewhere -- it is not built here.
library(dplyr)
library(stringr)
library(tidytext)

# Token separator: any character that cannot be part of a word, hashtag or
# mention, plus an apostrophe that is not followed by such a character
# (so "don't" stays whole but a trailing quote is split off).
reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"

# Text deleted before tokenising: links, HTML-escaped and bare `&`, `<`, `>`,
# and @-mentions. `@\\w*` also covers digits and underscores in handles, which
# the previous `@[a-z,A-Z]*` left behind as fragments.
tweet_strip_pattern <- paste(
  "https://t.co/[A-Za-z\\d]+",
  "http://[A-Za-z\\d]+",
  "&amp;|&lt;|&gt;",
  "&|<|>",
  "@\\w*",
  sep = "|"
)

# Tokens that would otherwise dominate the counts without saying anything
# about the content of the tweets. unnest_tokens() lower-cases by default,
# so these are matched in lower case.
tweet_noise_words <- c("immigrant", "rt", "https")

immigrant_tweets_words <- immigrant_immigration_tweets %>%
  mutate(text = str_replace_all(text, tweet_strip_pattern, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(
    !word %in% stop_words$word,
    !word %in% tweet_noise_words,
    str_detect(word, "[a-z]")  # keep only tokens containing a letter
  )

# count(sort = TRUE) already orders by descending n, so no extra arrange().
tweet_words_count <- immigrant_tweets_words %>%
  count(word, sort = TRUE)
tweet_words_count
# Bar chart of the 20 most frequent words in `tweet_words_count`.
library(ggplot2)

# Every `+` has to END a line: a line that starts with `+` is parsed as a new
# statement (unary plus), so the layer would never be added to the plot.
# head() is used instead of `[1:20, ]` so that fewer than 20 distinct words
# does not produce all-NA rows.
ggplot(head(tweet_words_count, 20), aes(x = reorder(word, -n), y = n)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  coord_flip() +
  # xlab()/ylab() label the x and y aesthetics (word and n); coord_flip()
  # only changes which side of the chart they are drawn on.
  xlab("Word") +
  ylab("Number of Occurrences") +
  ggtitle("Words Used in Tweets about Immigration")
# If "immigrant", "rt" or "https" show up among the top bars, treat them as
# noise (presumably the search term, the retweet marker and leftover link
# fragments) rather than as findings.
# Alternative cleaning route using the tm package ----
# Builds a corpus from the raw tweet text, normalises it and turns it into a
# term-document matrix (`immigrant_tdm`, one row per term).
# Assumes `immigrant_immigration_tweets$text` is a character vector created
# elsewhere.
library(stringr)
library(tm)

# Strip links and @-mentions while the text is still intact; once punctuation
# is removed a URL collapses into a single junk token.
immigrant_tweets_text <- str_replace_all(
  immigrant_immigration_tweets$text,
  "https?://\\S+|@\\w+",
  ""
)

immigrant_wordCorpus <- Corpus(VectorSource(immigrant_tweets_text))
# Order matters: lower-case first so stop words match, and remove stop words
# BEFORE punctuation so entries containing an apostrophe ("don't", "i'm")
# still match the text.
immigrant_wordCorpus <- tm_map(immigrant_wordCorpus, content_transformer(tolower))
immigrant_wordCorpus <- tm_map(immigrant_wordCorpus, removeWords, stopwords("english"))
immigrant_wordCorpus <- tm_map(immigrant_wordCorpus, removePunctuation)
# Project-specific noise tokens, removed after punctuation so that e.g. "amp;"
# has already become "amp".
immigrant_wordCorpus <- tm_map(immigrant_wordCorpus, removeWords, c("amp", "2yo", "3yo", "4yo"))
immigrant_wordCorpus <- tm_map(immigrant_wordCorpus, stripWhitespace)

# Construct the term-document matrix; wordLengths = c(1, Inf) keeps terms of
# every length instead of tm's default minimum.
immigrant_tdm <- TermDocumentMatrix(
  immigrant_wordCorpus,
  control = list(wordLengths = c(1, Inf))
)
immigrant_tdm
# Find frequent terms ----
# Collects every term of `immigrant_tdm` that occurs at least `min_term_freq`
# times across all tweets and plots the counts as a horizontal bar chart.
library(ggplot2)

min_term_freq <- 20

# Was findFreqTerms(tdm, ...): no object called `tdm` is created in this
# script; the matrix built above is `immigrant_tdm`.
freq.terms <- findFreqTerms(immigrant_tdm, lowfreq = min_term_freq)

# Total occurrences per term (rows are terms), restricted to frequent ones.
term.freq <- rowSums(as.matrix(immigrant_tdm))
term.freq <- term.freq[term.freq >= min_term_freq]
immigrant_df <- data.frame(term = names(term.freq), freq = term.freq)

# Graph frequent terms; coord_flip() puts the terms on the vertical axis so
# the labels stay readable, and the small font lets many terms fit.
ggplot(immigrant_df, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))
# Word cloud ----
# Draws a word cloud of all terms in `immigrant_tdm` that occur at least
# three times, sized by frequency.
# Packages are attached first: brewer.pal() comes from RColorBrewer and was
# previously called before any library() that provides it.
library(RColorBrewer)
library(wordcloud)

# Was as.matrix(tdm): no object called `tdm` is created in this script; the
# term-document matrix built above is `immigrant_tdm`.
m <- as.matrix(immigrant_tdm)

# Total occurrences per term, most frequent first.
word.freq <- sort(rowSums(m), decreasing = TRUE)

# Colours: take the 9-step "BuGn" palette and drop its first four shades.
pal <- brewer.pal(9, "BuGn")[-(1:4)]

# random.order = FALSE places the most frequent words first.
wordcloud(
  words = names(word.freq),
  freq = word.freq,
  min.freq = 3,
  random.order = FALSE,
  colors = pal
)