-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02_data_wrangling.r
60 lines (50 loc) · 1.95 KB
/
02_data_wrangling.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
library(tidytext)
library(tidyverse)
library(SnowballC)
library(feather)
# Load the stop word dictionary (used further down to filter review tokens)
data(stop_words)

# Main dataset ----
# Read the raw review table, re-encode the review text, keep only the columns
# used downstream under syntactic names, and keep a single row per game title.
text_raw <- read_csv("./data/df_cb_main.csv")
# NOTE(review): iconv() yields NA for any element that is not valid in the
# declared source encoding, so a review containing non-ASCII bytes becomes NA
# here -- confirm that this is the intended behaviour.
text_raw <- mutate(text_raw, Review = iconv(Review, "ASCII", "UTF-8"))
text_raw <- transmute(
  text_raw,
  GameTitle = `Game Title`,
  Review,
  GSScore = `GS Score`,
  ESRB,
  AuthorName = `Author Name`
)
# Sorting before distinct() means the row kept for each title is the first one
# in GSScore order (the lowest score when the column is numeric).
text_raw <- arrange(text_raw, GameTitle, GSScore)
text_raw <- distinct(text_raw, GameTitle, .keep_all = TRUE)
# Remove game title words from each review ----
# One row per (game, title word), produced by tokenizing each game's title.
title_tidy <- text_raw %>%
  mutate(TitleToken = GameTitle) %>%
  unnest_tokens(word, TitleToken) %>%
  select(GameTitle, word)

# Strip the given title words from a single review string.
#
# review: a length-one character vector (NA is passed through unchanged).
# words:  character vector of title tokens to remove; NA entries are ignored.
# Returns the review with each word removed, matched case-insensitively.
strip_title_words <- function(review, words) {
  for (word in words[!is.na(words)]) {
    # Escape regex metacharacters so the token is matched literally.
    literal <- gsub("([.|()\\^{}+$*?]|\\[|\\])", "\\\\\\1", word)
    # A title word that opens the review, together with the space after it.
    review <- gsub(paste0("^(", literal, ") "), "", review, ignore.case = TRUE)
    # Any later occurrence, together with the space before it. The trailing
    # \b keeps the match from eating the start of a longer word (without it,
    # the title word "the" would turn " there" into "re").
    review <- gsub(paste0(" ", literal, "\\b"), "", review, ignore.case = TRUE)
  }
  review
}

# Group the title words by the row of text_raw they belong to, once, instead
# of re-filtering the whole token table for every game. Rows whose title
# produced no tokens get an empty word vector and are left untouched.
title_rows <- match(title_tidy$GameTitle, text_raw$GameTitle)
title_words <- split(
  title_tidy$word,
  factor(title_rows, levels = seq_len(nrow(text_raw)))
)

# Address the review column by name rather than by position.
text_raw$Review <- map2_chr(text_raw$Review, title_words, strip_title_words)
# Split every review into one word per row, drop stop words, then stem
text_tidy <- unnest_tokens(text_raw, word, Review)
text_tidy <- anti_join(text_tidy, stop_words, by = "word")
text_tidy <- mutate(text_tidy, word = wordStem(word))

# Per-game word counts (highest counts first) with tf, idf and tf-idf columns
text_tfidf <- group_by(text_tidy, GameTitle)
text_tfidf <- count(text_tfidf, word, sort = TRUE)
text_tfidf <- bind_tf_idf(text_tfidf, word, GameTitle, n)

# Persist the cleaned reviews, the tokens and the tf-idf table as feather files
write_feather(text_raw, "./data/text_raw.feather")
write_feather(text_tidy, "./data/text_tidy.feather")
write_feather(text_tfidf, "./data/text_tfidf.feather")
# Genre dataset ----
# Read the genre table, keep the title and genre columns under syntactic
# names, and drop repeated title/genre pairs.
genre_tidy <- read_csv("./data/df_cb_genre.csv")
genre_tidy <- transmute(genre_tidy, GameTitle = `Game Title`, Genre)
genre_tidy <- distinct(genre_tidy, GameTitle, Genre, .keep_all = TRUE)

# Save the result as a feather file
write_feather(genre_tidy, "./data/genre_tidy.feather")