-
Notifications
You must be signed in to change notification settings - Fork 2
/
04_analysis_1.R
177 lines (145 loc) · 9.06 KB
/
04_analysis_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
library(feather)
library(stringr)
#install.packages("tm")
#install.packages("NLP") # "tm" is attached with library NLP
library(NLP)
library(tm) # text mining library
library(tidyverse)
# install.packages("tidytext")
library(tidytext)
# Description of this R script
######################################################################################
# Purpose: Classify terms into three categories, and examine the percentage of terms in
# each category in the newspaper, summarised by day and by month.
# Input files: all the tidy data in ./dataframe
# Output: 1) combined dataframe from 1986-1990 as a feather file --> ./data
# 2) plots: dscore.png --> ./graph
# mscore.png --> ./graph
######################################################################################
# If you find that these commands run too slowly on your computer,
# please try one of the following two approaches:
######################################################################################
# a) sparklyr
# Note: spark needs you to install the java JDK
# http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html
# b) or, instead of base data.frame, use the data.table package
# Dealing with big datasets with data.table
# https://www.analyticsvidhya.com/blog/2016/05/data-table-data-frame-work-large-data-sets/
######################################################################################
#install.packages("sparklyr")
#spark_install(version = "2.0.0")
#library(sparklyr)
#sc <- spark_connect(master = "local", version = "2.0.0")
# Read in the per-year tidy data from dataframe/.
# The "NULL" entry in colClasses makes read.csv skip the first column, and the
# NA entries let it infer the column type, see
# http://stackoverflow.com/questions/14527466/is-there-a-way-to-omit-the-first-column-when-reading-a-csv
read_tidy_table <- function(path) {
  read.csv(path, colClasses = c("NULL", NA, NA))
}
data1986 <- read_tidy_table("dataframe/tidydata_1986")
data1987 <- read_tidy_table("dataframe/tidydata_1987")
data1988 <- read_tidy_table("dataframe/tidydata_1988")
data1989 <- read_tidy_table("dataframe/tidydata_1989")
data1990 <- read_tidy_table("dataframe/tidydata_1990")
######################################################################################
# Arbitrarily categorize the terms into "strong ideology (SI)", "weak ideology (WI)"
# and "economic performance (EP)", and score each month's articles
# based on these three categories.
######################################################################################
# Stack the five yearly tables into a single data frame.
df <- rbind(data1986, data1987, data1988, data1989, data1990)
# Keep a parsed copy of the date column in Datetime ...
df <- mutate(df, Datetime = as.Date(date))
# ... then split the original date on "-" into Year / Month / Date and glue
# Year and Month back together as the year_month grouping key.
df <- separate(df, date, c("Year", "Month", "Date"), sep = "-")
df <- unite(df, year_month, Year, Month, sep = "-")
# Discard rows where term or count is missing.
df <- drop_na(df, term, count)
# Store the combined table as a feather file.
write_feather(df, "data/df")
# Term lists for the three categories.

# SI: "strong ideology" terms
SI <- c(
  "资产阶级自由化", "精神污染", "阶级斗争", "四项", "剥削阶级",
  "工人阶级", "无产阶级", "资本主义", "反社会主义",
  "帝国主义", "修正主义", "辩证唯物主义", "虚无主义",
  "共产主义", "共产主义者", "马克思主义", "马克思主义者",
  "革命英雄主义", "官僚资本主义", "唯物主义", "历史唯物主义",
  "革命军人", "革命队伍", "革命斗志", "革命精神", "革命事业",
  "革命化", "反革命"
)

# WI: "weak ideology" terms
WI <- c(
  "党的领导", "党的建设", "个人主义", "集体主义", "民主专政",
  "艰苦奋斗", "愚公精神",
  "安定团结", "团结奋斗", "团结一致", "紧密团结", "加强团结", "团结",
  "稳定", "稳定性", "社会安定",
  "党风建设", "法制", "民主法制", "法制建设", "法制化", "法制观念",
  "法制教育", "健全法制",
  "爱国主义", "爱国精神", "爱国热情",
  "民主监督", "民主集中制"
)

# EP: "economic performance" terms
EP <- c(
  "四化建设", "基本建设", "城乡建设", "改革开放", "对外开放", "改革建设",
  "科学技术", "生产力", "科技进步", "科技兴农",
  "体制改革", "深化改革", "推进改革", "繁荣昌盛", "繁荣富强",
  "经济繁荣", "经济效益", "经济增长", "迅猛发展", "蓬勃发展"
)
#Note: choosing the ideological terms to plot is tricky. We cannot include words that have a very high
# frequency throughout the whole period, such as "socialism" (社会主义); if this one were added to SI,
# SI would be the dominant category over all five years.
# In addition, we cannot use very generic words, such as "construction" (建设); if this word
# were put in EP, EP would be very dominant except in the period after the crackdown
# of the movement.
# Add one score column per category: a row contributes its count to a category
# when its term is on that category's list, and 0 otherwise.
df_score <- mutate(
  df,
  SI_score = ifelse(term %in% SI, count, 0),
  WI_score = ifelse(term %in% WI, count, 0),
  EP_score = ifelse(term %in% EP, count, 0)
)
# write this dataframe with scores in to a csv file and upload
# complete_df_and_scores <- df_score %>%
# select (Datetime, term, count, SI_score, WI_score, EP_score)
# write_csv(complete_df_and_scores, "dataframe/complete_dataset_with_scores")
##############################
# Monthly scores
##############################
df_monthly_score <- df_score %>%
  group_by(year_month) %>%
  # per month: total score of each category and total count of all terms
  summarise(
    SI = sum(SI_score),
    WI = sum(WI_score),
    EP = sum(EP_score),
    monthly_term = sum(count)
  ) %>%
  mutate(
    # each category's share of the month's total term count, in percent
    adj_SI = 100 * (SI / monthly_term),
    adj_WI = 100 * (WI / monthly_term),
    adj_EP = 100 * (EP / monthly_term),
    # turn the year_month key into a Date on the first of the month so the
    # plot can use a date axis
    year_month = as.Date(paste0(year_month, "-01"))
  ) %>%
  select(year_month, adj_SI, adj_WI, adj_EP) %>%
  # long format: one row per (month, category) with the value in `percent`
  gather(key, percent, -year_month)
##############################
# Plot1: Monthly_Score Plot
##############################
# Line chart of each category's monthly percentage, one coloured line per
# category, saved to graph/mscore.png.
monthly_plot <- ggplot(df_monthly_score, aes(x = year_month, y = percent, color = key)) +
  theme_bw() +
  # legend position, and rotate the x-axis labels
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_line() +
  labs(
    title = "SI, WI and EP's Occurrence in the People's Daily (Monthly Summary)",
    subtitle = "January 1986 to December 1990",
    x = "months from 1986 to 1990",
    y = "weighted scores (in percentage)",
    color = NULL # color = NULL removes the legend title
  ) +
  # x-axis: a labelled break every year, minor breaks every month
  scale_x_date(date_breaks = "1 year", date_minor_breaks = "1 month", date_labels = "%Y-%m") +
  # readable legend labels; `labels` is spelled out instead of relying on
  # partial matching of `label`
  scale_color_discrete(
    breaks = c("adj_SI", "adj_WI", "adj_EP"),
    labels = c("Strong Ideology", "Weak Ideology", "Economic Performance")
  ) +
  # dashed vertical reference lines: red at 1989-04-15 and 1989-06-04,
  # blue at 1986-12-03 and 1987-01-01
  geom_vline(xintercept = as.numeric(as.Date("1989-04-15")), color = "red", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1989-06-04")), color = "red", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1986-12-03")), color = "blue", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1987-01-01")), color = "blue", linetype = "longdash", alpha = 0.5)
# Evaluated at top level so the plot is still displayed where it was before.
monthly_plot
# ggsave() is called without width/height, so it takes its size from the
# current graphics device; the 600x400 png device is opened to set that size.
png("upload.png", width = 600, height = 400)
sizing_device <- dev.cur()
# Pass the plot explicitly rather than depending on last_plot().
ggsave("graph/mscore.png", plot = monthly_plot)
# Close the sizing device so it does not stay open (and remain the active
# device) for the rest of the script.
invisible(dev.off(sizing_device))
##############################
# Daily scores
##############################
# For each Datetime value, total the three category scores and the count of
# all terms.
df_daily_score <- summarise(
  group_by(df_score, Datetime),
  SI = sum(SI_score),
  WI = sum(WI_score),
  EP = sum(EP_score),
  daily_term = sum(count)
)
# Express each category total as a percentage of that day's total term count.
df_daily_score <- mutate(
  df_daily_score,
  adj_SI = 100 * (SI / daily_term),
  adj_WI = 100 * (WI / daily_term),
  adj_EP = 100 * (EP / daily_term)
)
# Keep the date and the three percentages, then reshape to long format:
# one row per (day, category) with the value in `percent`.
df_daily_score <- gather(
  select(df_daily_score, Datetime, adj_SI, adj_WI, adj_EP),
  key, percent, -Datetime
)
##############################
# Daily scores plot
##############################
# Bar chart of each category's daily percentage, bars filled by category,
# saved to graph/dscore.png.
daily_plot <- ggplot(df_daily_score, aes(x = Datetime, y = percent, fill = key)) +
  theme_bw() +
  # legend position, and rotate the x-axis labels
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_bar(stat = "identity") +
  labs(
    title = "SI, WI and EP's Occurrence in the People's Daily (Daily Summary)",
    subtitle = "January 1986 to December 1990",
    x = "months from 1986 to 1990",
    y = "weighted scores (in percentage)",
    # the legend here belongs to the `fill` aesthetic, so it is `fill` (not
    # `color`) that must be NULL to remove the legend title
    fill = NULL
  ) +
  # x-axis: a labelled break every year, minor breaks every month
  scale_x_date(date_breaks = "1 year", date_minor_breaks = "1 month", date_labels = "%Y-%m") +
  # readable legend labels; `labels` is spelled out instead of relying on
  # partial matching of `label`
  scale_fill_discrete(
    breaks = c("adj_SI", "adj_WI", "adj_EP"),
    labels = c("Strong Ideology", "Weak Ideology", "Economic Performance")
  ) +
  # dashed vertical reference lines: red at 1989-04-15 and 1989-06-04,
  # blue at 1986-12-03 and 1987-01-01
  geom_vline(xintercept = as.numeric(as.Date("1989-04-15")), color = "red", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1989-06-04")), color = "red", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1986-12-03")), color = "blue", linetype = "longdash", alpha = 0.5) +
  geom_vline(xintercept = as.numeric(as.Date("1987-01-01")), color = "blue", linetype = "longdash", alpha = 0.5)
# Evaluated at top level so the plot is still displayed where it was before.
daily_plot
# ggsave() is called without width/height, so it takes its size from the
# current graphics device; the 600x400 png device is opened to set that size.
png("upload.png", width = 600, height = 400)
sizing_device <- dev.cur()
# Pass the plot explicitly rather than depending on last_plot().
ggsave("graph/dscore.png", plot = daily_plot)
# Close the sizing device so it does not stay open (and remain the active
# device) for the rest of the script.
invisible(dev.off(sizing_device))
# Result: over the whole five years, the daily means are adj_SI = 0.2165333; adj_WI = 0.4380292; adj_EP = 0.251179