-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_setup_rf_20170912.r
213 lines (170 loc) · 10.7 KB
/
data_setup_rf_20170912.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
library(tidyverse)
#consumption
consumption <- read.table("~/github/springboard_capstone_1/alcohol_consumption/state_consumption_data_2.csv", sep = ',', header = TRUE) %>%
mutate(GEO_ID = as.numeric(as.character(GEO_ID)))
data_source <- read.table("~/github/springboard_capstone_1/alcohol_consumption/data_source.csv", sep = ',', header = TRUE)
state_codes <- read.table("~/github/springboard_capstone_1/alcohol_consumption/state_codes.csv", sep = ',', header = TRUE)%>%
mutate(GEO_ID = as.numeric(as.character(GEO_ID)))
bev_type <- read.table("~/github/springboard_capstone_1/alcohol_consumption/bev_type.csv", sep = ',', header = TRUE)
combo <- consumption %>%
left_join(data_source, by = "DATA_SOURCE_CODE") %>%
left_join(state_codes, by = "GEO_ID") %>%
left_join(bev_type, by = "BEVERAGE_TYPE_CODE") %>%
filter(BEVERAGE_TYPE == "All_beverages",
GEO_ID %in% 1:56,
YEAR >= 2010) %>%
select(one_of(c("YEAR", "GALLONS_ALCOHOL", "GALLONS_ALCOHOL_PER_CAP_21", "DECILE_21", "STATE", "GEO_ID"))) %>%
mutate(STATE = substr(as.character(STATE), 2, nchar(as.character(STATE))))
#png(file = "e:/personal/traffic_project/consumption_1_5_10.png", units = "in", width = 14, height = 7, res = 500)
ggplot(filter(combo, DECILE_21 %in% c(1,5,10))) + geom_point(aes(x = YEAR, y = GALLONS_ALCOHOL_PER_CAP_21, color = STATE),size = 5, alpha = .7) +
facet_grid(~DECILE_21)+
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
#dev.off()
#png(file = "e:/personal/traffic_project/consumption_1_10.png", units = "in", width = 14, height = 7, res = 500)
ggplot(filter(combo, DECILE_21 %in% c(1,10))) + geom_point(aes(x = YEAR, y = GALLONS_ALCOHOL_PER_CAP_21, color = STATE),size = 5, alpha = .7) +
facet_grid(~DECILE_21)+
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
#dev.off()
#taxes
beer <- read.table("~/github/springboard_capstone_1/alcohol_consumption/data/TX1B - ChangesOverTime.csv", sep = ",", header = TRUE)
colnames(beer)[c(1,4)] <- c("STATE_CODE", "BEER_TAX")
beer <- beer %>%
select(one_of(c("STATE_CODE" ,"BEER_TAX"))) %>%
mutate(BEER_TAX = as.numeric(substr(BEER_TAX, 2, nchar(as.character(BEER_TAX))))) %>%
group_by(STATE_CODE) %>%
summarize(BEER_TAX = max(BEER_TAX))
wine <- read.table("~/github/springboard_capstone_1/alcohol_consumption/data/TX2W - ChangesOverTime.csv", sep = ",", header = TRUE)
colnames(wine)[c(1,4)] <- c("STATE_CODE", "WINE_TAX")
wine <- wine %>%
select(one_of(c("STATE_CODE" ,"WINE_TAX"))) %>%
mutate(WINE_TAX = as.numeric(substr(WINE_TAX, 2, nchar(as.character(WINE_TAX)))))%>%
group_by(STATE_CODE) %>%
summarize(WINE_TAX = max(WINE_TAX))
spirits <- read.table("~/github/springboard_capstone_1/alcohol_consumption/data/TX3S - ChangesOverTime.csv", sep = ",", header = TRUE)
colnames(spirits)[c(1,4)] <- c("STATE_CODE", "SPIRITS_TAX")
spirits <- spirits %>%
select(one_of(c("STATE_CODE" ,"SPIRITS_TAX"))) %>%
mutate(SPIRITS_TAX = as.numeric(substr(SPIRITS_TAX, 2, nchar(as.character(SPIRITS_TAX)))))%>%
group_by(STATE_CODE) %>%
summarize(SPIRITS_TAX = max(SPIRITS_TAX))
all_tax <- beer %>%
left_join(wine, by = "STATE_CODE") %>%
left_join(spirits, by = "STATE_CODE") %>%
mutate(STATE_CODE = as.character(STATE_CODE))
state_codes <- read.table("~/github/springboard_capstone_1/alcohol_consumption/data/state_codes.csv", sep = ",", header = TRUE) %>%
select(-STATENS) %>%
filter(STATE <= 56) %>%
mutate(STATE_LONG = as.character(STATE_LONG))
colnames(state_codes) <- c("STATE_NUM", "STATE_CODE", "STATE")
combo_tax <- combo %>%
left_join(state_codes, by = "STATE") %>%
left_join(all_tax, by = "STATE_CODE")
tax_long <- combo_tax %>%
gather("TYPE", "TAX", BEER_TAX:SPIRITS_TAX)
#png(file = "e:/personal/traffic_project/taxes_consumption.png", units = "in", width = 14, height = 7, res = 500)
ggplot(tax_long) + geom_point(aes(x = TAX, y = GALLONS_ALCOHOL_PER_CAP_21), size = 5, alpha = 0.7) + facet_grid(~TYPE, scales="free_x") +
geom_smooth(aes(x = TAX, y = GALLONS_ALCOHOL_PER_CAP_21)) +
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
#dev.off()
ggplot(combo_tax) + geom_point(aes(x = BEER_TAX, y = GALLONS_ALCOHOL_PER_CAP_21, color = YEAR))
#read in FARS data
acc_dui <- read.csv("~/github/springboard_capstone_1/alcohol_consumption/data/alcohol_dui.csv", sep = ",", header = TRUE)
#combine with law data
colnames(tax_long)[colnames(tax_long) == "STATE"] <- "STATE_LONG"
colnames(tax_long)[colnames(tax_long) == "STATE_NUM"] <- "STATE"
colnames(tax_long)[colnames(tax_long) == "STATE_CODE"] <- "STATE_SHORT"
all_dui <- acc_dui %>%
left_join(tax_long, by = c("YEAR", "STATE", "STATE_LONG", "STATE_SHORT"))
ggplot(data = all_dui) + geom_point(aes(x = GALLONS_ALCOHOL_PER_CAP_21, y = ACH_MIL), size = 3, alpha = 0.3)+
geom_smooth(aes( x = GALLONS_ALCOHOL_PER_CAP_21, y = ACH_MIL)) +
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
ggplot(data = all_dui,aes(x = POP_MIL, y = ACH_MIL)) + geom_point( size = 3, alpha = 0.3)+
geom_smooth() +
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
ggplot(data = dui_tax_short,aes(x = BEER_TAX, y = ACH_MIL)) + geom_point( size = 3, alpha = 0.3)+
geom_smooth() +
theme(axis.title.x = element_text(size=20), axis.text.x = element_text(size=20, angle = 45, hjust = 1),
axis.title.y = element_text( size=20), axis.text.y = element_text(size=20),
panel.grid.major = element_line(colour = "black"), strip.text.x = element_text(size = 20),
legend.title = element_text(size=25, face="bold"), legend.text = element_text(size=18))
#new predictor data
#household income and poverty rate
income <- read.table("~/github/springboard_capstone_1/census_data/household_income.csv", sep = ",", header = TRUE)
income <- income[,c(1,4,10,41)]
colnames(income) <- c("YEAR","STATE_LONG", "PERCENT_POVERTY", "MEDIAN_INCOME")
income <- income %>%
mutate(MEDIAN_INCOME = as.numeric(paste0(substr(as.character(MEDIAN_INCOME), 2, 3), substr(as.character(MEDIAN_INCOME), 5,7))),
PERCENT_POVERTY = as.numeric(PERCENT_POVERTY))
#insured
insure <- read.table("~/github/springboard_capstone_1/census_data/uninsured.csv", sep = ",", header = TRUE)
insure <- insure[,c(5,7,12)]
colnames(insure) <- c("YEAR","STATE_LONG", "PERCENT_UNINSURE")
#education
ed11 <- read.table("~/github/springboard_capstone_1/census_data/edu_2011.csv", sep = ",", header = TRUE)
ed12 <- read.table("~/github/springboard_capstone_1/census_data/edu_2012.csv", sep = ",", header = TRUE)
ed13 <- read.table("~/github/springboard_capstone_1/census_data/edu_2013.csv", sep = ",", header = TRUE)
ed14 <- read.table("~/github/springboard_capstone_1/census_data/edu_2014.csv", sep = ",", header = TRUE)
ed15 <- read.table("~/github/springboard_capstone_1/census_data/edu_2015.csv", sep = ",", header = TRUE)
ed_all <- ed11 %>%
bind_rows(ed12) %>%
bind_rows(ed13) %>%
bind_rows(ed14) %>%
bind_rows(ed15) %>%
mutate(LESS_HS_P = LESS_HIGH_SCHOOL / TOTAL,
HS_P = HIGH_SCHOOL / TOTAL,
SOME_C_P = SOME_COLLEGE / TOTAL,
BACHELORS_P = BACHELORS / TOTAL,
GRAD_PROF_P = GRADUATE_PROFESSIONAL / TOTAL,
STATE_LONG = as.character(STATE_LONG))
all_dui_2 <- all_dui %>%
left_join(income, by = c("YEAR", "STATE_LONG")) %>%
left_join(insure, by = c("YEAR", "STATE_LONG")) %>%
left_join(ed_all, by = c("YEAR", "STATE_LONG"))
unique(ed_all$STATE_LONG[!(ed_all$STATE_LONG %in% all_dui$STATE_LONG)])
##########################################
#build model
library(randomForest)
dui_tax_short <- spread(all_dui_2, TYPE, TAX) %>%
select(one_of(c("ACC_MIL", "ACH_MIL", "YEAR", "STATE_SHORT", "POP_MIL", "SUSPEND", "INTERLOCK_1ST", "INTERLOCK_REPEAT",
"GALLONS_ALCOHOL_PER_CAP_21", "BEER_TAX","PERCENT_POVERTY", "MEDIAN_INCOME", "PERCENT_UNINSURE", "LESS_HS_P",
"HS_P", "SOME_C_P", "BACHELORS_P", "GRAD_PROF_P"))) %>%
mutate(STATE_SHORT = as.factor(STATE_SHORT)) %>%
filter(STATE_SHORT != "DC",
YEAR < 2015)
dui_tax_short <- na.omit(dui_tax_short)
dui_rf <- randomForest(ACH_MIL~YEAR + STATE_SHORT + ACC_MIL +POP_MIL + SUSPEND + INTERLOCK_1ST + INTERLOCK_REPEAT +
GALLONS_ALCOHOL_PER_CAP_21 + BEER_TAX + PERCENT_POVERTY + MEDIAN_INCOME + PERCENT_UNINSURE + LESS_HS_P +
HS_P + SOME_C_P + BACHELORS_P + GRAD_PROF_P,
dui_tax_short[!is.na(dui_tax_short$BEER_TAX),],
importance=TRUE)
dui_rf
importance(dui_rf)
dui_rf_2 <- randomForest(ACH_MIL~STATE_SHORT + ACC_MIL +POP_MIL + INTERLOCK_REPEAT +
GALLONS_ALCOHOL_PER_CAP_21 + BEER_TAX,
dui_tax_short[!is.na(dui_tax_short$BEER_TAX),],
importance=TRUE)
dui_rf_2
importance(dui_rf_2)
dui_rf_3 <- randomForest(ACH_MIL~YEAR + POP_MIL + SUSPEND + INTERLOCK_REPEAT +
GALLONS_ALCOHOL_PER_CAP_21 + BEER_TAX,
dui_tax_short[!is.na(dui_tax_short$BEER_TAX),],
importance=TRUE, ntree = 100000)
dui_rf_3
importance(dui_rf_3)
plot(dui_rf_3)
varImpPlot(dui_rf_3)