# titanic gbm.R
#GBM: Generalized Boosted Models
#an ensemble of classification or regression trees
#the gbm package supports several boosting variants, including
#AdaBoost and gradient boosting
#advantages of GBM over logistic regression (a parametric method):
#robust to outliers
#can still make predictions when an observation has missing data!
#handles unequal class sizes and unbalanced predictor variables well
#(logistic regression isn't as good at this)
#you don't need to specify interaction terms with tree models!
#usually gives greater predictive ability
#potential drawbacks:
#trees can overfit, especially if the minimum number of observations
#in the terminal nodes is too small or the number of trees is too large
#definitely use cross-validation (or a holdout set); the in-sample
#prediction rate is not a valid measure of goodness of fit!
#install.packages("gbm")
require(gbm)
# install.packages("dplyr")
#this package will change your life when it comes to data management
require(dplyr)
############# Load and transform data #################
train = read.csv("https://raw.githubusercontent.com/campbwa/R-videos/master/train.csv")
test = read.csv("https://raw.githubusercontent.com/campbwa/R-videos/master/test.csv")
head(train)
summary(train)
#note the missing values of Age!
#gbm can be fit and can make predictions on observations with missing values
#in the feature space (the independent variables)
#I would still recommend imputing missing values (a sketch follows below)
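#a minimal imputation sketch (my addition, not part of the original workflow):
#fill missing Age values with the median age from the training data;
#left commented out because the rest of the script works with the NAs as-is
# age_median = median(train$Age, na.rm = TRUE)
# train$Age[is.na(train$Age)] = age_median
# test$Age[is.na(test$Age)] = age_median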
####### Basic data manipulation ########
survived = train$Survived
train = select(train, -Survived)
end_trn = nrow(train)
#combine the two into one data set
all = rbind(train,test)
#Why? So that if we manipulate variables (create new ones, cap and floor them),
#we apply the same operation to the training and testing data
#(a small cap-and-floor sketch follows below)
end = nrow(all)
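#a minimal cap-and-floor sketch (my addition): Fare has a long right tail, so
#you might cap it at, say, its 99th percentile; the variable and the cutoff
#are illustrative assumptions, so this is left commented out
# fare_cap = quantile(all$Fare, 0.99, na.rm = TRUE)
# all$Fare = pmin(all$Fare, fare_cap)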
#select variables to use in modeling (select is a dplyr function)
#gbm does a good job of ignoring noise variables, but you will still
#get a better fit if you drop the junk
#(especially factor variables with lots of levels)
all = select(all
, Pclass
, Sex
, Age
, SibSp
, Parch
, Fare
, Embarked
)
#not many variables to choose from
#perform variable selection later
head(all)
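#one caveat (my addition, not in the original script): on R 4.0+, read.csv
#defaults to stringsAsFactors = FALSE, and gbm cannot handle character
#predictors, so convert any character columns (Sex, Embarked) to factors;
#this is a no-op on older R versions where they are already factors
all[] = lapply(all, function(col) if(is.character(col)) as.factor(col) else col)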
########################################################
########## The model #############
#as always, look at the help page for the function
?gbm
#a high guess of how many trees we'll need
ntrees = 5000
#how should we tune the parameters?
#in this video, we'll tune only the number of trees and
#use reasonable values for the other parameters
#test other parameter values with cross-validation
#(a small grid sketch follows below); see the other video on this topic
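#a rough cross-validation sketch (my addition, not the original workflow):
#the formula interface gbm() accepts cv.folds, so you could loop over a small
#grid of interaction.depth values and compare the cross-validated error;
#the grid values and 5 folds are illustrative assumptions, and this is slow,
#so it is left commented out
# cv_data = data.frame(Survived = survived, all[1:end_trn, ])
# for(depth in c(1, 3, 5)){
# cv_model = gbm(Survived ~ ., data = cv_data
# , distribution = "bernoulli", n.trees = ntrees
# , shrinkage = 0.01, interaction.depth = depth
# , n.minobsinnode = 10, cv.folds = 5, verbose = FALSE)
# print(c(depth = depth
# , best_trees = gbm.perf(cv_model, method = "cv", plot.it = FALSE)
# , cv_error = min(cv_model$cv.error)))
# }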
Model = gbm.fit(
x = all[1:end_trn,] #dataframe of features
, y = survived #dependent variable
#there are two ways to fit the model:
#use gbm.fit if you are going to specify x = and y = ,
#or use gbm() if you prefer a formula
#with lots of features, I find specifying x and y easier than writing a formula
, distribution = "bernoulli"
#use bernoulli for binary outcomes
#other values are "gaussian" for GBM regression
#or "adaboost"
, n.trees = ntrees
#choose this value to be large; we will then select the
#optimal number of trees after fitting (with gbm.perf)
, shrinkage = 0.01
#smaller values of shrinkage typically give slightly better performance
#the cost is that the model takes longer to run for smaller values
, interaction.depth = 3
#use cross validation to choose interaction depth!!
, n.minobsinnode = 10
#n.minobsinnode has an important effect on overfitting!
#decreasing this parameter increases the in-sample fit,
#but can result in overfitting
, nTrain = round(end_trn * 0.8)
#gbm.fit trains on the first nTrain rows and uses the remaining 20%
#as a validation set, which lets you select the number of trees at the end
# , var.monotone = c()
#forcing monotone effects can help with overfitting and will smooth bumpy curves
, verbose = TRUE #print the preliminary output
)
#look at the last model built
#Relative influence among the variables can be used in variable selection
summary(Model)
#If you see one variable that's much more important than all of the rest,
#that could be evidence of overfitting.
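#a small variable-selection sketch (my addition): summary() returns the
#relative influence as a data frame, so you could keep only the variables
#above some threshold and refit; the 1% cutoff is an illustrative assumption
rel_inf = summary(Model, plotit = FALSE)
keep_vars = as.character(rel_inf$var[rel_inf$rel.inf > 1])
keep_vars #candidate variable list for a refit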
#optimal number of trees, chosen on the held-out validation set
#(gbm.fit does no cross-validation, so gbm.perf uses the nTrain holdout here)
gbm.perf(Model)
#look at the effects of each variable, does it make sense?
?plot.gbm
for(i in 1:length(Model$var.names)){
plot(Model, i.var = i
, n.trees = gbm.perf(Model, plot.it = FALSE) #optimal number of trees (the argument is n.trees, not ntrees)
, type = "response" #to get fitted probabilities
)
}
###########################################
################ Make predictions ##################
#test set predictions
TestPredictions = predict(object = Model, newdata = all[(end_trn + 1):end, ]
, n.trees = gbm.perf(Model, plot.it = FALSE)
, type = "response") #to output a probability
#training set predictions
TrainPredictions = predict(object = Model, newdata = all[1:end_trn, ]
, n.trees = gbm.perf(Model, plot.it = FALSE)
, type = "response")
#round the predictions to zero or one
#in general, don't do this!
#it's only because the submissions for this competition had to be 0 or 1
TestPredictions = round(TestPredictions)
TrainPredictions = round(TrainPredictions)
#you could also try different cutoff values instead of 0.5
#you would need cross-validation or a holdout set to choose the best one
#(a small sketch follows below)
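#a rough sketch (my addition) of comparing cutoffs on the 20% validation
#slice that gbm.fit held out via nTrain (it trains on the first nTrain rows,
#so the remaining training rows act as a holdout); the cutoff grid is an
#illustrative assumption
val_rows = (round(end_trn * 0.8) + 1):end_trn
val_probs = predict(Model, newdata = all[val_rows, ]
, n.trees = gbm.perf(Model, plot.it = FALSE)
, type = "response")
cutoffs = seq(0.3, 0.7, by = 0.05)
data.frame(cutoff = cutoffs
, accuracy = sapply(cutoffs, function(ct) mean((val_probs > ct) == survived[val_rows])))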
head(TrainPredictions, n = 20)
head(survived, n = 20)
#in sample classification accuracy
1 - sum(abs(survived - TrainPredictions)) / length(TrainPredictions)
#depending upon the tuning parameters,
#I've gotten this as high as 99%, but that model
#resulted in lower test set scores
#to estimate out-of-sample accuracy, you need to score data that was
#set aside from training (a quick check follows below)
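#a quick check (my addition): accuracy of the rounded predictions on the
#validation slice that gbm.fit held out via nTrain
#(val_rows is defined in the cutoff sketch above)
mean(TrainPredictions[val_rows] == survived[val_rows])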
#write the submission (the competition expects the actual PassengerId values
#and a column named Survived)
submission = data.frame(PassengerId = test$PassengerId, Survived = TestPredictions)
write.csv(submission, file = "gbm submission.csv", row.names = FALSE)
#####################################################
#GBM vs Random Forests:
#https://www.nescent.org/wg/cart/images/9/91/Chapter6_March20.pdf
#it's difficult to know beforehand which method will perform best!
#"The jury is out on whether they are generally more powerful
#than Random Forests; sometimes they are, sometimes not."