-
Notifications
You must be signed in to change notification settings - Fork 2
/
rf.R
335 lines (265 loc) · 12.9 KB
/
rf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
############################################
## Use H2O to create a random forest
## against the entire data set in
## just a couple minutes
##
## This is a starter script, using defaults
## so it can be improved. And RF may not be
## the best algorithm for this problem. But
## the script shows that it can be done in R
## fairly quickly, in fact. And it will scale
## well to adding many more columns.
##
## To better fit MAE, the log of the
## target has been used: log1p/expm1
##
## The final step blends with the Marshall-Palmer
## benchmark 50/50.
##
## The API for h2o.randomForest is shown
## at the bottom of the script
###########################################
library(h2o)
library(data.table)
library(Metrics)
h2o.init(nthreads=-1)
ref_diff <- function(ref) {
diff <-
c(ref[1:length(ref) - 1]) -
c(ref[2:length(ref)])
return(mean(diff))
}
mpalmer <- function(ref, minutes_past) {
# order reflectivity values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
ref <- ref[sort_min_index]
# calculate the length of time for which each reflectivity value is valid
valid_time <-
c(minutes_past[-length(minutes_past)], 60) -
c(0, minutes_past[-length(minutes_past)])
valid_time = valid_time / 60
# calculate hourly rain rates using marshall-palmer weighted by valid times
return(sum(((10^(ref/10))/200)^0.625*valid_time, na.rm=TRUE))
}
rate_kdp <- function(Kdp, minutes_past) {
# order Kdp values and minutes_past
sort_min_index = order(minutes_past)
minutes_past <- minutes_past[sort_min_index]
Kdp <- Kdp[sort_min_index]
# calculate the length of time for which each Kdp value is valid
valid_time <-
c(minutes_past[-length(minutes_past)], 60) -
c(0, minutes_past[-length(minutes_past)])
valid_time = valid_time / 60
# calculate hourly rain rates using S and Z formula weighted by valid times
return(sum((sign(Kdp)*(4.06)*(abs(Kdp) **.0866)*valid_time), na.rm=TRUE))
}
print(paste("reading training file:",Sys.time()))
train<-fread("../train.csv",select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,24))
#Cut off outliers of Expected >= 69
train <- subset(train, Expected < 69)
summary(train)
# train$dt <- time_difference(train$minutes_past)
# train$mp5 <- marshall_palmer(train$Ref_5x5_50th)
# train$mp9 <- marshall_palmer(train$Ref_5x5_90th)
#Cut off Ref values < 0
train$Ref_5x5_50th[which(train$Ref_5x5_10th < 0)] <- NA
train$Ref_5x5_50th[which(train$Ref_5x5_50th < 0)] <- NA
train$Ref_5x5_90th[which(train$Ref_5x5_90th < 0)] <- NA
train$RefComposite[which(train$RefComposite < 0)] <- NA
train$RefComposite_5x5_10th[which(train$RefComposite_5x5_10th < 0)] <- NA
train$RefComposite_5x5_50th[which(train$RefComposite_5x5_50th < 0)] <- NA
train$RefComposite_5x5_90th[which(train$RefComposite_5x5_90th < 0)] <- NA
train$Ref[which(train$Ref < 0)] <- NA
cor(train, use = "pairwise.complete.obs")
summary(train)
trainHex<-as.h2o(train[,.(
dist = mean(radardist_km, na.rm = T),
refArea1 = mean(Ref_5x5_10th, na.rm = T),
varRefArea1 = var(Ref_5x5_10th, na.rm = T),
refArea5 = mean(Ref_5x5_50th, na.rm = T),
varRefArea5 = var(Ref_5x5_50th, na.rm = T),
refArea9 = mean(Ref_5x5_90th, na.rm = T),
varRefArea9 = var(Ref_5x5_90th, na.rm = T),
meanRefcomp = mean(RefComposite,na.rm = T),
varRefcomp = var(RefComposite,na.rm = T),
meanRefcomp1 = mean(RefComposite_5x5_10th,na.rm = T),
#varRefcomp1 = var(RefComposite_5x5_10th,na.rm = T),
meanRefcomp5 = mean(RefComposite_5x5_50th,na.rm = T),
varRefcomp5 = var(RefComposite_5x5_50th,na.rm = T),
meanRefcomp9 = mean(RefComposite_5x5_90th,na.rm = T),
varRefcomp9 = var(RefComposite_5x5_90th,na.rm = T),
rhoHV = mean(RhoHV, na.rm = T),
rhoHV1 = mean(RhoHV_5x5_10th, na.rm = T),
rhoHV5 = mean(RhoHV_5x5_50th, na.rm = T),
rhoHV9 = mean(RhoHV_5x5_90th, na.rm = T),
zdr = mean(Zdr, na.rm = T),
zdr1 = mean(Zdr_5x5_10th, na.rm = T),
zdr5 = mean(Zdr_5x5_50th, na.rm = T),
zdr9 = mean(Zdr_5x5_90th, na.rm = T),
#kdp = rate_kdp(Kdp, minutes_past),
target = log1p(mean(Expected)),
meanRef = mean(Ref,na.rm = T),
varRef = var(Ref, na.rm = T),
sumRef = sum(Ref,na.rm = T),
yy1 = mean(Ref,na.rm = T) / mean(radardist_km, na.rm = T),
#yy2 = mean(Ref_5x5_90th, na.rm = T) - mean(Ref_5x5_50th, na.rm = T),
yy3 = mean(Ref_5x5_90th,na.rm = T) / mean(radardist_km, na.rm = T),
yy4 = mean(Ref_5x5_50th,na.rm = T) / mean(radardist_km, na.rm = T),
yy5 = mean(RefComposite,na.rm = T) / mean(radardist_km, na.rm = T),
yy6 = mean(RefComposite_5x5_90th,na.rm = T) / mean(radardist_km, na.rm = T),
yy7 = mean(RefComposite_5x5_50th,na.rm = T) / mean(radardist_km, na.rm = T),
yy8 = ref_diff(Ref),
yy9 = ref_diff(Ref_5x5_50th),
yy10 = ref_diff(Ref_5x5_90th),
yy11 = ref_diff(RefComposite),
yy12 = ref_diff(RefComposite_5x5_50th),
yy13 = ref_diff(RefComposite_5x5_90th),
records = .N,
naCounts = sum(is.na(Ref))
#mp50 = mpalmer(RefComposite_5x5_50th, minutes_past),
#mp90 = mpalmer(RefComposite_5x5_90th, minutes_past),
#mp = mpalmer(Ref, minutes_past)
),Id][records>naCounts,],destination_frame="train.hex")
summary(trainHex)
rfHex<-h2o.randomForest(x=c("dist", "refArea1", "varRefArea1", "refArea5", "varRefArea5", "refArea9", "varRefArea9",
"meanRefcomp", "varRefcomp","meanRefcomp1","meanRefcomp5", "varRefcomp5","meanRefcomp9", "varRefcomp9",
"rhoHV","rhoHV1","rhoHV5","rhoHV9",
"zdr", "zdr1", "zdr5", "zdr9",
#"kdp",
#"mp50","mp90","mp",
"meanRef", "varRef", "sumRef", "records","naCounts",
"yy1", "yy3", "yy4", "yy5", "yy6", "yy7","yy8","yy9","yy10","yy11","yy12","yy13"
),
y="target",training_frame=trainHex,model_id="rfStarter.hex", ntrees=500, sample_rate = 0.7)
rfHex
h2o.varimp(rfHex)
rm(train)
test<-fread("../test.csv",select=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20))
# test$dt <- time_difference(te_raw$minutes_past)
# test$mp5 <- marshall_palmer(te_raw$Ref_5x5_50th)
# test$mp9 <- marshall_palmer(te_raw$Ref_5x5_90th)
#Cut off Ref values < 0
test$Ref_5x5_50th[which(test$Ref_5x5_50th < 0)] <- NA
test$Ref_5x5_90th[which(test$Ref_5x5_90th < 0)] <- NA
test$RefComposite[which(test$RefComposite < 0)] <- NA
test$RefComposite_5x5_10th[which(test$RefComposite_5x5_10th < 0)] <- NA
test$RefComposite_5x5_50th[which(test$RefComposite_5x5_50th < 0)] <- NA
test$RefComposite_5x5_90th[which(test$RefComposite_5x5_90th < 0)] <- NA
test$Ref[which(test$Ref < 0)] <- NA
testHex<-as.h2o(test[,.(
dist = mean(radardist_km, na.rm = T),
refArea1 = mean(Ref_5x5_10th, na.rm = T),
varRefArea1 = var(Ref_5x5_10th, na.rm = T),
refArea5 = mean(Ref_5x5_50th, na.rm = T),
varRefArea5 = var(Ref_5x5_50th, na.rm = T),
refArea9 = mean(Ref_5x5_90th, na.rm = T),
varRefArea9 = var(Ref_5x5_90th, na.rm = T),
meanRefcomp = mean(RefComposite,na.rm=T),
varRefcomp = var(RefComposite,na.rm = T),
meanRefcomp1 = mean(RefComposite_5x5_10th,na.rm = T),
#varRefcomp1 = var(RefComposite_5x5_10th,na.rm = T),
meanRefcomp5 = mean(RefComposite_5x5_50th,na.rm = T),
varRefcomp5 = var(RefComposite_5x5_50th,na.rm = T),
meanRefcomp9 = mean(RefComposite_5x5_90th,na.rm = T),
varRefcomp9 = var(RefComposite_5x5_90th,na.rm = T),
rhoHV = mean(RhoHV, na.rm = T),
rhoHV1 = mean(RhoHV_5x5_10th, na.rm = T),
rhoHV5 = mean(RhoHV_5x5_50th, na.rm = T),
rhoHV9 = mean(RhoHV_5x5_90th, na.rm = T),
zdr = mean(Zdr, na.rm = T),
zdr1 = mean(Zdr_5x5_10th, na.rm = T),
zdr5 = mean(Zdr_5x5_50th, na.rm = T),
zdr9 = mean(Zdr_5x5_90th, na.rm = T),
#kdp = rate_kdp(Kdp, minutes_past),
meanRef = mean(Ref,na.rm=T),
varRef = var(Ref, na.rm=T),
sumRef = sum(Ref,na.rm=T),
yy1 = mean(Ref,na.rm = T) / mean(radardist_km, na.rm = T),
#yy2 = mean(Ref_5x5_90th, na.rm = T) - mean(Ref_5x5_50th, na.rm = T),
yy3 = mean(Ref_5x5_90th,na.rm = T) / mean(radardist_km, na.rm = T),
yy4 = mean(Ref_5x5_50th,na.rm = T) / mean(radardist_km, na.rm = T),
yy5 = mean(RefComposite,na.rm = T) / mean(radardist_km, na.rm = T),
yy6 = mean(RefComposite_5x5_90th,na.rm = T) / mean(radardist_km, na.rm = T),
yy7 = mean(RefComposite_5x5_50th,na.rm = T) / mean(radardist_km, na.rm = T),
yy8 = ref_diff(Ref),
yy9 = ref_diff(Ref_5x5_50th),
yy10 = ref_diff(Ref_5x5_90th),
yy11 = ref_diff(RefComposite),
yy12 = ref_diff(RefComposite_5x5_50th),
yy13 = ref_diff(RefComposite_5x5_90th),
records = .N,
naCounts = sum(is.na(Ref))
#mp50 = mpalmer(RefComposite_5x5_50th, minutes_past),
#mp90 = mpalmer(RefComposite_5x5_90th, minutes_past),
#mp = mpalmer(Ref, minutes_past)
),Id],destination_frame="test.hex")
summary(testHex)
submission<-fread("../sample_solution.csv")
predictions<-as.data.frame(h2o.predict(rfHex,testHex))
submission$Expected<- 0.9 * expm1(predictions$predict) + 0.1 * submission$Expected
#convert expected values to 0.01in values
submission$Expected <- round(submission$Expected / 0.254) * 0.254
summary(submission)
write.csv(submission,"rfv3cn3.csv",row.names=F)
####################################################################################
## Appendix: h2o.randomForest API
####################################################################################
##h2o.randomForest(x, y, training_frame, model_id, validation_frame, checkpoint,
## mtries = -1, sample_rate = 0.632, build_tree_one_node = FALSE,
## ntrees = 50, max_depth = 20, min_rows = 1, nbins = 20,
## nbins_cats = 1024, binomial_double_trees = FALSE,
## balance_classes = FALSE, max_after_balance_size = 5, seed,
## offset_column = NULL, weights_column = NULL, nfolds = 0,
## fold_column = NULL, fold_assignment = c("AUTO", "Random", "Modulo"),
## keep_cross_validation_predictions = FALSE, ...)
## Arguments
## x
## A vector containing the names or indices of the predictor variables to use in building the GBM model.
## y
## The name or index of the response variable. If the data does not contain a header, this is the column index number starting at 1, and increasing from left to right. (The response must be either an integer or a categorical variable).
## training_frame
## An H2OFrame object containing the variables in the model.
## model_id
## (Optional) The unique id assigned to the resulting model. If none is given, an id will automatically be generated.
## validation_frame
## An H2OFrame object containing the variables in the model.
## checkpoint
## "Model checkpoint (either key or H2ODeepLearningModel) to resume training with."
## mtries
## Number of variables randomly sampled as candidates at each split. If set to -1, defaults to sqrtp for classification, and p/3 for regression, where p is the number of predictors.
## sample_rate
## Sample rate, from 0 to 1.0. (edit: row sampling, per tree)
## build_tree_one_node
## Run on one node only; no network overhead but fewer cpus used. Suitable for small datasets.
## ntrees
## A nonnegative integer that determines the number of trees to grow.
## max_depth
## Maximum depth to grow the tree.
## min_rows
## Minimum number of rows to assign to teminal nodes.
## nbins
## For numerical columns (real/int), build a histogram of this many bins, then split at the best point.
## nbins_cats
## For categorical columns (enum), build a histogram of this many bins, then split at the best point. Higher values can lead to more overfitting.
## binomial_double_trees
## For binary classification: Build 2x as many trees (one per class) - can lead to higher accuracy.
## balance_classes
## logical, indicates whether or not to balance training data class counts via over/under-sampling (for imbalanced data)
## max_after_balance_size
## Maximum relative size of the training data after balancing class counts (can be less than 1.0)
## seed
## Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded
## offset_column
## Specify the offset column.
## weights_column
## Specify the weights column.
## nfolds
## (Optional) Number of folds for cross-validation. If nfolds >= 2, then validation must remain empty.
## fold_column
## (Optional) Column with cross-validation fold index assignment per observation
## fold_assignment
## Cross-validation fold assignment scheme, if fold_column is not specified Must be "AUTO", "Random" or "Modulo"
## keep_cross_validation_predictions
## Whether to keep the predictions of the cross-validation models