# pml_mach_learn.r
setwd('/home/khalid/Documents/03 - Courses/Practical Machine Learning/barbell project/data')
training_raw = read.csv('pml-training.csv')
testing_raw = read.csv('pml-testing.csv')
# head(training_raw)
# summary(training_raw)
# str(training_raw)
# NB: As the model will be tested against the test set, consider only those attributes which are
# present in the test set and which show some variation there (and can therefore hold
# classification value). This also usefully reduces the number of attributes.
summary(testing_raw)
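# (Optional sanity check, an addition to the original analysis: many candidate
# columns are entirely NA in the test set; an all-NA column becomes a factor
# with zero levels, so the factor-level filter below drops these along with
# any constant columns.)
sum(colSums(is.na(testing_raw)) == nrow(testing_raw))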
####### PRE-PROCESSING #############
# Convert all variables in the test set to factors
testing_as_factors <- lapply(testing_raw, factor)
# Get the number of factor levels for each column
num_factor_levels <- sapply(testing_as_factors, nlevels)
# Keep only those columns which have >1 factor level (so some classification value);
# also remove the row-number variable (the first column)
training_proc <- training_raw[, num_factor_levels > 1]
training_proc <- training_proc[, -1]
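# (Optional check: confirm how many attributes survive the filtering.
# The standard pml-training.csv has 160 raw columns, so the reduction
# should be substantial.)
dim(training_raw)
dim(training_proc)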
####### DATA-SPLITTING #############
# Create a test set (10%), a training set (70%),
# and a cross-validation set (20%). The test set is not
# used in any of the training procedures
library(caret)
set.seed(42)
inTraining <- createDataPartition(training_proc$classe, p = 0.9, list=FALSE)
barbell_train <- training_proc[inTraining,]
barbell_test <- training_proc[-inTraining,]
set.seed(42)
# p = 0.2/0.9 so that the validation split is 20% of the full data set
inValidation <- createDataPartition(barbell_train$classe, p=(0.2/0.9), list=FALSE)
barbell_valid <- barbell_train[inValidation,]
barbell_train <- barbell_train[-inValidation,]
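# (Optional check: verify the split is roughly 70/20/10.)
c(train = nrow(barbell_train),
  valid = nrow(barbell_valid),
  test  = nrow(barbell_test)) / nrow(training_proc)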
####### TRAIN-MODEL #############
# Train a model on the training set using all of the remaining predictors
# Using a random forest model (suggested in the lectures as a reasonable choice)
set.seed(42)
modFit <- train(classe ~ ., method='rf', data=barbell_train)
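# (Alternative sketch, not used in this analysis: caret's default bootstrap
# resampling is slow for random forests; explicit k-fold cross-validation via
# trainControl is a common way to speed this step up.)
# modFit <- train(classe ~ ., method='rf', data=barbell_train,
#                 trControl=trainControl(method='cv', number=5))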
####### CHECK ACCURACY ##########
# First check on the training set
# Print the standard finalModel output, which gives a confusion matrix as well as other performance metrics
modFit$finalModel
# This shows just a single misclassification error
# Running the training data back through the prediction suggests there is no (0) classification error
train_prediction <- predict(modFit, newdata=barbell_train)
confusionMatrix(data=train_prediction, barbell_train$classe)
# Then check on the validation set
validation_prediction <- predict(modFit, newdata=subset(barbell_valid, select=-c(classe)))
confusionMatrix(data=validation_prediction, barbell_valid$classe)
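# (Optional: extract the accuracy figure programmatically rather than
# reading it off the printed confusion matrix.)
confusionMatrix(data=validation_prediction, barbell_valid$classe)$overall['Accuracy']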
# This again gives zero misclassification errors
# Finally, as the results above are so positive, try the held-out test set
test_prediction <- predict(modFit, newdata=subset(barbell_test, select=-c(classe)))
confusionMatrix(data=test_prediction, barbell_test$classe)
# Here there was a single misclassification error, but with an accuracy of 99.95% it would be
# unlucky for the model to fail on any of the 20 real test examples
####### MAKE PREDICTIONS ##########
# Extract the relevant fields of the real test data, applying the same column filtering:
testing_proc <- testing_raw[, num_factor_levels > 1]
testing_proc <- testing_proc[, -1]
real_test_prediction <- predict(modFit, newdata=testing_proc)
real_test_prediction
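# (Submission sketch, assuming the course's one-file-per-prediction format;
# the helper name and filenames are illustrative, not part of the original script.)
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    write.table(x[i], file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(real_test_prediction)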