-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.r
144 lines (124 loc) · 3.67 KB
/
run_analysis.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
## run_analysis.r
## This script will perform the following steps on the UCI HAR Dataset downloaded from
## https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
## 1. Merges the training and the test sets to create one data set.
## 2. Extracts only the measurements on the mean and standard deviation for each measurement.
## 3. Uses descriptive activity names to name the activities in the data set
## 4. Appropriately labels the data set with descriptive activity names.
## 5. Creates a second, independent tidy data set with the average of each variable for each activity and each subject.
## install.packages("reshape2")
library("reshape2")
## install.packages("plyr")
library(plyr)
## Locate the unpacked dataset folder and the output file, both relative to
## the current working directory, then echo the two paths to the console.
dir.data <- paste0(getwd(), "/UCI HAR Dataset/")
file.output <- paste0(getwd(), "/tidy_data.txt")
print(dir.data)
print(file.output)
## Import Reference Files
### Activity
## Read the activity lookup table (id, label) and show its structure.
ref.activity <- read.table(
  paste0(dir.data, "activity_labels.txt"),
  col.names = c("activity_id", "activity_label")
)
str(ref.activity)
## Keep only the label column. Labels are looked up by position further down
## (labels[activity_id]), so the id column itself is not carried along.
ref.activity <- ref.activity[["activity_label"]]
## Import the feature reference file
## Read the feature lookup table (id, name) and show its structure.
ref.features <- read.table(
  paste0(dir.data, "features.txt"),
  col.names = c("feature_id", "feature_name")
)
str(ref.features)
## Keep only the name column; it is used below both to label the measurement
## columns and to decide which of them to retain.
ref.features <- ref.features[["feature_name"]]
## Create a logical vector identifying features to keep.
## This will be used to subset our train/test sets.
## Keep every feature whose name contains "-mean()" or "-std()". No trailing
## "-" is required after the parentheses, so features without an axis suffix
## (e.g. "tBodyAccMag-mean()") are kept alongside the "-X"/"-Y"/"-Z" ones.
## The escaped "(" right after "mean" still excludes "-meanFreq()".
features.keep <- grepl("-(mean|std)\\(\\)", ref.features)
## Scrub the testing data.
### Import X
## The measurement file is named "X_test.txt" (capital X) in the UCI HAR
## archive; the lowercase spelling only resolves on case-insensitive file
## systems.
x.test <- read.table(paste0(dir.data, "test/X_test.txt"))
str(x.test)
### Limit to selected features
## Fail early if the feature list does not describe every measurement column,
## instead of ending up with silently mislabelled columns.
stopifnot(ncol(x.test) == length(ref.features))
names(x.test) <- ref.features
## drop = FALSE keeps a data frame even if only one feature is selected.
x.test <- x.test[, features.keep, drop = FALSE]
### Import Y
## Activity ids for the test set; show the structure as read.
y.test <- read.table(
  paste0(dir.data, "test/y_test.txt"),
  col.names = "activity_id"
)
str(y.test)
### Merge on the label
## Positional lookup: each activity_id indexes into the label vector.
y.test[["activity_label"]] <- ref.activity[y.test[["activity_id"]]]
### Import subjects
## Subject ids for the test set; show the structure as read.
subject.test <- read.table(
  paste0(dir.data, "test/subject_test.txt"),
  col.names = "subject_id"
)
str(subject.test)
### Merge into one set
## Column order: subject id, activity id and label, then the selected
## measurements. The first rows are shown as a quick sanity check.
data.test <- cbind(subject.test, y.test, x.test)
head(data.test)
## Scrub the training data.
### Import X
## The measurement file is named "X_train.txt" (capital X) in the UCI HAR
## archive; the lowercase spelling only resolves on case-insensitive file
## systems.
x.train <- read.table(paste0(dir.data, "train/X_train.txt"))
str(x.train)
## Fail early if the feature list does not describe every measurement column,
## instead of ending up with silently mislabelled columns.
stopifnot(ncol(x.train) == length(ref.features))
names(x.train) <- ref.features
### Limit to selected features
## drop = FALSE keeps a data frame even if only one feature is selected.
x.train <- x.train[, features.keep, drop = FALSE]
### Import Y
## Activity ids for the training set; show the structure as read.
y.train <- read.table(
  paste0(dir.data, "train/y_train.txt"),
  col.names = "activity_id"
)
str(y.train)
### Merge on the label
## Positional lookup: each activity_id indexes into the label vector.
y.train[["activity_label"]] <- ref.activity[y.train[["activity_id"]]]
### Import subjects
## Subject ids for the training set; show the structure as read.
subject.train <- read.table(
  paste0(dir.data, "train/subject_train.txt"),
  col.names = "subject_id"
)
str(subject.train)
### Merge into one set
## Column order: subject id, activity id and label, then the selected
## measurements. The first rows are shown as a quick sanity check.
data.train <- cbind(subject.train, y.train, x.train)
head(data.train)
### Append training and testing sets
## Stack the two sets row-wise, training rows first.
analysis.set <- rbind(data.train, data.test)
## Separate the identifier columns (dimensions) from the measurement columns
## (measures) so both lists can be handed to "melt". Each list is echoed to
## the console.
id.labels <- c("subject_id", "activity_id", "activity_label")
print(id.labels)
## Every column that is not an identifier is treated as a measure.
data.labels <- setdiff(names(analysis.set), id.labels)
print(data.labels)
## Melt the data so we can more easily aggregate
## Produces one row per (subject, activity, feature) observation, with the
## measurement name in "feature" and its reading in the default "value"
## column. The identifier argument is spelled out in full (id.vars) rather
## than relying on partial argument matching of "id".
data.melt <- melt(
  analysis.set,
  id.vars = id.labels,
  measure.vars = data.labels,
  variable.name = "feature"
)
## Aggregate: average the melted values within every combination of the
## identifier columns and "feature", storing the result in "avg".
data.tidy <- ddply(
  .data = data.melt,
  .variables = c(id.labels, "feature"),
  .fun = summarize,
  avg = mean(value)
)
## Output
## Write the aggregated table to the output path, without row names.
write.table(
  x = data.tidy,
  file = file.output,
  row.names = FALSE
)