-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
167 lines (95 loc) · 4.94 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
################################################################################
#IMPORT DATASETS TO R; USE THEIR ORIGINAL NAMES IN R
################################################################################
#Lookup tables: activity_labels is later joined on its first column (V1) to
#the activity codes; the second column of features supplies the column names
#(sep = "" means fields are separated by any run of white space)
activity_labels <- read.table(file = "./UCI HAR Dataset/activity_labels.txt",
                              sep = "")
features <- read.table(file = "./UCI HAR Dataset/features.txt", sep = "")
#Training partition: subject ids, measurements and activity codes
subject_train <- read.table(file = "./UCI HAR Dataset/train/subject_train.txt",
                            sep = "")
X_train <- read.table(file = "./UCI HAR Dataset/train/X_train.txt", sep = "")
y_train <- read.table(file = "./UCI HAR Dataset/train/y_train.txt", sep = "")
#Test partition: same three files as for the training partition
subject_test <- read.table(file = "./UCI HAR Dataset/test/subject_test.txt",
                           sep = "")
X_test <- read.table(file = "./UCI HAR Dataset/test/X_test.txt", sep = "")
y_test <- read.table(file = "./UCI HAR Dataset/test/y_test.txt", sep = "")
################################################################################
#Merge the training and the test sets to create one data set.
################################################################################
#Activity codes (y_*): replace the one-column data frames by factors
y_train <- factor(y_train[[1]])
y_test <- factor(y_test[[1]])
#Subject ids (subject_*): replace the one-column data frames by factors
subject_train <- factor(subject_train[[1]])
subject_test <- factor(subject_test[[1]])
#Training rows: subject id, training indicator (TRUE) and activity code are
#placed in front of the measurement columns
#(check.names = FALSE keeps the column names exactly as supplied)
trainingset <- data.frame(subject_id = subject_train,
                          training_ind = TRUE,
                          activity = y_train,
                          X_train,
                          check.names = FALSE)
#Test rows: same layout, with the training indicator set to FALSE
testset <- data.frame(subject_id = subject_test,
                      training_ind = FALSE,
                      activity = y_test,
                      X_test,
                      check.names = FALSE)
#Stack both partitions, training rows first, into a single data set
onedata_temp <- rbind(trainingset, testset)
################################################################################
#Appropriately label the data set with descriptive variable names.
################################################################################
#Columns 1-3 are subject_id, training_ind and activity; columns 4 to 564 are
#the measurements and take their names from the second column of features
names(onedata_temp)[4:564] <- as.character(features[[2]])
#Show the resulting column names for inspection
names(onedata_temp)
################################################################################
#Use descriptive activity names to name the activities in the data set
################################################################################
#Join the label table to the data on the activity code
onedata <- merge(x = onedata_temp,
                 y = activity_labels,
                 by.x = "activity",
                 by.y = "V1")
#--> Careful: the join key (activity) is now column 1, so subject_id has moved
#    to column 2; the label arrives as the last column (565)
#Cross-tabulate activity code against label to verify the join
table(onedata[[1]], onedata[[565]])
#Give the label column a descriptive name
names(onedata)[565] <- "activity_label"
################################################################################
#Extract only the measurements on the mean and standard deviation
#for each measurement.
################################################################################
#Positions (row numbers in features) of the angle variables. They are excluded
#because they are not means although their names include "mean"
to_exclude <- grep("angle", tolower(features[, 2]))
#Positions of the variables whose name contains "mean".
#The match is done on the FULL feature vector and the angle positions are
#removed afterwards, so the result is always a position in features. Matching
#on the already filtered vector would only give correct positions while the
#excluded variables sit at the very end of the list.
#NOTE(review): "mean" is a plain substring match, so every non-angle variable
#whose name contains it is kept -- confirm that this is intended
mean_features <- setdiff(grep("mean", tolower(features[, 2])), to_exclude)
#Positions of the variables whose name contains "std", same approach
std_features <- setdiff(grep("std", tolower(features[, 2])), to_exclude)
#Concatenate to single vector and sort
means_stds <- sort(c(mean_features, std_features))
#Create dataset containing necessary ids and means and stds of features.
#Columns 2:3 are subject_id and training_ind, column 565 is the activity
#label; feature i is stored in column i + 3 of onedata
onedata_means_stds <- onedata[, c(2:3, 565, means_stds + 3)]
################################################################################
#Create a second, independent tidy data set with
#the average of each variable for each activity and each subject.
################################################################################
#Rearrange columns in onedata: subject_id, training_ind, activity_label and
#then the 561 measurement columns; the numeric activity code (column 1) is
#dropped
onedata <- onedata[, c(2:3, 565, 4:564)]
#Aggregate by computing the mean of every measurement column for each
#combination of subject-id, training indicator and activity label
#NOTE(review): this averages all measurement columns of onedata, not only the
#mean/std subset stored in onedata_means_stds -- confirm that this is intended
temp <- aggregate(onedata[, 4:564],
                  by = list(subject_id = onedata[, 1],
                            training_ind = onedata[, 2],
                            activity_label = onedata[, 3]),
                  FUN = mean)
#Sort the resulting data by numeric subject-id, then by activity label
means_by_subject_activity <- temp[order(as.numeric(as.character(temp[, 1])),
                                        temp[, 3]), ]
#Check duplicates: TRUE when no id combination occurs in more than one row.
#anyDuplicated() compares rows; length() of a data frame is its number of
#columns, so comparing lengths could never detect a duplicated row
anyDuplicated(means_by_subject_activity[, 1:3]) == 0
#Rename the measurement columns with an "avg_" prefix to avoid confusion with
#the original variables; the three id columns keep their names
names(means_by_subject_activity)[4:564] <-
  paste("avg", names(means_by_subject_activity)[4:564], sep = "_")
#Remove temporary objects
rm(temp, onedata_temp)