-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.dt.R
76 lines (66 loc) · 3.44 KB
/
run_analysis.dt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Coursera Getting and Cleaning Data
# Course Project
#
# Usage description:
# source the script
#
# Objectives:
# You should create one R script called run_analysis.R that does the following.
# 1. Merges the training and the test sets to create one data set.
# 2. Extracts only the measurements on the mean and standard deviation
# for each measurement.
# 3. Uses descriptive activity names to name the activities in the data set
# 4. Appropriately labels the data set with descriptive variable names.
# 5. From the data set in step 4, creates a second, independent tidy data
# set with the average of each variable for each activity and each subject.
source('common.R') # Common external data retrival file
getFile("https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip",
".",
"power_consumption.zip",
unzip = TRUE)
depends.on('data.table')
# ###############################################################
# Combine all 3 information parts into 1 table
# - X file (Actual observation)
# - Y file (Activity)
# - Subject file (The subject id observed)
#
# Argument
# type - The type of information to process
# Possible values: test | train
#
# Return value
# A data.frame with with 3 additional columns(type, subject, activity)
#
# Dependencies on parent closure (to avoid longer than necessary function signature)
# activity - A vector of activity labels (Order corresponds to activity value)
# col.class - A vector of classes (length must be equal to number of observational columns )
# Allows to filter out unwanted columns
# col.names - A vector of feature columns we are interested in
construct <- function(type) {
obs <- data.table(read.table(file.path("./UCI HAR Dataset", type, paste0("X_", type, ".txt")), colClasses = col.class))
act <- scan(file.path("./UCI HAR Dataset", type, paste0("Y_", type, ".txt")), quiet = T)
sbj <- scan(file.path("./UCI HAR Dataset", type, paste0("subject_", type, ".txt")), quiet = T)
setnames(obs, names(obs),col.names)
obs[, c("subject", "activity", "type") := list(
sbj,
sapply(act, function(act.no) { tolower(activity[act.no]) }),
type)]
# Re-order the data.frame to have the introduced columns first
setcolorder(obs, c('subject', 'activity', 'type', col.names) )
}
# ###############################################################
# Main
activity <- read.table("./UCI HAR Dataset/activity_labels.txt", colClasses = c("NULL", "character"))[,1]
feature <- read.table("./UCI HAR Dataset/features.txt", colClasses = c("NULL", "character"))[,1]
# Deduce columns with mean() and std() ONLY. Create a col.class mask to filter out unecessary columns
select.feature <- grep("(mean|std)\\()", feature, value = TRUE)
col.class <- sapply(feature, function(ft) { unname(ifelse(ft %in% select.feature, "numeric", "NULL")) } )
# Column names quite explanatory. I only remove the () from the name, which I dislike.
col.names <- gsub("BodyBody", "Body", gsub("\\()", "",select.feature))
# Step 1 throuh 4: Merged train & test with properly named columns
merged <- rbindlist(list(construct("train"), construct("test")))
# Step 5: tidy dataset averaged per activity/subject
setkey(merged, subject, activity)
per.activity.subject <- merged[, lapply(.SD, mean), by = c("subject", "activity", "type")]
write.table(per.activity.subject, "tidy.txt", row.names = FALSE, quote = FALSE)