Skip to content

Latest commit



357 lines (254 loc) · 8.84 KB

File metadata and controls

357 lines (254 loc) · 8.84 KB



Load packages.

# Attach the packages the analysis depends on. reshape2 is attached after
# data.table, so its melt() and dcast() mask the data.table versions.
packages <- c("data.table", "reshape2")
# require() only returns FALSE when a package is missing, so check the result
# and fail here rather than at the first data.table call further down.
# vapply() always yields a named logical vector, whatever the input.
loaded <- vapply(packages, require, logical(1),
                 character.only = TRUE, quietly = TRUE)
if (!all(loaded)) {
  stop("Missing required package(s): ",
       paste(packages[!loaded], collapse = ", "), call. = FALSE)
}
loaded
## Warning: package 'data.table' was built under R version 3.2.3
## data.table 1.9.6  For help type ?data.table or
## The fastest way to learn (by data.table authors):
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##     dcast, melt
## data.table   reshape2 
##       TRUE       TRUE

Set path.

# Base directory for the download and the unpacked data: the current working
# directory.
path <- getwd()
## [1] "/Users/ricadomanhaes/Documents/Coursera/Data_Science/Getting_and_Cleaning_Data/Project"

Get the data

Download the file. Put it in the working directory (path).

# NOTE(review): `url` and `f` are empty strings here, so download.file() cannot
# succeed as written. Presumably the archive URL and the local zip file name
# were lost from this listing -- fill them in before running.
url <- ""
f <- ""
# Create the target directory only when it does not exist yet.
if (!file.exists(path)) {
  dir.create(path)
}
# The download is a zip archive; mode = "wb" writes it as binary so it is not
# corrupted on Windows, where the default mode is text.
download.file(url, file.path(path, f), mode = "wb")

Unzip the file.


The archive put the files in a folder named UCI HAR Dataset. Set this folder as the input path.

# Directory that holds the unpacked data set, followed by a recursive listing
# of everything inside it.
pathIn <- file.path(path, "UCI HAR Dataset")
list.files(path = pathIn, recursive = TRUE)
##  [1] "activity_labels.txt"                         
##  [2] "features_info.txt"                           
##  [3] "features.txt"                                
##  [4] "README.txt"                                  
##  [5] "test/Inertial Signals/body_acc_x_test.txt"   
##  [6] "test/Inertial Signals/body_acc_y_test.txt"   
##  [7] "test/Inertial Signals/body_acc_z_test.txt"   
##  [8] "test/Inertial Signals/body_gyro_x_test.txt"  
##  [9] "test/Inertial Signals/body_gyro_y_test.txt"  
## [10] "test/Inertial Signals/body_gyro_z_test.txt"  
## [11] "test/Inertial Signals/total_acc_x_test.txt"  
## [12] "test/Inertial Signals/total_acc_y_test.txt"  
## [13] "test/Inertial Signals/total_acc_z_test.txt"  
## [14] "test/subject_test.txt"                       
## [15] "test/X_test.txt"                             
## [16] "test/y_test.txt"                             
## [17] "train/Inertial Signals/body_acc_x_train.txt" 
## [18] "train/Inertial Signals/body_acc_y_train.txt" 
## [19] "train/Inertial Signals/body_acc_z_train.txt" 
## [20] "train/Inertial Signals/body_gyro_x_train.txt"
## [21] "train/Inertial Signals/body_gyro_y_train.txt"
## [22] "train/Inertial Signals/body_gyro_z_train.txt"
## [23] "train/Inertial Signals/total_acc_x_train.txt"
## [24] "train/Inertial Signals/total_acc_y_train.txt"
## [25] "train/Inertial Signals/total_acc_z_train.txt"
## [26] "train/subject_train.txt"                     
## [27] "train/X_train.txt"                           
## [28] "train/y_train.txt"

Read the files

Read the subject files.

# Read a text file with read.table() and return its contents as a data.table.
#
# f: path of the file to read (passed unchanged to read.table()).
# Returns: the data.table built from the data.frame read.table() produced.
fileToDataTable <- function (f) {
  df <- read.table(f)
  data.table(df)
}
# Subject files, one per partition (train / test).
dtSubjectTrain <- fileToDataTable(
  file.path(pathIn, "train", "subject_train.txt")
)
dtSubjectTest <- fileToDataTable(
  file.path(pathIn, "test", "subject_test.txt")
)

Read the label files.

# Label files (y_*.txt), one per partition (train / test).
dtActivityTrain <- fileToDataTable(
  file.path(pathIn, "train", "y_train.txt")
)
dtActivityTest <- fileToDataTable(
  file.path(pathIn, "test", "y_test.txt")
)

Read the data files.

# Measurement files (X_*.txt), one per partition (train / test).
dtTrain <- fileToDataTable(
  file.path(pathIn, "train", "X_train.txt")
)
dtTest <- fileToDataTable(
  file.path(pathIn, "test", "X_test.txt")
)

Read the key files.

# Lookup tables: feature list and activity labels.
dtFeatures <- fileToDataTable(
  file.path(pathIn, "features.txt")
)
dtActivityNames <- fileToDataTable(
  file.path(pathIn, "activity_labels.txt")
)
  1. Merge the training and the test sets

Concatenate the data tables.

# Stack the training rows on top of the test rows for every table, always in
# that order, so the rows still line up when the tables are bound column-wise.
dtSubject <- rbind(dtSubjectTrain, dtSubjectTest)
setnames(dtSubject, old = "V1", new = "subject")

dtActivity <- rbind(dtActivityTrain, dtActivityTest)
setnames(dtActivity, old = "V1", new = "activityNum")

dt <- rbind(dtTrain, dtTest)

Merge columns.

# Put the subject and activityNum columns side by side; note that dtSubject is
# overwritten and holds both columns from here on.
dtSubject <- cbind(dtSubject, dtActivity)
# Prepend those identifier columns to the measurement columns.
dt <- cbind(dtSubject, dt)

Set key.

# Key dt on the identifier columns; key(dt) is reused later to select columns.
setkeyv(dt, c("subject", "activityNum"))
  1. Extract only the mean and standard deviation

Use the features.txt table read earlier (dtFeatures). It tells which variables in dt are measurements for the mean and standard deviation.

# Give the two columns of the feature table meaningful names.
setnames(
  dtFeatures,
  old = names(dtFeatures),
  new = c("featureNum", "featureName")
)

Subset only measurements for the mean and standard deviation.

# Keep only the features whose name contains a literal "mean()" or "std()".
dtFeatures <- dtFeatures[
  grepl(pattern = "mean\\(\\)|std\\(\\)", x = featureName)
]

Convert the column numbers to a vector of variable names matching columns in dt.

# Column names in dt are "V" followed by the feature number ("V1", "V2", ...).
dtFeatures$featureCode <- paste0("V", dtFeatures$featureNum)
##    featureNum       featureName featureCode
## 1:          1 tBodyAcc-mean()-X          V1
## 2:          2 tBodyAcc-mean()-Y          V2
## 3:          3 tBodyAcc-mean()-Z          V3
## 4:          4  tBodyAcc-std()-X          V4
## 5:          5  tBodyAcc-std()-Y          V5
## 6:          6  tBodyAcc-std()-Z          V6
##  [1] "V1"   "V2"   "V3"   "V4"   "V5"   "V6"   "V41"  "V42"  "V43"  "V44" 
## [11] "V45"  "V46"  "V81"  "V82"  "V83"  "V84"  "V85"  "V86"  "V121" "V122"
## [21] "V123" "V124" "V125" "V126" "V161" "V162" "V163" "V164" "V165" "V166"
## [31] "V201" "V202" "V214" "V215" "V227" "V228" "V240" "V241" "V253" "V254"
## [41] "V266" "V267" "V268" "V269" "V270" "V271" "V345" "V346" "V347" "V348"
## [51] "V349" "V350" "V424" "V425" "V426" "V427" "V428" "V429" "V503" "V504"
## [61] "V516" "V517" "V529" "V530" "V542" "V543"

Subset these variables using variable names.

# Columns to keep: the key columns plus the selected measurement columns.
select <- c(key(dt), dtFeatures[["featureCode"]])
# with = FALSE makes `select` be read as a character vector of column names.
dt <- dt[, select, with = FALSE]
  1. Use descriptive activity names

Use the activity_labels.txt file. This will be used to add descriptive names to the activities.

# Name the activity lookup columns so the table can be joined on activityNum.
setnames(
  dtActivityNames,
  old = names(dtActivityNames),
  new = c("activityNum", "activityName")
)
  1. Appropriately label the data set with descriptive variable names.

Merge activity labels.

# Left join: every row of dt is kept and gains its activityName.
dt <- merge(
  x = dt,
  y = dtActivityNames,
  by = "activityNum",
  all.x = TRUE
)

Add activityName as a key.

# Extend the key with activityName so it is treated as an id column below.
setkeyv(dt, c("subject", "activityNum", "activityName"))

Melt the data table to reshape it from a short and wide format to a tall and narrow format.

# Reshape to long format: the key columns stay as identifiers and every other
# column becomes one (featureCode, value) row.
# The third positional argument of melt() is `measure.vars`, so the name of the
# new variable column must be passed by name; the merge on featureCode that
# follows depends on that column existing.
dt <- data.table(melt(dt, id.vars = key(dt), variable.name = "featureCode"))

Merge feature names.

# Left join on featureCode to attach the feature number and name to each row.
dt <- merge(
  x = dt,
  y = dtFeatures[, list(featureNum, featureCode, featureName)],
  by = "featureCode",
  all.x = TRUE
)

Create a new variable, activity that is equivalent to activityName as a factor class. Create a new variable, feature that is equivalent to featureName as a factor class.

# Factor copies of the two name columns.
dt$activity <- factor(dt[["activityName"]])
dt$feature <- factor(dt[["featureName"]])

Separate features from featureName using the helper function grepthis.

# Helper: flag the rows of the global `dt` whose `feature` value matches
# `regex`.
#
# regex: regular expression passed to grepl().
# Returns: a logical vector with one element per row of dt.
grepthis <- function (regex) {
  grepl(regex, dt$feature)
}
# Split each feature name into separate factor columns.
# Technique: x is a logical matrix with one column per category (matrix() fills
# column-wise, so column k is the k-th grepthis() result) and y is the column
# vector 1..n. x %*% y therefore gives, per row, the index of the category that
# matched, or 0 when none matched.
# NOTE(review): this assumes at most one pattern matches per row, and that every
# expected code actually occurs -- factor() assigns `labels` to the sorted
# distinct values present, so a missing code would shift the labels. Confirm.
## Features with 2 categories
n <- 2
y <- matrix(seq(1, n), nrow=n)
# Domain: name starts with "t" or "f". No NA label, so 0 is not expected here.
x <- matrix(c(grepthis("^t"), grepthis("^f")), ncol=nrow(y))
dt$featDomain <- factor(x %*% y, labels=c("Time", "Freq"))
# Instrument: name contains "Acc" or "Gyro".
x <- matrix(c(grepthis("Acc"), grepthis("Gyro")), ncol=nrow(y))
dt$featInstrument <- factor(x %*% y, labels=c("Accelerometer", "Gyroscope"))
# Acceleration component: code 0 (neither pattern) is labelled NA.
x <- matrix(c(grepthis("BodyAcc"), grepthis("GravityAcc")), ncol=nrow(y))
dt$featAcceleration <- factor(x %*% y, labels=c(NA, "Body", "Gravity"))
# In these patterns "()" is an empty regex group, not literal parentheses, so
# they match any name containing "mean" / "std".
x <- matrix(c(grepthis("mean()"), grepthis("std()")), ncol=nrow(y))
dt$featVariable <- factor(x %*% y, labels=c("Mean", "SD"))
## Features with 1 category
# Logical flags turned into factors: FALSE is labelled NA, TRUE gets the name.
dt$featJerk <- factor(grepthis("Jerk"), labels=c(NA, "Jerk"))
dt$featMagnitude <- factor(grepthis("Mag"), labels=c(NA, "Magnitude"))
## Features with 3 categories
n <- 3
y <- matrix(seq(1, n), nrow=n)
# Axis: code 0 (no "-X"/"-Y"/"-Z" in the name) is labelled NA.
x <- matrix(c(grepthis("-X"), grepthis("-Y"), grepthis("-Z")), ncol=nrow(y))
dt$featAxis <- factor(x %*% y, labels=c(NA, "X", "Y", "Z"))

Check to make sure all possible combinations of feature are accounted for by all possible combinations of the factor class variables.

# Sanity check: the number of distinct features must equal the number of
# distinct combinations of the derived feat* factors.
r1 <- nrow(dt[, .N, by = "feature"])
r2 <- nrow(dt[, .N, by = c(
  "featDomain", "featAcceleration", "featInstrument",
  "featJerk", "featMagnitude", "featVariable", "featAxis"
)])
r1 == r2
## [1] TRUE
  1. Create a new independent tidy data set

Create a data set with the average of each variable for each activity and each subject.

# Key on subject, activity and every feature component, then collapse each key
# combination to its row count and the mean of `value`.
setkeyv(dt, c(
  "subject", "activity",
  "featDomain", "featAcceleration", "featInstrument",
  "featJerk", "featMagnitude", "featVariable", "featAxis"
))
dtTidy <- dt[
  ,
  list(count = .N, average = mean(value)),
  by = key(dt)
]

Make codebook.

# Knit the codebook source, then convert the result to HTML.
# NOTE(review): knit() and markdownToHTML() are presumably from the knitr and
# markdown packages, neither of which is attached by the package-loading step
# in this listing -- confirm they are available when this runs.
# NOTE(review): `output = ""` and the "" input passed to markdownToHTML() look
# like an intermediate file name that was lost from this listing; as written
# the second call has no input file. Confirm the intended name.
knit("createCodebook.Rmd", output="", encoding="ISO8859-1", quiet = TRUE)
## [1] ""
markdownToHTML("", "codebook.html")