-
Notifications
You must be signed in to change notification settings - Fork 42
/
commonFunctions.Rmd
223 lines (190 loc) · 7.99 KB
/
commonFunctions.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
---
title: "Common Functions"
author: "João Neto"
date: October 2014
output:
html_document:
toc: true
toc_depth: 3
fig_width: 6
fig_height: 6
cache: yes
---
NA removal
----------
```{r}
l5 <- c(1,2,NA,4,NA,6)
l5a <- l5[!is.na(l5)]
l5a
l5b <- c("a","b",NA,"d",NA,"f")
good.pairs <- complete.cases(l5,l5b) # if we need to join both lists only with complete pairs...
data.frame(a=l5[good.pairs],b=l5b[good.pairs]) # another example: airquality is a data frame
airquality[4,][c(1,4,3)] # shows cols 1, 4 & 3 from the 4th row
airquality[complete.cases(airquality),][1:6,] # to show all non NA rows for the first 6 columns
```
Sample function
---------------
Function **sample** takes a sample of the specified size from the elements of x using either with or without replacement.
```{r}
coin <- c("Heads","Tails")
sample(coin,8,rep=T)
dice.events <- 1:6 # dice possible results
ten.throws <- sample(dice.events, 10, replace=T) # balanced dice
ten.throws
ten.more.throws <- sample(dice.events, 10, replace=T, prob=c(.1,.1,.1,.1,.1,.5)) # unbalanced dice (50% it falls 6)
ten.more.throws
# Without replacement, we can create permutations
xs <- 1:10
sample(xs,10)
sample(xs,10)
# Eg: we have 50 pacients that we want to separate into two equal size groups (Control or Treatment)
pacients <- data.frame(patient = 1:50,
age = rnorm(100, mean = 60, sd = 12))
head(pacients)
xs <- rep(c("Control", "Treat"),25)
xs
xs <- sample(xs,50) # a random permutation to remove selection bias
xs
pacients$group <- as.factor(xs) # add new column with required information
head(pacients)
# Eg: simple bootstrap example (from: www.sigmafield.org/2010/05/23/r-function-of-the-day-sample)
rn <- rnorm(1000, 10) # random sample from a normal distribution
# making 1000 samples and keeping the means
sample.means <- replicate(1000, mean(sample(rn, replace = TRUE)))
sample.means[1:20]
quantile(sample.means, probs = c(0.025, 0.975)) # produces sample quantiles corresponding to the given probabilities
# Compare this to the standard parametric technique.
t.test(rn)$conf.int
```
split-apply-combine
---------------
References:
+ [http://www.sigmafield.org/2009/09/20/r-function-of-the-day-tapply](http://www.sigmafield.org/2009/09/20/r-function-of-the-day-tapply)
+ [http://stackoverflow.com/questions/11562656/averaging-column-values-for-specific-sections-of-data-corresponding-to-other-col/11562850#11562850](http://stackoverflow.com/questions/11562656/averaging-column-values-for-specific-sections-of-data-corresponding-to-other-col/11562850#11562850)
check also: [http://www.jstatsoft.org/v40/i01/paper](http://www.jstatsoft.org/v40/i01/paper)
We use **tapply** when:
* A dataset that can be broken up into groups
* We want to break it up into groups
* Within each group, we want to apply a function
The tapply function is useful when we need to break up a vector into groups defined by some classifying factor, compute a function on the subsets, and return the results in a convenient form.
```{r}
## generate data for medical example
medical.example <- data.frame(patient = 1:100,
age = rnorm(100, mean = 60, sd = 12),
treatment = gl(2, 50, labels = c("Treatment", "Control")))
# we want to break the dataset by Treatment, and then find the age's mean
tapply(medical.example$age, medical.example$treatment, mean)
# Another eg
baseball.example <- data.frame(team = gl(5, 5,
labels = paste("Team", LETTERS[1:5])),
player = sample(letters, 25), batting.average = runif(25, .200, .400))
# we want to break the dataset by Team, and then find the max batting avg
tapply(baseball.example$batting.average, baseball.example$team, max)
# an artificial eg from R's help:
n <- 17
fac <- factor(rep(1:3, length = n), levels = 1:5)
fac
table(fac)
# this next function, separates all 1:n value into the factors given by fac,
# and then sum each factor
tapply(1:n, fac, sum)
# Let's explicitly sum the first factor just to confirm:
fac == 1
(1:n)[fac==1]
sum((1:n)[fac==1])
# we can apply any unary function over each factor
tapply(1:n, fac, range)
tapply(1:n, fac, quantile)
```
Other possibilities:
```{r}
df <- data.frame(dive=factor(sample(c("dive1","dive2"),10,replace=TRUE)),
speed=runif(10))
df
# 'by()' takes in vectors and applies a function to them
res.by <- by( df$speed, df$dive, mean)
library(taRifx)
as.data.frame(res.by)
# takes in data.frames, outputs data.frames, and uses a formula interface.
aggregate( speed ~ dive, df, mean )
# Check also library plyr which facilitates this type of data frama manipulation
```
Rle function
---------------
Function **rle** computes the lengths and values of runs of equal values in a vector.
Function **inverse.rle** does the reverse operation.
```{r}
coin <- c("H","T")
experiment <- sample(coin,100,rep=T)
experiment
experiment.rle <- rle(experiment)
max.seq <- max(experiment.rle$lengths) # the maximum sequence of equal values
max.seq
experiment.rle$values[which(experiment.rle$lengths == max.seq)] # was it heads or tails?
# we can compute both max sequences of heads and tails by using tapply:
tapply(experiment.rle$lengths, experiment.rle$values, max)
inverse.rle(experiment.rle) # recomputes the initial sequence
```
Split/Cut functions
--------------
Function **split** divides the data in the vector x into the groups defined by f. The replacement forms replace values corresponding to such a division.
Function **cut** divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on.
```{r}
split(1:10, 1:2)
split(1:10, 1:2)[[1]] # or: split(1:10, 1:2)$"1"
ma <- cbind(x = 1:10, y = (-4:5)^2)
ma
split(ma, colnames(ma))
split(ma, col(ma))
split(ma, col(ma))[[1]]
ma[,1] # easier alternative
ma[,"x"] # another alternative
xs <- 1:20
split(xs,factor(c("yes","no","yes","yes"))) # every four elements are attached to the given level sequence "yes","no","yes","yes"
is <- split(xs,factor(c("yes","no","yes","yes")))[["yes"]]
xs[is]
# Use of cut (from: www.sigmafield.org/2009/09/23/r-function-of-the-day-cut)
# generate some data
clinical.trial <-
data.frame(patient = 1:100, age = rnorm(100, mean = 60, sd = 8),
year = sample(paste("19", 85:99, sep = ""), 100, replace = T))
# Now, we will use the cut function to make age a factor, ie,
# age will be a categorical variable
c1 <- cut(clinical.trial$age, breaks = 4)
table(c1)
# year.enroll is a factor, so must convert to numeric first!
years <- as.numeric(as.character(clinical.trial$year))
years
c2 <- cut(years, breaks = 3)
table(c2)
# The previous intervals where determined by R.
# We can force our own (in this case, I'll use 'seq')
c3 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10))
table(c3)
# the interval size can also be variable:
age.cat <- function(x, lower = 0, upper, by = 10,
sep = "-", above.char = "+") {
labs <- c(paste(seq(lower, upper - by, by = by),
seq(lower + by - 1, upper - 1, by = by),
sep = sep),
paste(upper, above.char, sep = ""))
cut(floor(x), breaks = c(seq(lower, upper, by = by), Inf),
right = FALSE, labels = labs)
}
# This function categorizes age in a fairly flexible way. The first assignment to labs inside the function creates a vector of labels. Then, the cut function is called to do the work, with the custom labels as an argument.
# only specifying an upper bound, uses 0 as lower bound, and breaks up categories by 10
table(age.cat(clinical.trial$age, upper = 70))
# now specifying a lower bound
table(age.cat(clinical.trial$age, lower = 30, upper = 70))
# now specifying a lower bound AND the "by" argument
table(age.cat(clinical.trial$age, lower = 30, upper = 70, by = 5))
```
Subsetting
-----------
`subset` returns subsets of vectors, matrices or data frames which meet conditions.
```{r}
head(airquality)
head(subset(airquality, Temp > 80, select = c(Ozone, Temp)))
head(subset(airquality, Day == 1, select = -Temp))
head(subset(airquality, select = Ozone:Wind))
```