-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo.R
executable file
·287 lines (230 loc) · 9.82 KB
/
demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(ccaPP)
library(parallel)
## Pipelines
## Flow of data
## r vs python (FP vs OOP for data programming)
## Shapes of data
## - data entry
## - modelling
## - plotting
## - tabulating
## EHR data entry
## Problem - How has the way that doctors code diagnoses of medical conditions
## changed over time?
## - New regulations
## - New definitions
## - "drift"
## Why?
## - improve research
## - make better comparisons between studies
## - understand the effects of policy changes
## data:
## 1. All clinical events in a domain
load("data/hf_incidence.rda")
View(hf_incidence)
## merge with clinical entity names
master <- read.delim("data/medical.txt")
a <- inner_join(hf_incidence, master) # same as base::merge
View(a)
## 2. Frequencies of all clinical codes used per year
load("data/hf_frequencies.rda")
View(hf_incidence_freqs)
## Calculate, evenness, richness and diversity of clinical codes:
conditions <- c("hf", "learning_disability")
## load data into new environment. This keeps your top-level namespace clean
## For loops are fine in this situation because you have side effects (i.e. you are performing I/O)
## If there are no side effects, use lapply or similar
frequencies <- new.env()
for(condition in conditions){
load(paste0("data/", condition, "_frequencies.rda"), envir = frequencies)
}
## forget <- do.call("rbind", lapply(frequencies, function(x) x))
## Use bind_rows - it is cleaner and quicker
all_freqs <- bind_rows(lapply(frequencies, function(x) x))
#' calculates the Shannon index of diversity/entropy
#' I think I stole this from the "vegan" package
shannon <- function(x)
{
x <- drop(as.matrix(x))
if (length(dim(x)) > 1) {
total <- apply(x, 1, sum)
x <- sweep(x, 1, total, "/")
}
else {
x <- x/sum(x)
}
x <- -x * log(x)
if (length(dim(x)) > 1)
H <- apply(x, sum, na.rm = TRUE)
else H <- sum(x, na.rm = TRUE)
H
}
# get indices for all years and conditions:
H_all <- all_freqs %>%
group_by(year, condition) %>% # set these as groups
summarise(H = shannon(count), # calculate your statistics for all groups
richness = n(),
evenness = H / log(richness)) %>%
ungroup() %>%
gather(index, measure, H:evenness) # equivalent to reshape2::melt, need to ungroup first
## this code is exactly equivalent to (without the pipe operator):
## Notice how the previous way, you are reading the flow of comands in a linear way.
## here you need to read the code from the inside out in order to understand what is going on.
gather(
ungroup(
summarise(
group_by(
all_freqs, year, condition),
H = shannon(count),
richness = n(),
evenness = H / log(richness))),
index, measure, H:evenness)
## Best alternative in base R I could come up with!
## cf with line 87 code!
H_all2 <- aggregate(all_freqs$count,
list(year = all_freqs$year, condition = all_freqs$condition),
FUN = shannon)
names(H_all2)[names(H_all2) == "x"] <- "H"
H_all2$richness <- aggregate(all_freqs$count,
list(all_freqs$year,
all_freqs$condition),
FUN = length)$x
H_all2$evenness <- H_all2$H / log(H_all2$richness)
H_all2 <- reshape(H_all2, v.names = "measure",
varying = c("H", "richness", "evenness"),
direction = "long")
H_all2$index <- "H"
H_all2$index[ H_all2$time == 2] <- "richness"
H_all2$index[ H_all2$time == 3] <- "evenness"
H_all2$index <- factor(H_all2$index, levels = c("H", "richness", "evenness"))
H_all2$time <- NULL
H_all2$id <- NULL
H_all2 <- H_all2[order( H_all2$index, H_all2$year),
c("year", "condition", "index", "measure")]
all(H_all == H_all2) # return the same values
## compare the two methods using microbenchmark:
library(microbenchmark)
microbenchmark(dplyr = {H_all <- all_freqs %>%
group_by(year, condition) %>%
summarise(H = shannon(count),
richness = n(),
evenness = H / log(richness)) %>%
ungroup() %>%
gather(index, measure, H:evenness)
}, base = {H_all2 <- aggregate(all_freqs$count,
list(year = all_freqs$year, condition = all_freqs$condition),
FUN = shannon)
names(H_all2)[names(H_all2) == "x"] <- "H"
H_all2$richness <- aggregate(all_freqs$count,
list(all_freqs$year, all_freqs$condition),
FUN = length)$x
H_all2$evenness <- H_all2$H / log(H_all2$richness)
H_all2 <- reshape(H_all2, v.names = "measure",
varying = c("H", "richness", "evenness"),
direction = "long")
H_all2$index <- "H"
H_all2$index[ H_all2$time == 2] <- "richness"
H_all2$index[ H_all2$time == 3] <- "evenness"
H_all2$index <- factor(H_all2$index, levels = c("H", "richness", "evenness"))
H_all2$time <- NULL
H_all2$id <- NULL
H_all2 <- H_all2[order( H_all2$index, H_all2$year),
c("year", "condition", "index", "measure")]}, times = 500)
# dplyr is 10 x faster:
# Unit: milliseconds
# expr min lq median uq max neval
# dplyr 2.396384 2.528917 2.658345 2.837134 5.146062 500
# base 20.728356 21.163055 21.548402 23.728467 75.431676 500
## plot in ggplot2
p <- ggplot(H_all, aes(x = year, y = measure))
p + geom_line() + # line plot
facet_grid(index ~ condition, scale = "free_y") + # indices on the vertical, conditions on the horizontal facets
expand_limits(y = 0) + # include 0 in all y axes
geom_vline(x = 2004, colour = "red", linetype = "longdash") + # pay for performance introduced in 2004
theme_bw() + # more minimal theme than the default
labs(title = "Clinical code usage")
#' get robust cannononical correlation for two dataframes using ccaPP package
get_cc <- function(mat1, mat2, ...){ ## dots allow you to pass optional arguments to ccaProj
x <- as.matrix(mat1)
y <- as.matrix(mat2)
ccaProj(x, y,standardize = FALSE, ...) ## calculate correlation
}
for(condition in conditions){
## get a database of medcode frequencies by year and practice
load(paste0("data/", condition, "_incidence.rda"), envir = frequencies)
## Build a matrix of frequencies n x m
## n is the different GP practices
## m is the different clinical codes
freq_all <- get(paste0(condition, "_incidence"), envir = frequencies) %>%
mutate(practid = str_sub(as.character(patid), -3, -1), # extract practice number from patient id
medcode = factor(medcode)) %>%
group_by(medcode, year, practid) %>%
summarise(count = n()) %>% # count rows for each code,year,practice combination
ungroup() %>%
group_by(year, practid) %>% # regroup
mutate(total = sum(count),
freq = count / total) %>% # calculate frequencies for each year and practice
select(-count, -total) %>%
ungroup() %>%
spread(medcode, freq) %>% # put into wide format
mutate_each(funs(replace(., which(is.na(.)), 0))) # replace NAs with zeros
## split into a list of matrices, one for each year
freq_mats <- lapply(2000:2013, function(x){
freq_all %>%
filter(year == x) %>%
select(-year) # remove year column
})
## include only those practices that are common across years
## Reduce makes a function that only acccepts 2 arguments into a general vectorised function over a list
common_practices <- Reduce(intersect, lapply(freq_mats, function(x) x$practid))
freq_mats <- lapply(freq_mats, function(x){
x %>%
filter(practid %in% common_practices) %>%
select(-practid)
})
names(freq_mats) <- 2000:2013
## combinations for comparisons of matrices:
combinations <- lapply(list(
# year-on-year
c(2000, 2001),
c(2001, 2002),
c(2002, 2003),
c(2003, 2004),
c(2004, 2005),
c(2005, 2006),
c(2006, 2007),
c(2007, 2008),
c(2008, 2009),
c(2009, 2010),
c(2010, 2011),
c(2011, 2012),
c(2012, 2013)), as.character)
## get correlations for all combinations of frequency matrices
assign(paste0(condition, "_year_cors"),
mclapply(combinations, function(this_year){
mat1 <- freq_mats[[this_year[1]]] # get the matrices for that year
mat2 <- freq_mats[[this_year[2]]]
get_cc(mat1, mat2,
method = "spearman")$cor # calculate the correlation
}, mc.cores = 2))
}
## extract the data and bind for all years
all_corrs <- bind_rows(lapply(conditions, function(condition){
data_frame(year = 2001:2013, # for each condition, get the data for each year
condition = condition,
index = "correlation",
measure = unlist(get(paste0(condition, "_year_cors"))))
}))
## repeat of above plot with extra correlation data
all_indices <- bind_rows(H_all, all_corrs) # stick the dataframes on top of each other
p <- ggplot(all_indices, aes(x = year, y = measure))
p + geom_line() +
facet_grid(index ~ condition, scale = "free_y") +
expand_limits(y = 0) +
geom_vline(x = 2004, colour = "red", linetype = "longdash") +
theme_bw() +
labs(title = "Clinical code usage")