forked from zhangyuqing/ComBat-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ComBat_seq.R
204 lines (178 loc) · 8.97 KB
/
ComBat_seq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#' Adjust for batch effects using an empirical Bayes framework in RNA-seq raw counts
#'
#' ComBat_seq is an extension to the ComBat method using Negative Binomial model.
#'
#' @param counts Raw count matrix from genomic studies (dimensions gene x sample)
#' @param batch Batch covariate (only one batch allowed)
#' @param group Vector / factor for condition of interest
#' @param covar_mod Model matrix for other covariates to include in linear model besides batch and condition of interest
#' @param full_mod Boolean, if TRUE include condition of interest in model
#' @param shrink Boolean, whether to apply empirical Bayes estimation on parameters
#' @param shrink.disp Boolean, whether to apply empirical Bayes estimation on dispersion
#' @param gene.subset.n Number of genes to use in empirical Bayes estimation, only useful when shrink = TRUE
#' @return data A probe x sample count matrix, adjusted for batch effects.
#'
#' @examples
#'
#'
#'
#' @export
#'
ComBat_seq <- function(counts, batch, group=NULL, covar_mod=NULL, full_mod=TRUE,
shrink=FALSE, shrink.disp=FALSE, gene.subset.n=NULL){
######## Preparation ########
counts <- as.matrix(counts)
## Does not support 1 sample per batch yet
batch <- as.factor(batch)
if(any(table(batch)<=1)){
stop("ComBat-seq doesn't support 1 sample per batch yet")
}
## Remove genes with only 0 counts in any batch
keep_lst <- lapply(levels(batch), function(b){
which(apply(counts[, batch==b], 1, function(x){!all(x==0)}))
})
keep <- Reduce(intersect, keep_lst)
rm <- setdiff(1:nrow(counts), keep)
countsOri <- counts
counts <- counts[keep, ]
# require bioconductor 3.7, edgeR 3.22.1
dge_obj <- DGEList(counts=counts)
## Prepare characteristics on batches
n_batch <- nlevels(batch) # number of batches
batches_ind <- lapply(1:n_batch, function(i){which(batch==levels(batch)[i])}) # list of samples in each batch
n_batches <- sapply(batches_ind, length)
#if(any(n_batches==1)){mean_only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")}
n_sample <- sum(n_batches)
cat("Found",n_batch,'batches\n')
## Make design matrix
# batch
batchmod <- model.matrix(~-1+batch) # colnames: levels(batch)
# covariate
group <- as.factor(group)
if(full_mod & nlevels(group)>1){
cat("Using full model in ComBat-seq.\n")
mod <- model.matrix(~group)
}else{
cat("Using null model in ComBat-seq.\n")
mod <- model.matrix(~1, data=as.data.frame(t(counts)))
}
# drop intercept in covariate model
if(!is.null(covar_mod)){
if(is.data.frame(covar_mod)){
covar_mod <- do.call(cbind, lapply(1:ncol(covar_mod), function(i){model.matrix(~covar_mod[,i])}))
}
covar_mod <- covar_mod[, !apply(covar_mod, 2, function(x){all(x==1)})]
}
# bind with biological condition of interest
mod <- cbind(mod, covar_mod)
# combine
design <- cbind(batchmod, mod)
## Check for intercept in covariates, and drop if present
check <- apply(design, 2, function(x) all(x == 1))
#if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator
design <- as.matrix(design[,!check])
cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n')
## Check if the design is confounded
if(qr(design)$rank<ncol(design)){
#if(ncol(design)<=(n_batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")}
if(ncol(design)==(n_batch+1)){stop("The covariate is confounded with batch! Remove the covariate and rerun ComBat-Seq")}
if(ncol(design)>(n_batch+1)){
if((qr(design[,-c(1:n_batch)])$rank<ncol(design[,-c(1:n_batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded')
}else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat-Seq")}}
}
## Check for missing values in count matrix
NAs = any(is.na(counts))
if(NAs){cat(c('Found',sum(is.na(counts)),'Missing Data Values\n'),sep=' ')}
######## Estimate gene-wise dispersions within each batch ########
cat("Estimating dispersions\n")
## Estimate common dispersion within each batch as an initial value
disp_common <- sapply(1:n_batch, function(i){
if((n_batches[i] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[i]], ])$rank < ncol(mod)){
# not enough residual degree of freedom
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=NULL, subset=nrow(counts)))
}else{
return(estimateGLMCommonDisp(counts[, batches_ind[[i]]], design=mod[batches_ind[[i]], ], subset=nrow(counts)))
}
})
## Estimate gene-wise dispersion within each batch
genewise_disp_lst <- lapply(1:n_batch, function(j){
if((n_batches[j] <= ncol(design)-ncol(batchmod)+1) | qr(mod[batches_ind[[j]], ])$rank < ncol(mod)){
# not enough residual degrees of freedom - use the common dispersion
return(rep(disp_common[j], nrow(counts)))
}else{
return(estimateGLMTagwiseDisp(counts[, batches_ind[[j]]], design=mod[batches_ind[[j]], ],
dispersion=disp_common[j], prior.df=0))
}
})
names(genewise_disp_lst) <- paste0('batch', levels(batch))
## construct dispersion matrix
phi_matrix <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(k in 1:n_batch){
phi_matrix[, batches_ind[[k]]] <- vec2mat(genewise_disp_lst[[k]], n_batches[k])
}
######## Estimate parameters from NB GLM ########
cat("Fitting the GLM model\n")
glm_f <- glmFit(dge_obj, design=design, dispersion=phi_matrix, prior.count=1e-4) #no intercept - nonEstimable; compute offset (library sizes) within function
alpha_g <- glm_f$coefficients[, 1:n_batch] %*% as.matrix(n_batches/n_sample) #compute intercept as batch-size-weighted average from batches
new_offset <- t(vec2mat(getOffset(dge_obj), nrow(counts))) + # original offset - sample (library) size
vec2mat(alpha_g, ncol(counts)) # new offset - gene background expression # getOffset(dge_obj) is the same as log(dge_obj$samples$lib.size)
glm_f2 <- glmFit.default(dge_obj$counts, design=design, dispersion=phi_matrix, offset=new_offset, prior.count=1e-4)
gamma_hat <- glm_f2$coefficients[, 1:n_batch]
mu_hat <- glm_f2$fitted.values
phi_hat <- do.call(cbind, genewise_disp_lst)
######## In each batch, compute posterior estimation through Monte-Carlo integration ########
if(shrink){
cat("Apply shrinkage - computing posterior estimates for parameters\n")
mcint_fun <- monte_carlo_int_NB
monte_carlo_res <- lapply(1:n_batch, function(ii){
if(ii==1){
mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)
}else{
invisible(capture.output(mcres <- mcint_fun(dat=counts[, batches_ind[[ii]]], mu=mu_hat[, batches_ind[[ii]]],
gamma=gamma_hat[, ii], phi=phi_hat[, ii], gene.subset.n=gene.subset.n)))
}
return(mcres)
})
names(monte_carlo_res) <- paste0('batch', levels(batch))
gamma_star_mat <- lapply(monte_carlo_res, function(res){res$gamma_star})
gamma_star_mat <- do.call(cbind, gamma_star_mat)
phi_star_mat <- lapply(monte_carlo_res, function(res){res$phi_star})
phi_star_mat <- do.call(cbind, phi_star_mat)
if(!shrink.disp){
cat("Apply shrinkage to mean only\n")
phi_star_mat <- phi_hat
}
}else{
cat("Shrinkage off - using GLM estimates for parameters\n")
gamma_star_mat <- gamma_hat
phi_star_mat <- phi_hat
}
######## Obtain adjusted batch-free distribution ########
mu_star <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(jj in 1:n_batch){
mu_star[, batches_ind[[jj]]] <- exp(log(mu_hat[, batches_ind[[jj]]])-vec2mat(gamma_star_mat[, jj], n_batches[jj]))
}
phi_star <- rowMeans(phi_star_mat)
######## Adjust the data ########
cat("Adjusting the data\n")
adjust_counts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts))
for(kk in 1:n_batch){
counts_sub <- counts[, batches_ind[[kk]]]
old_mu <- mu_hat[, batches_ind[[kk]]]
old_phi <- phi_hat[, kk]
new_mu <- mu_star[, batches_ind[[kk]]]
new_phi <- phi_star
adjust_counts[, batches_ind[[kk]]] <- match_quantiles(counts_sub=counts_sub,
old_mu=old_mu, old_phi=old_phi,
new_mu=new_mu, new_phi=new_phi)
}
#dimnames(adjust_counts) <- dimnames(counts)
#return(adjust_counts)
## Add back genes with only 0 counts in any batch (so that dimensions won't change)
adjust_counts_whole <- matrix(NA, nrow=nrow(countsOri), ncol=ncol(countsOri))
dimnames(adjust_counts_whole) <- dimnames(countsOri)
adjust_counts_whole[keep, ] <- adjust_counts
adjust_counts_whole[rm, ] <- countsOri[rm, ]
return(adjust_counts_whole)
}