-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnormalise.R
125 lines (103 loc) · 3.96 KB
/
normalise.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Normalise Digital Gene Expression Matrix
#'
#' \code{normaliseDGE} normalise a gene by cell matrix
#'
#'
#' @param dge matrix; a digital gene expression matrix containing count data (columns = cells, rows = genes), can be a sparce Matrix object.
#'
#' @param center logical; if true genes are centered to mean of 0
#'
#' @param scale logical; if TRUE genes are scaled to unit variance
#'
#' @param threshold integer; any normalised values larger than this will be rounded down
#'
#' @param gene_subset real; what fraction of the genes do you want to keep (by mean) e.g. 0.5 is top half most expressed
#'
#' @param min_library_size real; only cells with library size equal or greater to this will be kept
#'
#' @param verbose logical; if TRUE the function will print diagnostic graphs along the way
#'
#' @param outliers logical; calculate max value per gene and cell to assess threshold position
#'
#' @param transformation character; either sqrt (default) or asinh
#'
#' @return A matrix of read counts
#'
#' @examples
#' # normalised_counts <- normalise_dge(dge)
#' @export
#' @import data.table Matrix
#'
normaliseDGE <- function(dge, verbose=FALSE, center=TRUE, scale=TRUE, outliers = FALSE, threshold = 5, min_library_size=200, gene_subset=0.5, transformation="sqrt"){
library_size <- colSums(dge)
lib_size_correction_factor <- library_size / median(sqrt(library_size))
dge <- t(t(dge) / lib_size_correction_factor) # sweep(dge, 2, , "/")
# subset cells and genes
cell_subset <- library_size >= min_library_size
gene_mean <- rowMeans(dge[, cell_subset])
gene_subset <- data.table(gene_mean, names(gene_mean))[order(-gene_mean)][1:round(length(gene_mean) * gene_subset)]$V2
dge <- dge[gene_subset, cell_subset]
# multiply & sqrt
if (transformation=="asinh"){
dge <- asinh(10000 * dge)
} else{
dge <- sqrt(10000 * dge)
}
# transpose so cells are rows
dge <- t(dge)
sample_dge <- function(dge){
if(prod(dim(dge))>1e6){
tmp <- sample(dge, 1e6)
} else if(prod(dim(dge))<1e6 & class(dge)=="dgCMatrix"){
tmp <- as.matrix(dge)
} else{
tmp <- dge
}
return(tmp)
}
# check everything looks ok
if(verbose){
str(dge)
hist(sample_dge(dge), breaks = 2000, ylim = c(0, 1e3), xlim=c(0,50), xlab="Expression", main="Histogram of data post cell-wise (& sqrt) normalisation")
}
# scale to unit variance all columns (genes) - don't center so 0 values remain exactly 0 and all data >= 0
if (center==TRUE){
dge <- scale(dge, center = TRUE, scale = TRUE)
} else{
colSdColMeans <- function(x, na.rm=TRUE) {
if (na.rm) {
n <- colSums(!is.na(x)) # thanks @flodel
} else {
n <- nrow(x)
}
colVar <- colMeans(x*x, na.rm=na.rm) - (colMeans(x, na.rm=na.rm))^2
return(sqrt(colVar * n/(n-1)))
}
dge <- t(t(dge) / colSdColMeans(dge)) #, na.rm=TRUE
}
dge <- dge[, !is.na(colSums(dge))] # remove NA genes (sd calculation fails if all 0s)
if(verbose){
hist(sample_dge(dge), breaks = 2000, ylim = c(0, 1e4), main="Histogram of data post gene-wise normalisation")
}
# make sure no NA values left
stopifnot(sum(is.na(dge)) == 0)
# assess degree of outliers
if(outliers){
max_by_cell <- apply(dge, 1, max)
max_by_gene <- apply(dge, 2, max)
plot(max_by_gene, main="Maximum expression value per gene", ylab="Maximum Expression", xlab="Gene Index (by mean expression)")
abline(h=threshold, col="red")
plot(max_by_cell[], main="Maximum expression value per cell", ylab="Maximum Expression", xlab="Cell Index (by library size)")
abline(h=threshold, col="red")
}
# round down large values which could be due to small library size and normalisation OR low gene wise expression and scaling
sum(dge > threshold)
dge[dge > threshold] <- threshold
dge[dge < (-threshold)] <- (-threshold)
if(verbose){
hist(sample_dge(dge), breaks = 500, ylim = c(0, 1e3), main="Histogram of final data", xlab="Expression")
# check everything looks ok
str(dge)
}
return(dge)
}