-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_outlying_bioreps.R
110 lines (92 loc) · 3.65 KB
/
remove_outlying_bioreps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#Load libraries
library(dplyr)
library(data.table)
library(tidyr)
library(tibble)
library(stringr)
# Take input/output file paths on the command line so the script can be
# chained in pipelines: Rscript remove_outlying_bioreps.R in.csv out.csv
file_input <- commandArgs(trailingOnly = TRUE)
# Read the qPCR table as a data.table
file <- fread(file_input[1])
file <- file %>% select(-c(Ct, SEM, most_stable_CT))
# Split by isoform so we can work out which one is the normalizer
ss <- file %>% filter(isoform == "ss")
iu <- file %>% filter(isoform == "iu")
# We can only remove outliers from the normalizED, not the normalizER
# (the normalizer has dCT == 0 in every row by construction).
# NOTE: the previous check summed dCT, which would wrongly classify a set of
# non-zero values that happen to cancel out as the normalizer; all() is exact.
if (all((ss %>% select(dCT) %>% unlist()) == 0)) {
  file <- iu
  other_file <- ss
} else {
  file <- ss
  other_file <- iu
}
# Per-timepoint mean and SD of dCT across biological replicates
file_summary <- file %>%
  group_by(time) %>%
  summarize(Mean_dCt = mean(dCT), SD_dCT = sd(dCT))
# If only 1 replicate is left, sd() returns NA; force that to 0
file_summary <- file_summary %>% mutate(SD_dCT = ifelse(is.na(SD_dCT), 0, SD_dCT))
# Join key is the only shared column; naming it silences the join message
file <- left_join(file, file_summary, by = "time")
# Compared to the rest of the data, which replicate groups have abnormally
# high SDs? One distinct SD per group goes into the reference distribution.
SDs <- file %>% ungroup() %>% select(SD_dCT) %>% distinct() %>% unname() %>% unlist()
SDs_mean <- mean(SDs)
SDs_SD <- sd(SDs)
n <- length(SDs)
# 95% confidence half-width on the mean of the group SDs
error <- qnorm(0.975) * SDs_SD / sqrt(n)
# Failsafe: sd() is NA when only one distinct SD exists, which would crash
# the comparison with "missing value where TRUE/FALSE needed"; and a very
# small error would exclude decent samples (mainly an issue for sets with
# many replicates, where the half-width is likely to be tiny)
if (is.na(error) || error < 0.2) {
  error <- 0.2
}
cutoff <- SDs_mean + error
# Flag replicate groups whose SD exceeds the cutoff
file <- file %>% mutate(high_SD = SD_dCT > cutoff)
# To spot the outlier in a flagged triplet, compare each point to the other
# two and drop the point whose difference exceeds the cutoff in BOTH
# comparisons. If all pairwise differences exceed the cutoff, drop the
# entire sample.
test <- file %>% filter(high_SD == TRUE)
test <- test %>% add_column(drop = NA)
test <- transform(test, drop = as.logical(drop))
# Only assess groups with exactly 3 replicates (no stats on 1 or 2 points)
filter_test <- test %>% group_by(time) %>% summarize(n = n()) %>% filter(n == 3) %>% select(time) %>% unlist %>% unname()
test <- test %>% filter(time %in% filter_test)
for (x in unique(test$time)) {
  values <- test %>% filter(time == x) %>% ungroup() %>% select(dCT) %>% unlist()
  print(values)  # debug: show the triplet being assessed
  comp1 <- abs(values[1] - values[2])
  comp2 <- abs(values[1] - values[3])
  comp3 <- abs(values[2] - values[3])
  # outputs (FALSE = keep, TRUE = drop)
  # NOTE(review): if more than one pairwise difference falls below the
  # cutoff, the later branch overwrites the earlier one (comp3 wins over
  # comp2 wins over comp1). Preserved as-is from the original logic.
  # if 1-2 < cutoff, then point 3 caused the high SD
  if (comp1 < cutoff) {
    output <- c(FALSE, FALSE, TRUE)
  }
  # if 1-3 < cutoff, then point 2 caused the high SD
  if (comp2 < cutoff) {
    output <- c(FALSE, TRUE, FALSE)
  }
  # if 2-3 < cutoff, then point 1 caused the high SD
  if (comp3 < cutoff) {
    output <- c(TRUE, FALSE, FALSE)
  }
  # if all differences exceed the cutoff, drop the whole sample
  if (comp1 >= cutoff & comp2 >= cutoff & comp3 >= cutoff) {
    output <- c(TRUE, TRUE, TRUE)
  }
  drop <- test %>% filter(time == x)
  # Assign by column NAME: the previous drop[, 9] = output relied on a
  # positional index that silently corrupts data if the layout changes
  drop$drop <- output
  # The assigned rows don't match the NA-drop originals, so full_join
  # appends them; the NA duplicates are filtered out after the loop
  test <- full_join(test, drop)
}
# The in-loop full_join duplicated rows with drop = NA; keep only the
# assigned copies
test <- test %>% filter(!is.na(drop))
# Join the assessed high-SD rows (test) back onto the full table (file)
file <- left_join(file, test)
# Rows that were never assessed have drop = NA: mark them as keep (FALSE).
# Address the column by NAME; the previous file[,9][is.na(file[,9])] = F
# relied on a fragile positional index
file$drop[is.na(file$drop)] <- FALSE
# Keep only rows not marked for dropping
file <- file %>% filter(drop == FALSE)
# Strip the working columns (means, SDs, flags) back to the original five
file <- file[, 1:5]
# Reattach the untouched normalizer isoform
file <- rbind(file, other_file)
# Write the cleaned table
fwrite(file, file_input[2])
# Also write the dropped/assessed rows alongside the main output
file_name <- basename(file_input[2])
directory <- dirname(file_input[2])
dropped_outfile <- paste0(directory, "/DROPPED_", file_name)
fwrite(test, dropped_outfile)