-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_outlying_techreps.R
102 lines (84 loc) · 3.17 KB
/
remove_outlying_techreps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Load libraries
library(dplyr)
library(data.table)
library(stringr)

# Input/output paths arrive on the command line so the script slots into
# pipelines: Rscript remove_outlying_techreps.R input.csv output.csv
file_input <- commandArgs(trailingOnly = TRUE)

# Read the input as a data.table (expects at least Name and Ct columns)
file <- fread(file_input[1])
# Per-group replicate statistics: keep every (Name, Ct) row and attach the
# group's SD and mean. mutate() keeps one row per replicate, which is what the
# per-point comparisons below need (a summarize() that returns multiple rows
# per group, as the original did, is deprecated in dplyr >= 1.1).
file <- file %>%
  select(Name, Ct) %>%
  group_by(Name) %>%
  mutate(SD_Ct = sd(Ct), Mean_Ct = mean(Ct)) %>%
  ungroup()

# Compared to the rest of the data, which sets of technical replicates have
# abnormally high SDs? Use the upper bound of a one-sided 95% interval on the
# mean of the observed SDs as the cutoff.
# NOTE(review): unique() collapses groups that happen to share an identical SD
# into a single observation; one SD per group may be intended — confirm.
SDs <- unique(file$SD_Ct)
# na.rm guards against single-replicate groups, whose sd() is NA and would
# otherwise make the cutoff NA and break every later comparison.
SDs_mean <- mean(SDs, na.rm = TRUE)
SDs_SD <- sd(SDs, na.rm = TRUE)
n <- sum(!is.na(SDs))
error <- qnorm(0.975) * SDs_SD / sqrt(n)
cutoff <- SDs_mean + error
# Failsafe in case the margin is very low: we do not want to exclude decent
# samples, so never let the cutoff drop below 0.3 Ct.
if (cutoff < 0.3) {
  cutoff <- 0.3
}
# Flag the groups of technical replicates whose SD exceeds the cutoff
file <- file %>% mutate(high_SD = SD_Ct > cutoff)

# To spot the outlier in a flagged triplet, compare each data point to the
# other two and drop the point whose difference is > cutoff in both
# comparisons. If all pairwise differences exceed the cutoff, the entire
# sample is dropped.
test <- file %>% filter(high_SD == TRUE)
# Placeholder logical column, filled in per group below
test$drop <- NA

for (x in unique(test$Name)) {
  values <- test %>% filter(Name == x) %>% ungroup() %>% select(Ct) %>% unlist()
  if (length(values) == 3) {
    comp1 <- abs(values[1] - values[2])
    comp2 <- abs(values[1] - values[3])
    comp3 <- abs(values[2] - values[3])
    # Outputs (FALSE = keep, TRUE = drop). The original stacked `if`s were not
    # mutually exclusive and silently overwrote `output` when two comparisons
    # were both below the cutoff; this else-if chain is ordered to reproduce
    # that effective last-write-wins precedence (comp3, then comp2, then comp1)
    # while making the exclusivity explicit.
    if (comp3 < cutoff) {
      # 2 and 3 agree, so 1 caused the high SD
      output <- c(TRUE, FALSE, FALSE)
    } else if (comp2 < cutoff) {
      # 1 and 3 agree, so 2 caused the high SD
      output <- c(FALSE, TRUE, FALSE)
    } else if (comp1 < cutoff) {
      # 1 and 2 agree, so 3 caused the high SD
      output <- c(FALSE, FALSE, TRUE)
    } else {
      # No pair agrees: drop the entire sample
      output <- c(TRUE, TRUE, TRUE)
    }
  } else {
    # Not a full triplicate: the outlier cannot be resolved, so drop the whole
    # biological replicate. Covering every non-3 count also fixes the original
    # bug where a group of e.g. 4 replicates fell through all branches and
    # silently reused `output` from the previous loop iteration.
    output <- rep(TRUE, length(values))
  }
  # Write the flags straight back onto the matching rows (filter() preserves
  # row order, so positions line up). This replaces the original full_join()
  # round-trip that produced duplicate rows with NA flags.
  test$drop[test$Name == x] <- output
}
# The join-based flagging can leave duplicated rows whose drop flag is NA;
# keep only rows carrying a real flag. (A no-op if no NA rows were produced.)
test <- test %>% filter(!is.na(drop))

# Attach the drop flag to the full table; rows from groups that were never
# flagged as high-SD come back with drop = NA here...
file <- left_join(file, test, by = c("Name", "Ct", "Mean_Ct", "SD_Ct", "high_SD"))
# ...which means "keep", so convert NA to FALSE
file <- file %>% mutate(drop = coalesce(drop, FALSE))
# Only keep the replicates that were not flagged for dropping
file <- file %>% filter(drop == FALSE)
# Reduce to the original Name/Ct columns for output (explicit names instead
# of positional [, 1:2])
file <- file %>% select(Name, Ct)
# Write the cleaned table
fwrite(file, file_input[2])

# Also write the flagged rows next to the main output.
# NOTE(review): this includes every high-SD row with its drop flag, not only
# the rows that were actually dropped — confirm that is the intent.
file_name <- basename(file_input[2])
directory <- dirname(file_input[2])
dropped_outfile <- file.path(directory, paste0("DROPPED_", file_name))
fwrite(test, dropped_outfile)