-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_outlying_techreps.R
102 lines (84 loc) · 3.17 KB
/
remove_outlying_techreps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Load libraries
library(dplyr)
library(data.table)
library(stringr)

# Input/output paths arrive on the command line so the script slots into
# pipelines: Rscript remove_outlying_techreps.R input.csv output.csv
file_input <- commandArgs(trailingOnly = TRUE)

# Read the input as a data.table (expects at least Name and Ct columns)
file <- fread(file_input[1])
# Per-group replicate statistics: keep every (Name, Ct) row and attach the
# group's SD and mean. mutate() keeps one row per replicate, which is what the
# per-point comparisons below need (a summarize() that returns multiple rows
# per group, as the original did, is deprecated in dplyr >= 1.1).
file <- file %>%
  select(Name, Ct) %>%
  group_by(Name) %>%
  mutate(SD_Ct = sd(Ct), Mean_Ct = mean(Ct)) %>%
  ungroup()

# Compared to the rest of the data, which sets of technical replicates have
# abnormally high SDs? Use the upper bound of a one-sided 95% interval on the
# mean of the observed SDs as the cutoff.
# NOTE(review): unique() collapses groups that happen to share an identical SD
# into a single observation; one SD per group may be intended — confirm.
SDs <- unique(file$SD_Ct)
# na.rm guards against single-replicate groups, whose sd() is NA and would
# otherwise make the cutoff NA and break every later comparison.
SDs_mean <- mean(SDs, na.rm = TRUE)
SDs_SD <- sd(SDs, na.rm = TRUE)
n <- sum(!is.na(SDs))
error <- qnorm(0.975) * SDs_SD / sqrt(n)
cutoff <- SDs_mean + error
# Failsafe in case the margin is very low: we do not want to exclude decent
# samples, so never let the cutoff drop below 0.3 Ct.
if (cutoff < 0.3) {
  cutoff <- 0.3
}
# Flag the groups of technical replicates whose SD exceeds the cutoff
file <- file %>% mutate(high_SD = SD_Ct > cutoff)

# To spot the outlier in a flagged triplet, compare each data point to the
# other two and drop the point whose difference is > cutoff in both
# comparisons. If all pairwise differences exceed the cutoff, the entire
# sample is dropped.
test <- file %>% filter(high_SD == TRUE)
# Placeholder logical column, filled in per group below
test$drop <- NA

for (x in unique(test$Name)) {
  values <- test %>% filter(Name == x) %>% ungroup() %>% select(Ct) %>% unlist()
  if (length(values) == 3) {
    comp1 <- abs(values[1] - values[2])
    comp2 <- abs(values[1] - values[3])
    comp3 <- abs(values[2] - values[3])
    # Outputs (FALSE = keep, TRUE = drop). The original stacked `if`s were not
    # mutually exclusive and silently overwrote `output` when two comparisons
    # were both below the cutoff; this else-if chain is ordered to reproduce
    # that effective last-write-wins precedence (comp3, then comp2, then comp1)
    # while making the exclusivity explicit.
    if (comp3 < cutoff) {
      # 2 and 3 agree, so 1 caused the high SD
      output <- c(TRUE, FALSE, FALSE)
    } else if (comp2 < cutoff) {
      # 1 and 3 agree, so 2 caused the high SD
      output <- c(FALSE, TRUE, FALSE)
    } else if (comp1 < cutoff) {
      # 1 and 2 agree, so 3 caused the high SD
      output <- c(FALSE, FALSE, TRUE)
    } else {
      # No pair agrees: drop the entire sample
      output <- c(TRUE, TRUE, TRUE)
    }
  } else {
    # Not a full triplicate: the outlier cannot be resolved, so drop the whole
    # biological replicate. Covering every non-3 count also fixes the original
    # bug where a group of e.g. 4 replicates fell through all branches and
    # silently reused `output` from the previous loop iteration.
    output <- rep(TRUE, length(values))
  }
  # Write the flags straight back onto the matching rows (filter() preserves
  # row order, so positions line up). This replaces the original full_join()
  # round-trip that produced duplicate rows with NA flags.
  test$drop[test$Name == x] <- output
}
# The join-based flagging can leave duplicated rows whose drop flag is NA;
# keep only rows carrying a real flag. (A no-op if no NA rows were produced.)
test <- test %>% filter(!is.na(drop))

# Attach the drop flag to the full table; rows from groups that were never
# flagged as high-SD come back with drop = NA here...
file <- left_join(file, test, by = c("Name", "Ct", "Mean_Ct", "SD_Ct", "high_SD"))
# ...which means "keep", so convert NA to FALSE
file <- file %>% mutate(drop = coalesce(drop, FALSE))
# Only keep the replicates that were not flagged for dropping
file <- file %>% filter(drop == FALSE)
# Reduce to the original Name/Ct columns for output (explicit names instead
# of positional [, 1:2])
file <- file %>% select(Name, Ct)
# Write the cleaned table
fwrite(file, file_input[2])

# Also write the flagged rows next to the main output.
# NOTE(review): this includes every high-SD row with its drop flag, not only
# the rows that were actually dropped — confirm that is the intent.
file_name <- basename(file_input[2])
directory <- dirname(file_input[2])
dropped_outfile <- file.path(directory, paste0("DROPPED_", file_name))
fwrite(test, dropped_outfile)