-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_outlying_bioreps.R
110 lines (92 loc) · 3.65 KB
/
remove_outlying_bioreps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#Load libraries
library(dplyr)
library(data.table)
library(tidyr)
library(tibble)
library(stringr)
# Take input/output file paths on the command line so the script can be
# chained in pipelines: Rscript remove_outlying_bioreps.R in.csv out.csv
file_input <- commandArgs(trailingOnly = TRUE)
# Read the qPCR table as a data.table
file <- fread(file_input[1])
file <- file %>% select(-c(Ct, SEM, most_stable_CT))
# Split by isoform so we can work out which one is the normalizer
ss <- file %>% filter(isoform == "ss")
iu <- file %>% filter(isoform == "iu")
# We can only remove outliers from the normalizED, not the normalizER
# (the normalizer has dCT == 0 in every row by construction).
# NOTE: the previous check summed dCT, which would wrongly classify a set of
# non-zero values that happen to cancel out as the normalizer; all() is exact.
if (all((ss %>% select(dCT) %>% unlist()) == 0)) {
  file <- iu
  other_file <- ss
} else {
  file <- ss
  other_file <- iu
}
# Per-timepoint mean and SD of dCT across biological replicates
file_summary <- file %>%
  group_by(time) %>%
  summarize(Mean_dCt = mean(dCT), SD_dCT = sd(dCT))
# If only 1 replicate is left, sd() returns NA; force that to 0
file_summary <- file_summary %>% mutate(SD_dCT = ifelse(is.na(SD_dCT), 0, SD_dCT))
# Join key is the only shared column; naming it silences the join message
file <- left_join(file, file_summary, by = "time")
# Compared to the rest of the data, which replicate groups have abnormally
# high SDs? One distinct SD per group goes into the reference distribution.
SDs <- file %>% ungroup() %>% select(SD_dCT) %>% distinct() %>% unname() %>% unlist()
SDs_mean <- mean(SDs)
SDs_SD <- sd(SDs)
n <- length(SDs)
# 95% confidence half-width on the mean of the group SDs
error <- qnorm(0.975) * SDs_SD / sqrt(n)
# Failsafe: sd() is NA when only one distinct SD exists, which would crash
# the comparison with "missing value where TRUE/FALSE needed"; and a very
# small error would exclude decent samples (mainly an issue for sets with
# many replicates, where the half-width is likely to be tiny)
if (is.na(error) || error < 0.2) {
  error <- 0.2
}
cutoff <- SDs_mean + error
# Flag replicate groups whose SD exceeds the cutoff
file <- file %>% mutate(high_SD = SD_dCT > cutoff)
# To spot the outlier in a flagged triplet, compare each point to the other
# two and drop the point whose difference exceeds the cutoff in BOTH
# comparisons. If all pairwise differences exceed the cutoff, drop the
# entire sample.
test <- file %>% filter(high_SD == TRUE)
test <- test %>% add_column(drop = NA)
test <- transform(test, drop = as.logical(drop))
# Only assess groups with exactly 3 replicates (no stats on 1 or 2 points)
filter_test <- test %>% group_by(time) %>% summarize(n = n()) %>% filter(n == 3) %>% select(time) %>% unlist %>% unname()
test <- test %>% filter(time %in% filter_test)
for (x in unique(test$time)) {
  values <- test %>% filter(time == x) %>% ungroup() %>% select(dCT) %>% unlist()
  print(values)  # debug: show the triplet being assessed
  comp1 <- abs(values[1] - values[2])
  comp2 <- abs(values[1] - values[3])
  comp3 <- abs(values[2] - values[3])
  # outputs (FALSE = keep, TRUE = drop)
  # NOTE(review): if more than one pairwise difference falls below the
  # cutoff, the later branch overwrites the earlier one (comp3 wins over
  # comp2 wins over comp1). Preserved as-is from the original logic.
  # if 1-2 < cutoff, then point 3 caused the high SD
  if (comp1 < cutoff) {
    output <- c(FALSE, FALSE, TRUE)
  }
  # if 1-3 < cutoff, then point 2 caused the high SD
  if (comp2 < cutoff) {
    output <- c(FALSE, TRUE, FALSE)
  }
  # if 2-3 < cutoff, then point 1 caused the high SD
  if (comp3 < cutoff) {
    output <- c(TRUE, FALSE, FALSE)
  }
  # if all differences exceed the cutoff, drop the whole sample
  if (comp1 >= cutoff & comp2 >= cutoff & comp3 >= cutoff) {
    output <- c(TRUE, TRUE, TRUE)
  }
  drop <- test %>% filter(time == x)
  # Assign by column NAME: the previous drop[, 9] = output relied on a
  # positional index that silently corrupts data if the layout changes
  drop$drop <- output
  # The assigned rows don't match the NA-drop originals, so full_join
  # appends them; the NA duplicates are filtered out after the loop
  test <- full_join(test, drop)
}
# The in-loop full_join duplicated rows with drop = NA; keep only the
# assigned copies
test <- test %>% filter(!is.na(drop))
# Join the assessed high-SD rows (test) back onto the full table (file)
file <- left_join(file, test)
# Rows that were never assessed have drop = NA: mark them as keep (FALSE).
# Address the column by NAME; the previous file[,9][is.na(file[,9])] = F
# relied on a fragile positional index
file$drop[is.na(file$drop)] <- FALSE
# Keep only rows not marked for dropping
file <- file %>% filter(drop == FALSE)
# Strip the working columns (means, SDs, flags) back to the original five
file <- file[, 1:5]
# Reattach the untouched normalizer isoform
file <- rbind(file, other_file)
# Write the cleaned table
fwrite(file, file_input[2])
# Also write the dropped/assessed rows alongside the main output
file_name <- basename(file_input[2])
directory <- dirname(file_input[2])
dropped_outfile <- paste0(directory, "/DROPPED_", file_name)
fwrite(test, dropped_outfile)