SynTracker.R

#important points:
# 1. the title of each region to be compared is the Refseq title of the choromosome + the location of the region along the sequence, 
# as generated by this python script: select_1kb_core_sequences_for_synteny_ref_genomes.ipynb
# 2. The title of the contigs in each assembly should be named with the Sample ID in the begginging: e.g, ID_NODE_x_coverage_y, etc. 


#requirements
library(DECIPHER)
library(tidyverse)
library(parallel)
library(tools)

###### functions file ######
source("SynTracker_functions.R")
source("add_metadata_fields.R")

############################

# user inputs required (args)
# 1. input library (blastcmddb folder)
# 2. number of cores
# 3. Flag: save RDS/not save RDS ("--intermediate", "--no_indermediate"). If not provided the script fails
# 4. Flag: use set.seed or not for the subsampling of n regions: ("--use.setseed", "--setseed.off").  If not provided the script fails 
# 5. metadata: metadata file, should include the sample ID, and any other relevant fields 

args <- commandArgs(trailingOnly = TRUE) #user input: is inherited from bash script flags
blastcmddb_directory<-args[1]
output_folder<-args[2]
core_number<-args[3]
save_intermediate_objects<-args[4]
stopifnot(save_intermediate_objects %in% c("--intermediate", "--no_indermediate")) 
set_seed_arg<-args[5]
stopifnot(set_seed_arg %in% c("--use.setseed", "--setseed.off")) 
if(length(args)==6) {
  metadata_file<-args[6]
  metadata<-read.csv(file=metadata_file, sep=";", header = TRUE)
  cat("Running analysis with Metadata file\n")
} else {
  metadata="NA"
  cat("Running analysis without Metadata\n")
}

####################
#.   Define and create folders
####################

#setwd(working_directory)
#output_folder<-paste0(working_directory, "/final_output/") 
temp_file_folder<-paste0(output_folder, "/R_temp")
intermediate_file_folder<-paste0(temp_file_folder, "/intermediate_objects/")
if (!dir.exists(output_folder)) {
  dir.create(output_folder)
} else {
  unlink(output_folder, recursive = T)
  dir.create(output_folder)
}
if (!dir.exists(temp_file_folder)) {
  dir.create(temp_file_folder)
  dir.create(intermediate_file_folder)
} else {
  unlink(temp_file_folder, recursive = T)
  dir.create(temp_file_folder)
  dir.create(intermediate_file_folder)
}
paths<-list.files(path=blastcmddb_directory, full.names=T)
cat("file list length is:", length(paths), sep="\n")
pathnames<-""
for (j in 1:length(paths)) {pathnames[j]<-basename(paths[j])} # return the folders that hold the sequences from samples that match each central region
names(paths)<-pathnames


##########################
#. external_func: a wrapper function to run the SynTracker function
# inputs:
# 1. paths: list of folder paths
# 2. path_names: the names of the elements in "paths"
# 3. metadata: the metadata file/argument indicating no file was provided
# 4. temp file folder
# 5. number of cores to use

# Correct: make metadata file optional. Only read if user spcify one. If no metadata file, just output a list of pairs
external_func<-function(paths, path_names, metadata, temp_file_folder, core_number, save_intermediate_objects, set_seed_arg) {
  #create a temp folder 
  tmp_folder<-runif(1, 1000000000000, 9999999999999) %>% round %>% as.character %>% paste0(temp_file_folder,"/tmp", ., "/") 
  dir.create(tmp_folder)
  cat("starting synteny analysis for:", path_names, sep = "\n" )
  # first part: run the synteny_analysis function on each input file
  filepaths <-list.files(path=paths, full.names=TRUE)
  gene_names<-""
  for (i in 1:length(filepaths)) {gene_names[i]<-basename(filepaths[i])} # extract the file from the path
  gene_names<-file_path_sans_ext(gene_names) #remove file extension
  
  objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number) 
  names(objects)<-gene_names 
  bad_objects_elements <- sapply(objects, inherits, what = "try-error") #identify iterations of synteny_anlysis that failed for some reason.
  
  objects<-objects[!bad_objects_elements] # and filter these elements out...
  narrow<-Filter(function(x) nrow(x) > 1, objects) #filter elemnts with comparisons of less than 2 valid seqs, if this happened.
  #print(length(objects))
  rm(objects)
  
  if(save_intermediate_objects == "--intermediate") {
    saveRDS(narrow, file = paste0(intermediate_file_folder,"narrow.rds"))
  }
  
  # second part: Process synteny objects
  dfs<-mcmapply(synteny_scores,narrow, SIMPLIFY = F, mc.preschedule=F,mc.cores=core_number) 
  bad_dfs_elements <- sapply(dfs, inherits, what = "try-error") #identify iterations of synteny scores that failed for some reason. Mostly (although very rare), those are two hits for the same region
  dfs<-dfs[!bad_dfs_elements] # and filter these elements out...
  
  if(save_intermediate_objects == "--intermediate") {
    saveRDS(dfs, file = paste0(intermediate_file_folder,"dfs.rds"))
  }
  #third part: add names to each table in a new column, merge to one big dataframe, arrange it.
  improved_dfs<-map2(dfs, names(dfs), add_names)
  big_dfs<-bind_rows(improved_dfs)  # bind to one dataframe
  
  # change order of sample1 and sample2, according to some rules, so that the order will be uniform throughout the table
  # i.e. sampleX-sampleY will always be like that and not sampleY-sampleX ==> if the order is not uniform it will be treated as two different comparisons
  big_dfs<-big_dfs %>% mutate(replaced = ifelse(sample2>sample1, "yes", "no")) # add a column specifing if the order of sample 1 and 2 should be replaced (for the sake of Grouping correctly in the next lines)
 
  #change the order of sample specific fields
   big_organized_dfs<-big_dfs %>% 
     mutate(temp=ifelse(replaced == "yes", as.character(sample1), "no need"),  #if replaced == yes: temp will hold sample1
            sample1 = ifelse(replaced == "yes", as.character(sample2), as.character(sample1)), #sample1 will hold sample2
            sample2=ifelse(replaced == "yes", temp, as.character(sample2)))  #sample2 will hold temp (the original sample1...)
            
  big_organized_dfs<- big_organized_dfs %>% arrange(sample1,sample2,position_counter) #arrange the order of rows in the table: This is done as in different Operating systems the order is different => leads to different subsampling, even if set.seed() is the same          
  
  species_temp_folder=paste0(intermediate_file_folder,"/",path_names)     
  dir.create(species_temp_folder)
  species_temp_big_table_path<-paste0(species_temp_folder,"/","big_tbl_rstudio.tab")
  write.table(big_organized_dfs,file = species_temp_big_table_path, sep=",", row.names = FALSE) #this should be under a condition
  ##################
  # filter and subsample regions
  ##################
  # find the maximal number of regions/pairwise-comparison:
  biggest_group<-max(big_organized_dfs %>% 
                     group_by(sample1, sample2) %>% 
                     mutate(regions = n()) %>%
                     ungroup() %>%
                     pull(regions))

  # create a list of data frames, with different regions subsampling values (i.e., subsample x regions per pair-wise comparison)
  # the conditons is used to avoid subsampling more regions than there are in the biggest group (could result in an error)
  grouped_list<-list()
  regions_sampled<-c(20,30,40,60,80,100,200)

  #run the subsampling function for values below the maximal nuber of regions/pair:
  for (i in 1:length(regions_sampled)) { 
    ifelse(biggest_group >= regions_sampled[i], 
           grouped_list[i]<-mapply(subsample_regions, list(big_organized_dfs), regions_sampled[i], SIMPLIFY = F), #add the set.seed argument to the function
           next)
  }
  # give names to the elements in the newly filled list
  for (i in 1:length(grouped_list)) {names(grouped_list)[i]<-paste0(path_names, "_",regions_sampled[i])}
  #print("number of regions found:")
  #print(names(grouped_list))
  #print("length of grouped list:")
  #print(length(grouped_list))
  unlink(tmp_folder, recursive = T)
  
  
  if (exists("metadata")) {
    cat("adding metadata fields\n")
    grouped_list<-mapply(add_metadata,grouped_list, MoreArgs = list(metadata), SIMPLIFY = FALSE)
    #grouped_list<-mapply(add_metadata,grouped_list, metadata, SIMPLIFY = FALSE)
    
  }
  
  cat("Finished synteny analysis for:", path_names, sep = "\n" )
  
  species_output<-paste(output_folder, path_names,"/", sep = "")
  dir.create(species_output)
  
  mapply(
       write.table,
       x=grouped_list, file=paste(species_output,names(grouped_list), ".txt", sep=""),
       MoreArgs=list(row.names=FALSE, sep=",")
     )
  return(grouped_list)
}
  

#outlist<-list(grouped_list, grouped_list_figs, big_organized_dfs)
#names(outlist)<-c("pairwise comparison tables", "pairwise comparison figures", "Final table ungrouped")
#return(outlist)


real_tetsting<-mapply(external_func, paths, pathnames, MoreArgs = list(metadata, temp_file_folder, core_number, save_intermediate_objects, set_seed_arg), SIMPLIFY = F)


#write the output tables to files. 
#dir.create("output_grrr")
#for(i in 1:length(real_tetsting)){
#  dir.create(paste0("output_grrr","/",names(real_tetsting[i])))
#  for(idx in 1:length(real_tetsting[[i]])){
#    write.table(real_tetsting[[i]][[idx]],paste0("output_grrr/",names(real_tetsting[i]),"/",names(real_tetsting[[i]][idx]),".csv"), sep = "\t",row.names = FALSE)
#  }
#}