R scripts:

Create a new function 'synteny_analysis_per_region', which includes both synteny analysis (by DECIPHER) and the calculation of synteny scores. Handle better cases, in which the synteny analysis fails for some reason. Return empty matrices for failed regions, which are filtered out later.
leylabmpi · Jan 8, 2024 · 8f4be9b · 8f4be9b
1 parent 5829c2c
commit 8f4be9b
Show file tree

Hide file tree

Showing 4 changed files with 130 additions and 42 deletions.
diff --git a/syntracker.py b/syntracker.py
@@ -372,7 +372,7 @@ def main():
 
                 # According to the number of output files - R has finished successfully
                 if outfiles_num == len(config.subsampling_lengths) + 1:
-                    print("\nFound synteny calculation output files- no need to run this stage again")
+                    print("\nFound synteny calculation output files - no need to run this stage again")
                     config.genomes_dict[ref_genome]['finished_R'] = 1
                 else:
                     print("\nNumber of output files doesn't match the expected - run the synteny calculation again")

diff --git a/syntracker_R_scripts/SynTracker.R b/syntracker_R_scripts/SynTracker.R
@@ -18,7 +18,7 @@ tmp_folder <- args[6] # A temporary folder for the db files - should be deleted
 intermediate_file_folder <- args[7] # If not empty - the folder for saving R intermediate objects (if empty - do not save them)
 set_seed_arg <- as.integer(args[8]) # an integer to set the seed (if 0 - do not use seed)
 core_number <- as.integer(args[9])
-metadata_file <- args[10] # If not empty - teh path of the metadata file (if empty - there is no metadata)
+metadata_file <- args[10] # If not empty - the path of the metadata file (if empty - there is no metadata)
 
 ######################################################################################################
 
@@ -33,39 +33,28 @@ if(metadata_file=='NA') {
 
 # List the fasta files from the blastdbcmd output
 filepaths <-list.files(path=blastdbcmd_output_path, full.names=TRUE)
-gene_names<-""
-for (i in 1:length(filepaths)) {gene_names[i]<-basename(filepaths[i])} # extract the file names from the full path
-gene_names<-file_path_sans_ext(gene_names) # remove file extensions
+region_names<-""
+for (i in 1:length(filepaths)) {region_names[i]<-basename(filepaths[i])} # extract the file names from the full path
+region_names<-file_path_sans_ext(region_names) # remove file extensions
 
 # Run the Decipher synteny analysis (in multi-core)
 print("Running synteny analysis using Decipher...")
-objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
-names(objects)<-gene_names
-print("Decipher analysis finished successfully")
 
-# identify iterations of synteny_anlysis that failed for some reason and filter these elements out...
-bad_objects_elements <- sapply(objects, inherits, what = "try-error")
-objects<-objects[!bad_objects_elements]
-narrow<-Filter(function(x) nrow(x) > 1, objects) #filter elements with comparisons of less than 2 valid seqs, if this happened.
-rm(objects)
+# Run synteny analysis, including the calculation of the synteny scores, for each region
+# Returns an object containing all the scores for all the regions
+synteny_scores_dfs<-mcmapply(synteny_analysis_per_region, filepaths, region_names, tmp_folder, intermediate_file_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
 
-# If the user asked to save intermediate objects - save the narrow ds to the right folder
-if(intermediate_file_folder != 'NA') {
-    saveRDS(narrow, file = paste0(intermediate_file_folder, "narrow.rds"))
-}
+names(synteny_scores_dfs)<-region_names
 
-# second part: Process synteny objects (multi-core processing)
-cat("\nCalculating synteny scores", "", sep = "\n" )
-dfs<-mcmapply(synteny_scores, narrow, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
-bad_dfs_elements <- sapply(dfs, inherits, what = "try-error") #identify iterations of synteny scores that failed for some reason. Mostly (although very rare), those are two hits for the same region
-dfs<-dfs[!bad_dfs_elements] # and filter these elements out...
+# Filter out empty data-frames of regions in which synteny has failed or returned an invalid object
+synteny_scores_dfs_filtered<-Filter(function(x) nrow(x) > 0, synteny_scores_dfs)
 
 if(intermediate_file_folder != 'NA') {
-    saveRDS(dfs, file = paste0(intermediate_file_folder,"dfs.rds"))
+    saveRDS(synteny_scores_dfs_filtered, file = paste0(intermediate_file_folder,"synteny_scores_tables.rds"))
 }
 
 #third part: add names to each table in a new column, merge to one big dataframe, arrange it.
-improved_dfs<-map2(dfs, names(dfs), add_names)
+improved_dfs<-map2(synteny_scores_dfs_filtered, names(synteny_scores_dfs_filtered), add_names)
 big_dfs<-bind_rows(improved_dfs)  # bind to one dataframe
 
 ###################################

diff --git a/syntracker_R_scripts/SynTracker_functions.R b/syntracker_R_scripts/SynTracker_functions.R
@@ -5,19 +5,129 @@ library(DECIPHER)
 # fucntions
 ###########
 
+# A function that combines two stages and is applied on each region:
+# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits
+# Second stage: Calculate synteny score for each comparison
+synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder) {
+
+    # Define a data frame that will hold the synteny scores for all the comparisons in the region:
+    per_region_table<-data.frame("sample1" = character(),
+                               "sample2"= character(),
+                               "length1" = integer(),
+                               "length2" = integer() ,
+                               "overlap" = integer(), #accomulative length of overlapping regions
+                               "Blocks" = integer(), # number of synteny blocks per pairwise comparison
+                               "synteny_score" = integer(), stringsAsFactors = FALSE)
+
+
+    cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" )
+    seqs <- readDNAStringSet(inpath)
+    db<-paste0(tmp_folder,region_name)
+    flag<-0
+    for (i in seq_along(seqs)) {
+        # The following line adds sequences to the DECIPHER DB:
+        ###########################
+        #   IMPORTANT:
+        # SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used.
+        # If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample.
+        Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1])
+        ###########################
+        flag<-flag+1
+    }
+
+    # only run DECIPHER analysis if we have more than 1 sequence matching this specific region.
+    if (flag>1) {
+
+        try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")}
+
+        # Synteny run for this region finished successfully and returned a synteny object
+        if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) {
+            cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " )
+
+        # Synteny analysis for this region returned an error - return an empty matrix
+        } else{
+            cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " " )
+
+            # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
+            if(intermediate_file_folder != 'NA') {
+                error_file_name = paste0("synteny_object_error_", region_name, ".rds")
+                saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
+            }
+
+            return(per_region_table)
+        }
+
+    # Otherwise, return an emtpy matrix
+    } else {
+        cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " " )
+        return(per_region_table)
+    }
+
+    # The synteny object was created but something is wrong and it contains no real information
+    if (nrow(synteny_object_region[2,1][[1]]) == 0){
+
+        # If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
+        if(intermediate_file_folder != 'NA') {
+          error_file_name = paste0("synteny_object_error_", region_name, ".rds")
+          saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
+        }
+
+        return(per_region_table)
+
+    # Sunteny object is fine
+    } else{
+
+        # If the user asked to save intermediate objects - save the synteny object of the current region
+        if(intermediate_file_folder != 'NA') {
+          synteny_object_file_name = paste0("synteny_object_", region_name, ".rds")
+          saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, synteny_object_file_name))
+        }
+    }
+
+    # Continue to calculating the syntey scores only if the synteny analysis returned a valid object
+    cat("\nCalculating synteny scores for region ", region_name, sep = "\n" )
+
+    # for each two samples, create the values to be kept in the dataframe (assing to one row).
+    for (i in 2:ncol(synteny_object_region)) {
+        for (j in 1:(i-1)) {
+            sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
+            sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
+            length1<-synteny_object_region[j,j][[1]]
+            length2<-synteny_object_region[i,i][[1]]
+            overlap<-sum(synteny_object_region[j,i][[1]][,4])
+            blocks<-nrow(synteny_object_region[i,j][[1]])
+
+            # calculate the synteny score
+            if (length1>length2) {
+                syn_score<-1+log10((overlap/length2)/blocks)
+            }
+            else {
+                syn_score<-1+log10((overlap/length1)/blocks)
+            }
+            temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score)
+
+            per_region_table<-rbind(per_region_table, temprow)
+        }
+    }
+
+    return(per_region_table)
+}
+
+
+#########################################################################################
+# The old functions - not in use anymore
+#########################################################################################
+
 # synteny_analysis: function to run Decipher analysis for each gene
 # inputs: 1. inpath = path to fasta file(s)
 #         2. gene name = central region identifier 
 #         3. tmp_folder = temporary folder, to store the DECIPHER data. 
 
 # output: synteny object
-synteny_analysis<-function(inpath, gene_name, tmp_folder) { 
-  #cat("   starting first function - synteny analysis")
-  fas<-inpath
-  seqs <- readDNAStringSet(fas)
-  #print(fas)
-  #print(gene_name)
-  db<-paste0(tmp_folder,gene_name)
+synteny_analysis<-function(inpath, region_name, tmp_folder) {
+  #fas<-inpath
+  seqs <- readDNAStringSet(inpath)
+  db<-paste0(tmp_folder,region_name)
   flag<-0
   for (i in seq_along(seqs)) {
       # The following line adds sequences to the DECIPHER DB: 
@@ -206,12 +316,3 @@ tree_making<-function(synteny_matrix) {
 
   #return(tree_upgma) # put a # here if want to return a plot instead!!!!!!
 }      
-
-
-
-
-
-
-
-
-
diff --git a/syntracker_first_stage/blast.py b/syntracker_first_stage/blast.py
@@ -129,7 +129,6 @@ def run_blastn(query_file, outfile, blast_db_file_path, minimal_identity, minima
         print(str(command))
         print(err)
         return 1
-        #sys.exit()
 
     return 0
 
@@ -147,7 +146,6 @@ def run_blastdbcmd(entry, start, end, strand, minimal_full_length, outfile, outf
         print(command)
         print(err)
         return 1
-        #sys.exit()
 
     # Read the tmp file to check the sequence's length
     if os.path.isfile(outfile_tmp):