Skip to content

Commit

Permalink
R scripts:
Browse files Browse the repository at this point in the history
Create a new function 'synteny_analysis_per_region', which includes both synteny analysis (by DECIPHER) and the calculation of synteny scores.
Handle better cases, in which the synteny analysis fails for some reason.
Return empty matrices for failed regions, which are filtered out later.
  • Loading branch information
inbalpaz authored and ipaz committed Jan 8, 2024
1 parent 5829c2c commit 8f4be9b
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 42 deletions.
2 changes: 1 addition & 1 deletion syntracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def main():

# According to the number of output files - R has finished successfully
if outfiles_num == len(config.subsampling_lengths) + 1:
print("\nFound synteny calculation output files- no need to run this stage again")
print("\nFound synteny calculation output files - no need to run this stage again")
config.genomes_dict[ref_genome]['finished_R'] = 1
else:
print("\nNumber of output files doesn't match the expected - run the synteny calculation again")
Expand Down
35 changes: 12 additions & 23 deletions syntracker_R_scripts/SynTracker.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ tmp_folder <- args[6] # A temporary folder for the db files - should be deleted
intermediate_file_folder <- args[7] # If not empty - the folder for saving R intermediate objects (if empty - do not save them)
set_seed_arg <- as.integer(args[8]) # an integer to set the seed (if 0 - do not use seed)
core_number <- as.integer(args[9])
metadata_file <- args[10] # If not empty - teh path of the metadata file (if empty - there is no metadata)
metadata_file <- args[10] # If not empty - the path of the metadata file (if empty - there is no metadata)

######################################################################################################

Expand All @@ -33,39 +33,28 @@ if(metadata_file=='NA') {

# List the fasta files from the blastdbcmd output
filepaths <-list.files(path=blastdbcmd_output_path, full.names=TRUE)
gene_names<-""
for (i in 1:length(filepaths)) {gene_names[i]<-basename(filepaths[i])} # extract the file names from the full path
gene_names<-file_path_sans_ext(gene_names) # remove file extensions
region_names<-""
for (i in 1:length(filepaths)) {region_names[i]<-basename(filepaths[i])} # extract the file names from the full path
region_names<-file_path_sans_ext(region_names) # remove file extensions

# Run the Decipher synteny analysis (in multi-core)
print("Running synteny analysis using Decipher...")
objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
names(objects)<-gene_names
print("Decipher analysis finished successfully")

# identify iterations of synteny_anlysis that failed for some reason and filter these elements out...
bad_objects_elements <- sapply(objects, inherits, what = "try-error")
objects<-objects[!bad_objects_elements]
narrow<-Filter(function(x) nrow(x) > 1, objects) #filter elements with comparisons of less than 2 valid seqs, if this happened.
rm(objects)
# Run synteny analysis, including the calculation of the synteny scores, for each region
# Returns an object containing all the scores for all the regions
synteny_scores_dfs<-mcmapply(synteny_analysis_per_region, filepaths, region_names, tmp_folder, intermediate_file_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)

# If the user asked to save intermediate objects - save the narrow ds to the right folder
if(intermediate_file_folder != 'NA') {
saveRDS(narrow, file = paste0(intermediate_file_folder, "narrow.rds"))
}
names(synteny_scores_dfs)<-region_names

# second part: Process synteny objects (multi-core processing)
cat("\nCalculating synteny scores", "", sep = "\n" )
dfs<-mcmapply(synteny_scores, narrow, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
bad_dfs_elements <- sapply(dfs, inherits, what = "try-error") #identify iterations of synteny scores that failed for some reason. Mostly (although very rare), those are two hits for the same region
dfs<-dfs[!bad_dfs_elements] # and filter these elements out...
# Filter out empty data-frames of regions in which synteny has failed or returned an invalid object
synteny_scores_dfs_filtered<-Filter(function(x) nrow(x) > 0, synteny_scores_dfs)

if(intermediate_file_folder != 'NA') {
saveRDS(dfs, file = paste0(intermediate_file_folder,"dfs.rds"))
saveRDS(synteny_scores_dfs_filtered, file = paste0(intermediate_file_folder,"synteny_scores_tables.rds"))
}

#third part: add names to each table in a new column, merge to one big dataframe, arrange it.
improved_dfs<-map2(dfs, names(dfs), add_names)
improved_dfs<-map2(synteny_scores_dfs_filtered, names(synteny_scores_dfs_filtered), add_names)
big_dfs<-bind_rows(improved_dfs) # bind to one dataframe

###################################
Expand Down
133 changes: 117 additions & 16 deletions syntracker_R_scripts/SynTracker_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,129 @@ library(DECIPHER)
# fucntions
###########

# A function that combines two stages and is applied on each region:
# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits
# Second stage: Calculate synteny score for each comparison
synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder) {

# Define a data frame that will hold the synteny scores for all the comparisons in the region:
per_region_table<-data.frame("sample1" = character(),
"sample2"= character(),
"length1" = integer(),
"length2" = integer() ,
"overlap" = integer(), #accomulative length of overlapping regions
"Blocks" = integer(), # number of synteny blocks per pairwise comparison
"synteny_score" = integer(), stringsAsFactors = FALSE)


cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" )
seqs <- readDNAStringSet(inpath)
db<-paste0(tmp_folder,region_name)
flag<-0
for (i in seq_along(seqs)) {
# The following line adds sequences to the DECIPHER DB:
###########################
# IMPORTANT:
# SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used.
# If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample.
Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1])
###########################
flag<-flag+1
}

# only run DECIPHER analysis if we have more than 1 sequence matching this specific region.
if (flag>1) {

try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")}

# Synteny run for this region finished successfully and returned a synteny object
if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) {
cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " )

# Synteny analysis for this region returned an error - return an empty matrix
} else{
cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " " )

# If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
if(intermediate_file_folder != 'NA') {
error_file_name = paste0("synteny_object_error_", region_name, ".rds")
saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
}

return(per_region_table)
}

# Otherwise, return an emtpy matrix
} else {
cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " " )
return(per_region_table)
}

# The synteny object was created but something is wrong and it contains no real information
if (nrow(synteny_object_region[2,1][[1]]) == 0){

# If the user asked to save intermediate objects - save the corrupted synteny object in an 'error' file
if(intermediate_file_folder != 'NA') {
error_file_name = paste0("synteny_object_error_", region_name, ".rds")
saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, error_file_name))
}

return(per_region_table)

# Sunteny object is fine
} else{

# If the user asked to save intermediate objects - save the synteny object of the current region
if(intermediate_file_folder != 'NA') {
synteny_object_file_name = paste0("synteny_object_", region_name, ".rds")
saveRDS(synteny_object_region, file = paste0(intermediate_file_folder, synteny_object_file_name))
}
}

# Continue to calculating the syntey scores only if the synteny analysis returned a valid object
cat("\nCalculating synteny scores for region ", region_name, sep = "\n" )

# for each two samples, create the values to be kept in the dataframe (assing to one row).
for (i in 2:ncol(synteny_object_region)) {
for (j in 1:(i-1)) {
sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
length1<-synteny_object_region[j,j][[1]]
length2<-synteny_object_region[i,i][[1]]
overlap<-sum(synteny_object_region[j,i][[1]][,4])
blocks<-nrow(synteny_object_region[i,j][[1]])

# calculate the synteny score
if (length1>length2) {
syn_score<-1+log10((overlap/length2)/blocks)
}
else {
syn_score<-1+log10((overlap/length1)/blocks)
}
temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score)

per_region_table<-rbind(per_region_table, temprow)
}
}

return(per_region_table)
}


#########################################################################################
# The old functions - not in use anymore
#########################################################################################

# synteny_analysis: function to run Decipher analysis for each gene
# inputs: 1. inpath = path to fasta file(s)
# 2. gene name = central region identifier
# 3. tmp_folder = temporary folder, to store the DECIPHER data.

# output: synteny object
synteny_analysis<-function(inpath, gene_name, tmp_folder) {
#cat(" starting first function - synteny analysis")
fas<-inpath
seqs <- readDNAStringSet(fas)
#print(fas)
#print(gene_name)
db<-paste0(tmp_folder,gene_name)
synteny_analysis<-function(inpath, region_name, tmp_folder) {
#fas<-inpath
seqs <- readDNAStringSet(inpath)
db<-paste0(tmp_folder,region_name)
flag<-0
for (i in seq_along(seqs)) {
# The following line adds sequences to the DECIPHER DB:
Expand Down Expand Up @@ -206,12 +316,3 @@ tree_making<-function(synteny_matrix) {

#return(tree_upgma) # put a # here if want to return a plot instead!!!!!!
}









2 changes: 0 additions & 2 deletions syntracker_first_stage/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ def run_blastn(query_file, outfile, blast_db_file_path, minimal_identity, minima
print(str(command))
print(err)
return 1
#sys.exit()

return 0

Expand All @@ -147,7 +146,6 @@ def run_blastdbcmd(entry, start, end, strand, minimal_full_length, outfile, outf
print(command)
print(err)
return 1
#sys.exit()

# Read the tmp file to check the sequence's length
if os.path.isfile(outfile_tmp):
Expand Down

0 comments on commit 8f4be9b

Please sign in to comment.