-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathSynTracker.R
311 lines (255 loc) · 17.2 KB
/
SynTracker.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#requirements
library(DECIPHER)
library(tidyverse)
library(parallel)
library(tools)
############
# fucntions
###########
# A function that combines two stages and is applied on each region:
# First stage: Synteny analysis using DECIPHER - calculate all vs. all hits
# Second stage: Calculate synteny score for each comparison
synteny_analysis_per_region<-function(inpath, region_name, tmp_folder, intermediate_file_folder, logfile) {
# Define a data frame that will hold the synteny scores for all the comparisons in the region:
per_region_table<-data.frame("sample1" = character(),
"sample2"= character(),
"length1" = integer(),
"length2" = integer() ,
"overlap" = integer(), #accomulative length of overlapping regions
"Blocks" = integer(), # number of synteny blocks per pairwise comparison
"synteny_score" = integer(), stringsAsFactors = FALSE)
cat("\nRunning synteny analysis using Decipher for region ", region_name, sep = "\n" )
seqs <- readDNAStringSet(inpath)
db<-paste0(tmp_folder,region_name)
flag<-0
for (i in seq_along(seqs)) {
# The following line adds sequences to the DECIPHER DB:
###########################
# IMPORTANT:
# SPLITTING THE names(seqs[i]) yields the "sample.xxx" from the fasta header of the sequence - therefore only one sequence per sample will be used.
# If the str_split is removed, potentially more than one contig per sample can be compared - i.e,, substrains in the same sample.
Seqs2DB(seqs[i], "XStringSet", db, str_split(names(seqs[i]), "_")[[1]][1])
###########################
flag<-flag+1
}
# only run DECIPHER analysis if we have more than 1 sequence matching this specific region.
if (flag>1) {
try_catch <- function(exprs) {!inherits(try(eval(exprs)), "try-error")}
# Synteny run for this region finished successfully and returned a synteny object
if (try_catch(synteny_object_region<-FindSynteny(db, maxSep=15, maxGap = 15))) {
cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " " )
cat("\nSynteny analysis for region", region_name, "finished successfully\n", sep = " ", file=logfile, append=TRUE)
# Synteny analysis for this region returned an error - return an empty matrix
} else{
cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " ")
cat("\nSynteny analysis for region", region_name, "could not be completed\n", sep = " ", file=logfile, append=TRUE)
unlink(db) # Remove the DECIPHER db file
# Add a '_done' suffix to the region-specific file in the blastdbcmd folder
# (so this region will not be analysed again in case the user repeats the run in continue mode)
file.rename(inpath, paste0(inpath,"_done"))
return(per_region_table)
}
# Otherwise, return an emtpy matrix
} else {
cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " ")
cat("\nCannot perform synteny analysis for region", region_name, "- only one hit was found\n", sep = " ", file=logfile, append=TRUE)
# Remove the DECIPHER db file
unlink(db)
# Add a '_done' suffix to the region-specific file in the blastdbcmd folder
# (so this region will not be analysed again in case the user repeats the run in continue mode)
file.rename(inpath, paste0(inpath,"_done"))
return(per_region_table)
}
# The synteny object was created but something is wrong and it contains no real information
if (nrow(synteny_object_region[2,1][[1]]) == 0){
unlink(db) # Remove the DECIPHER db file
# Add a '_done' suffix to the region-specific file in the blastdbcmd folder
# (so this region will not be analysed again in case the user repeats the run in continue mode)
file.rename(inpath, paste0(inpath,"_done"))
return(per_region_table)
}
# Remove the DECIPHER db file
unlink(db)
# Add a '_done' suffix to the region-specific file in the blastdbcmd folder
# (so this region will not be analysed again in case the user repeats the run in continue mode)
file.rename(inpath, paste0(inpath,"_done"))
# Continue to calculating the syntey scores only if the synteny analysis returned a valid object
cat("\nCalculating synteny scores for region ", region_name, sep = "\n" )
# for each two samples, create the values to be kept in the dataframe (assing to one row).
for (i in 2:ncol(synteny_object_region)) {
for (j in 1:(i-1)) {
sample1<-as.character(str_split(colnames(synteny_object_region)[j], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
sample2<-as.character(str_split(colnames(synteny_object_region)[i], "_")[[1]][1]) # pay attention to the str_split: could (and should) be changed to suit other naming formats !
length1<-synteny_object_region[j,j][[1]]
length2<-synteny_object_region[i,i][[1]]
overlap<-sum(synteny_object_region[j,i][[1]][,4])
blocks<-nrow(synteny_object_region[i,j][[1]])
# calculate the synteny score
if (length1>length2) {
syn_score<-1+log10((overlap/length2)/blocks)
}
else {
syn_score<-1+log10((overlap/length1)/blocks)
}
temprow<-data.frame(sample1,sample2,length1,length2,overlap,blocks,syn_score)
per_region_table<-rbind(per_region_table, temprow)
}
}
cat("\nSynteny scores calculation for region", region_name, "finished successfully\n", sep = " ")
cat("\nSynteny scores calculation for region", region_name, "finished successfully\n", sep = " ", file=logfile, append=TRUE)
# Save the synteny scores object of the current region to be used in a continuing run when needed
synteny_scores_object_file_name = paste0(region_name, ".rds")
saveRDS(per_region_table, file = paste0(intermediate_file_folder, synteny_scores_object_file_name))
return(per_region_table)
}
# add the names (ref_genome_header + position) to the dataframe
add_names<-function(dfs, names) {
dfs %>% mutate(ref_genome_region=names)
}
# subsample_regions: a function to sample x regions per pairwise comparison and calculate APSS (Average Pairwise Synteny Scores)
# inputs:
# 1. big_organized_dfs = dataframe that holds per-region pairwise comparisons (multiple regions, multiple pairs)
# 2. subsampling_value = how many regions/pairwise to sample.
# output: dataframe with APSS values (col name is "APSS"), based on "subsampling_value" regions per pair. Pairs with <"subsampling_value" regions are filtererd out.
subsample_regions<-function(big_organized_dfs, subsampling_value, set_seed_arg) {
if(set_seed_arg != 0) {
set.seed(set_seed_arg)
cat("Using set.seed for reproducable subsampling. Seed: ", set_seed_arg, sep = " " )
cat("\n\n")
}
newdf<-big_organized_dfs %>%
# pay attention to the grouping variable below - should match the groups specified in the "synteny_score" function
group_by(Sample1, Sample2) %>%
filter(n() > subsampling_value-1) %>%
sample_n(subsampling_value) %>% #subsample "subsampling_value" regions from each group
mutate(regions = n()) %>%
summarise(APSS=mean(Synteny_score), Compared_regions=mean(regions)) %>%
return(newdf)
}
###########################################################################################################################
# Getting the arguments from the python script
args <- commandArgs(trailingOnly = TRUE)
genome_name <- args[1]
old_new_names_file <- args[2] # Sample names dictionary file
blastdbcmd_output_path <- args[3] # The directory in which the fasta sequences (the output of blastdbcmd) are located
output_folder <- args[4] # The destination folder for the per-genome output tables
output_summary_folder <- args[5] # The destination folder for the summary output tables (all genomes together)
tmp_folder <- args[6] # A temporary folder for the db files - should be deleted in the end
intermediate_file_folder <- args[7] # In continue mode, it contains the previously processed regions (if any)
set_seed_arg <- as.integer(args[8]) # an integer to set the seed (if 0 - do not use seed)
core_number <- as.integer(args[9])
logfile <- args[10] # The path of the logfile
######################################################################################################
# Define an empty list for the synteny scores per region dfs
synteny_scores_dfs_total <- list()
total_region_names <- character()
# Check if there are files in the intermediate_file_folder. If so, it is a continue mode run:
# 1. Read the already processed regions to a list of dfs
# 2. Run the synteny analysis only for the regions that have not been processed yet
# 3. Unite the two lists of dfs for final results
checkfiles <-list.files(path=intermediate_file_folder, full.names=TRUE)
if(length(checkfiles)>0) {
cat("\nFound", length(checkfiles), "regions that were already processed in a previous run\n", sep = " ", file=logfile, append=TRUE)
# Create a list of region names
processed_region_names<-""
for (i in 1:length(checkfiles)) {processed_region_names[i]<-basename(checkfiles[i])} # extract the file names from the full path
processed_region_names<-file_path_sans_ext(processed_region_names) # remove file extensions
total_region_names <- c(processed_region_names)
# Read the saved RDS score-per-region objects
for (i in 1:length(checkfiles)) {
scores_per_region_df <- readRDS(checkfiles[i])
scores_per_region_df_list <- list(scores_per_region_df)
synteny_scores_dfs_total <- c(synteny_scores_dfs_total, scores_per_region_df_list)
}
}
# List the fasta files from the blastdbcmd output
# (files that have already been processed will have '_done' suffix and shouldn't be listed)
filepaths <-list.files(path=blastdbcmd_output_path, full.names=TRUE, pattern="fasta$")
# Make sure that there are files that haven't been processed yet in order to run the DECIPHER analysis
if(length(filepaths)>0) {
print("\nFound regions that haven't been processed yet\n")
cat("\nFound", length(filepaths), "regions that haven't been processed yet\n", sep = " ", file=logfile, append=TRUE)
region_names<-""
for (i in 1:length(filepaths)) {region_names[i]<-basename(filepaths[i])} # extract the file names from the full path
region_names<-file_path_sans_ext(region_names) # remove file extensions
# Run the Decipher synteny analysis (in multi-core)
print("Running synteny analysis using Decipher...")
cat("\nGoing to run synteny analysis using Decipher for", length(filepaths), "regions...\n", sep = " ", file=logfile, append=TRUE)
# Run synteny analysis, including the calculation of the synteny scores, for each region
# Returns an object containing all the scores for all the regions
synteny_scores_dfs<-mcmapply(synteny_analysis_per_region, filepaths, region_names, tmp_folder, intermediate_file_folder, logfile, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
cat("\nNumber of processed regions (not necessarily valid:)", length(synteny_scores_dfs), "\n", sep = " ", file=logfile, append=TRUE)
synteny_scores_dfs_total <- c(synteny_scores_dfs_total, synteny_scores_dfs)
total_region_names <- c(total_region_names, region_names)
}
cat("\nTotal number of processed regions in current run and previous runs (not necessarily valid:)", length(synteny_scores_dfs_total), "\n", sep = " ", file=logfile, append=TRUE)
names(synteny_scores_dfs_total)<-total_region_names
# Filter out empty data-frames of regions in which synteny has failed or returned an invalid object
synteny_scores_dfs_filtered<-Filter(function(x) nrow(x) > 0, synteny_scores_dfs_total)
#third part: add names to each table in a new column, merge to one big dataframe, arrange it.
improved_dfs<-map2(synteny_scores_dfs_filtered, names(synteny_scores_dfs_filtered), add_names)
big_dfs<-bind_rows(improved_dfs) # bind to one dataframe
###################################
# Read the sample names dictionary file and add the original sample names to the table
old_new_names_minimal <- read.table(file=old_new_names_file, header = T, sep="\t")
big_dfs<-left_join(big_dfs, old_new_names_minimal, by=c("sample1"= "new.sample.name")) %>%
dplyr::rename("temp sample1"="sample1", "sample1" := "old.sample.name" )
big_dfs<-left_join(big_dfs, old_new_names_minimal, by=c("sample2"= "new.sample.name")) %>%
dplyr::rename("temp sample2"="sample2", "sample2" := "old.sample.name" )
# change order of sample1 and sample2, according to some rules, so that the order will be uniform throughout the table
# i.e. sampleX-sampleY will always be like that and not sampleY-sampleX ==> if the order is not uniform it will be treated as two different comparisons
big_dfs<-big_dfs %>% mutate(replaced = ifelse(sample1>sample2, "yes", "no")) # add a column specifying if the order of sample 1 and 2 should be replaced (for the sake of Grouping correctly in the next lines)
#change the order of sample specific fields
big_organized_dfs<-big_dfs %>%
mutate(temp=ifelse(replaced == "yes", as.character(sample1), "no need"), #if replaced == yes: temp will hold sample1
sample1 = ifelse(replaced == "yes", as.character(sample2), as.character(sample1)), #sample1 will hold sample2
sample2=ifelse(replaced == "yes", temp, as.character(sample2))) #sample2 will hold temp (the original sample1...)
# Create a big summary table containing all the comparisons with the original sample names, sorted by sample1, sample2, region
# (The sorting is important for the reproducibility of the subsampling given the same seed)
big_organized_dfs<-big_organized_dfs %>% arrange(sample1,sample2, ref_genome_region)
# Edit this table for the user to have only the necessary information
big_organized_dfs_final<-big_organized_dfs %>% select(sample1, sample2, ref_genome_region, length1, length2, overlap, blocks, syn_score) %>%
dplyr::rename(Sample1="sample1", Sample2="sample2", Region="ref_genome_region", Length1="length1", Length2="length2", Overlap="overlap", Blocks="blocks", Synteny_score="syn_score") %>%
dplyr::distinct()
#genome_big_table_path<-paste0(output_folder, genome_name , "_synteny_scores_per_region.csv")
#write.table(big_organized_dfs_final, file=genome_big_table_path, sep=",", row.names = FALSE)
# Add a column for the ref genome in the final table
summary_organized_dfs<-big_organized_dfs_final %>% mutate(Ref_genome=genome_name, .before = Sample1)
# Save the genome-specific final table to a file
genome_big_table_path<-paste0(output_folder, genome_name , "_synteny_scores_per_region.csv")
write.table(summary_organized_dfs, file=genome_big_table_path, sep=",", row.names = FALSE)
# Append the genome-specific final table to the main table, that contains all the ref-genomes together
summary_table_path<-paste0(output_summary_folder, "synteny_scores_per_region.csv")
write.table(summary_organized_dfs, file=summary_table_path, sep=",", row.names = FALSE, col.names=FALSE, append=TRUE)
##################
# filter and subsample regions
##################
# find the maximal number of regions/pairwise-comparison:
biggest_group<-max(big_organized_dfs_final %>%
group_by(Sample1, Sample2) %>%
mutate(regions = n()) %>%
ungroup() %>%
pull(regions))
# create a list of data frames, with different regions subsampling values (i.e., subsample x regions per pair-wise comparison)
# the conditons is used to avoid subsampling more regions than there are in the biggest group (could result in an error)
regions_sampled<-c(40,60,80,100,200)
for (i in 1:length(regions_sampled)) {
if(biggest_group >= regions_sampled[i]){
grouped_df<-as.data.frame(mapply(subsample_regions, list(big_organized_dfs_final), regions_sampled[i], set_seed_arg, SIMPLIFY = F))
grouped_df_with_genome<-grouped_df %>% mutate(Ref_genome=genome_name, .before = Sample1)
write.table(grouped_df_with_genome, file=paste(output_folder, genome_name, "_avg_synteny_scores_", regions_sampled[i], "_regions.csv", sep=""), row.names=FALSE, sep=",")
write.table(grouped_df_with_genome, file=paste(output_summary_folder, "avg_synteny_scores_", regions_sampled[i], "_regions.csv", sep=""),
row.names=FALSE, col.names=FALSE, append=TRUE, sep=",")
}
}
# Create an additional output table without subsampling
grouped_df_avg_all<-big_organized_dfs_final %>%
group_by(Sample1, Sample2) %>%
mutate(regions = n()) %>%
summarise(APSS=mean(Synteny_score), Compared_regions=mean(regions))
grouped_df_all_with_genome<-grouped_df_avg_all %>% mutate(Ref_genome=genome_name, .before = Sample1)
write.table(grouped_df_all_with_genome, file=paste(output_folder, genome_name, "_avg_synteny_scores_all_regions.csv", sep=""), row.names=FALSE, sep=",")
write.table(grouped_df_all_with_genome, file=paste(output_summary_folder, "avg_synteny_scores_all_regions.csv", sep=""),
row.names=FALSE, col.names=FALSE, append=TRUE, sep=",")
unlink(tmp_folder, recursive = T)
cat("\nFinished synteny analysis for:", genome_name, sep = "\n" )