forked from leylabmpi/SynTracker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSynTracker.R
executable file
·200 lines (166 loc) · 9.18 KB
/
SynTracker.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#important points:
# 1. the title of each region to be compared is the Refseq title of the choromosome + the location of the region along the sequence,
# as generated by this python script: select_1kb_core_sequences_for_synteny_ref_genomes.ipynb
# 2. The title of the contigs in each assembly should be named with the Sample ID in the begginging: e.g, ID_NODE_x_coverage_y, etc.
#requirements
library(DECIPHER)
library(tidyverse)
library(parallel)
library(tools)
###### functions file ######
source("SynTracker_functions.R")
source("add_metadata_fields.R")
############################
# user inputs required (args)
# 1. input library (blastcmddb folder)
# 2. number of cores
# 3. Flag: save RDS/not save RDS ("--intermediate", "--no_indermediate"). If not provided the script fails
# 4. Flag: use set.seed or not for the subsampling of n regions: ("--use.setseed", "--setseed.off"). If not provided the script fails
# 5. metadata: metadata file, should include the sample ID, and any other relevant fields
args <- commandArgs(trailingOnly = TRUE) #user input: is inherited from bash script flags
blastcmddb_directory<-args[1]
output_folder<-args[2]
core_number<-args[3]
save_intermediate_objects<-args[4]
stopifnot(save_intermediate_objects %in% c("--intermediate", "--no_indermediate"))
set_seed_arg<-args[5]
stopifnot(set_seed_arg %in% c("--use.setseed", "--setseed.off"))
if(length(args)==6) {
metadata_file<-args[6]
metadata<-read.csv(file=metadata_file, sep=";", header = TRUE)
cat("Running analysis with Metadata file\n")
} else {
metadata="NA"
cat("Running analysis without Metadata\n")
}
####################
#. Define and create folders
####################
#setwd(working_directory)
#output_folder<-paste0(working_directory, "/final_output/")
temp_file_folder<-paste0(output_folder, "/R_temp")
intermediate_file_folder<-paste0(temp_file_folder, "/intermediate_objects/")
if (!dir.exists(output_folder)) {
dir.create(output_folder)
} else {
unlink(output_folder, recursive = T)
dir.create(output_folder)
}
if (!dir.exists(temp_file_folder)) {
dir.create(temp_file_folder)
dir.create(intermediate_file_folder)
} else {
unlink(temp_file_folder, recursive = T)
dir.create(temp_file_folder)
dir.create(intermediate_file_folder)
}
paths<-list.files(path=blastcmddb_directory, full.names=T)
cat("file list length is:", length(paths), sep="\n")
pathnames<-""
for (j in 1:length(paths)) {pathnames[j]<-basename(paths[j])} # return the folders that hold the sequences from samples that match each central region
names(paths)<-pathnames
##########################
#. external_func: a wrapper function to run the SynTracker function
# inputs:
# 1. paths: list of folder paths
# 2. path_names: the names of the elements in "paths"
# 3. metadata: the metadata file/argument indicating no file was provided
# 4. temp file folder
# 5. number of cores to use
# Correct: make metadata file optional. Only read if user spcify one. If no metadata file, just output a list of pairs
external_func<-function(paths, path_names, metadata, temp_file_folder, core_number, save_intermediate_objects, set_seed_arg) {
#create a temp folder
tmp_folder<-runif(1, 1000000000000, 9999999999999) %>% round %>% as.character %>% paste0(temp_file_folder,"/tmp", ., "/")
dir.create(tmp_folder)
cat("starting synteny analysis for:", path_names, sep = "\n" )
# first part: run the synteny_analysis function on each input file
filepaths <-list.files(path=paths, full.names=TRUE)
gene_names<-""
for (i in 1:length(filepaths)) {gene_names[i]<-basename(filepaths[i])} # extract the file from the path
gene_names<-file_path_sans_ext(gene_names) #remove file extension
objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
names(objects)<-gene_names
bad_objects_elements <- sapply(objects, inherits, what = "try-error") #identify iterations of synteny_anlysis that failed for some reason.
objects<-objects[!bad_objects_elements] # and filter these elements out...
narrow<-Filter(function(x) nrow(x) > 1, objects) #filter elemnts with comparisons of less than 2 valid seqs, if this happened.
#print(length(objects))
rm(objects)
if(save_intermediate_objects == "--intermediate") {
saveRDS(narrow, file = paste0(intermediate_file_folder,"narrow.rds"))
}
# second part: Process synteny objects
dfs<-mcmapply(synteny_scores,narrow, SIMPLIFY = F, mc.preschedule=F,mc.cores=core_number)
bad_dfs_elements <- sapply(dfs, inherits, what = "try-error") #identify iterations of synteny scores that failed for some reason. Mostly (although very rare), those are two hits for the same region
dfs<-dfs[!bad_dfs_elements] # and filter these elements out...
if(save_intermediate_objects == "--intermediate") {
saveRDS(dfs, file = paste0(intermediate_file_folder,"dfs.rds"))
}
#third part: add names to each table in a new column, merge to one big dataframe, arrange it.
improved_dfs<-map2(dfs, names(dfs), add_names)
big_dfs<-bind_rows(improved_dfs) # bind to one dataframe
# change order of sample1 and sample2, according to some rules, so that the order will be uniform throughout the table
# i.e. sampleX-sampleY will always be like that and not sampleY-sampleX ==> if the order is not uniform it will be treated as two different comparisons
big_dfs<-big_dfs %>% mutate(replaced = ifelse(sample2>sample1, "yes", "no")) # add a column specifing if the order of sample 1 and 2 should be replaced (for the sake of Grouping correctly in the next lines)
#change the order of sample specific fields
big_organized_dfs<-big_dfs %>%
mutate(temp=ifelse(replaced == "yes", as.character(sample1), "no need"), #if replaced == yes: temp will hold sample1
sample1 = ifelse(replaced == "yes", as.character(sample2), as.character(sample1)), #sample1 will hold sample2
sample2=ifelse(replaced == "yes", temp, as.character(sample2))) #sample2 will hold temp (the original sample1...)
big_organized_dfs<- big_organized_dfs %>% arrange(sample1,sample2,position_counter) #arrange the order of rows in the table: This is done as in different Operating systems the order is different => leads to different subsampling, even if set.seed() is the same
species_temp_folder=paste0(intermediate_file_folder,"/",path_names)
dir.create(species_temp_folder)
species_temp_big_table_path<-paste0(species_temp_folder,"/","big_tbl_rstudio.tab")
write.table(big_organized_dfs,file = species_temp_big_table_path, sep=",", row.names = FALSE) #this should be under a condition
##################
# filter and subsample regions
##################
# find the maximal number of regions/pairwise-comparison:
biggest_group<-max(big_organized_dfs %>%
group_by(sample1, sample2) %>%
mutate(regions = n()) %>%
ungroup() %>%
pull(regions))
# create a list of data frames, with different regions subsampling values (i.e., subsample x regions per pair-wise comparison)
# the conditons is used to avoid subsampling more regions than there are in the biggest group (could result in an error)
grouped_list<-list()
regions_sampled<-c(20,30,40,60,80,100,200)
#run the subsampling function for values below the maximal nuber of regions/pair:
for (i in 1:length(regions_sampled)) {
ifelse(biggest_group >= regions_sampled[i],
grouped_list[i]<-mapply(subsample_regions, list(big_organized_dfs), regions_sampled[i], SIMPLIFY = F), #add the set.seed argument to the function
next)
}
# give names to the elements in the newly filled list
for (i in 1:length(grouped_list)) {names(grouped_list)[i]<-paste0(path_names, "_",regions_sampled[i])}
#print("number of regions found:")
#print(names(grouped_list))
#print("length of grouped list:")
#print(length(grouped_list))
unlink(tmp_folder, recursive = T)
if (exists("metadata")) {
cat("adding metadata fields\n")
grouped_list<-mapply(add_metadata,grouped_list, MoreArgs = list(metadata), SIMPLIFY = FALSE)
#grouped_list<-mapply(add_metadata,grouped_list, metadata, SIMPLIFY = FALSE)
}
cat("Finished synteny analysis for:", path_names, sep = "\n" )
species_output<-paste(output_folder, path_names,"/", sep = "")
dir.create(species_output)
mapply(
write.table,
x=grouped_list, file=paste(species_output,names(grouped_list), ".txt", sep=""),
MoreArgs=list(row.names=FALSE, sep=",")
)
return(grouped_list)
}
#outlist<-list(grouped_list, grouped_list_figs, big_organized_dfs)
#names(outlist)<-c("pairwise comparison tables", "pairwise comparison figures", "Final table ungrouped")
#return(outlist)
real_tetsting<-mapply(external_func, paths, pathnames, MoreArgs = list(metadata, temp_file_folder, core_number, save_intermediate_objects, set_seed_arg), SIMPLIFY = F)
#write the output tables to files.
#dir.create("output_grrr")
#for(i in 1:length(real_tetsting)){
# dir.create(paste0("output_grrr","/",names(real_tetsting[i])))
# for(idx in 1:length(real_tetsting[[i]])){
# write.table(real_tetsting[[i]][[idx]],paste0("output_grrr/",names(real_tetsting[i]),"/",names(real_tetsting[[i]][idx]),".csv"), sep = "\t",row.names = FALSE)
# }
#}