Skip to content

Commit

Permalink
Updated code, to improve reproducibility over different operating sys…
Browse files Browse the repository at this point in the history
…tems
  • Loading branch information
henav committed Apr 20, 2023
1 parent 7547c22 commit 3e752b6
Show file tree
Hide file tree
Showing 6 changed files with 6 additions and 307,399 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified Sample_input/.DS_Store
Binary file not shown.
9 changes: 6 additions & 3 deletions SynTracker.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,10 @@ external_func<-function(paths, path_names, metadata, temp_file_folder, core_numb
for (i in 1:length(filepaths)) {gene_names[i]<-basename(filepaths[i])} # extract the file from the path
gene_names<-file_path_sans_ext(gene_names) #remove file extension

objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=T, mc.cores=core_number) #change number of cores if needed
objects<-mcmapply(synteny_analysis, filepaths, gene_names, tmp_folder, SIMPLIFY = F, mc.preschedule=F, mc.cores=core_number)
names(objects)<-gene_names
bad_objects_elements <- sapply(objects, inherits, what = "try-error") #identify iterations of synteny_anlysis that failed for some reason.

objects<-objects[!bad_objects_elements] # and filter these elements out...
narrow<-Filter(function(x) nrow(x) > 1, objects) #filter elemnts with comparisons of less than 2 valid seqs, if this happened.
#print(length(objects))
Expand All @@ -103,7 +104,7 @@ external_func<-function(paths, path_names, metadata, temp_file_folder, core_numb
}

# second part: Process synteny objects
dfs<-mcmapply(synteny_scores,narrow, SIMPLIFY = F, mc.preschedule=F,mc.cores=core_number) #change number of cores if needed
dfs<-mcmapply(synteny_scores,narrow, SIMPLIFY = F, mc.preschedule=F,mc.cores=core_number)
bad_dfs_elements <- sapply(dfs, inherits, what = "try-error") #identify iterations of synteny scores that failed for some reason. Mostly (although very rare), those are two hits for the same region
dfs<-dfs[!bad_dfs_elements] # and filter these elements out...

Expand All @@ -118,11 +119,13 @@ external_func<-function(paths, path_names, metadata, temp_file_folder, core_numb
# i.e. sampleX-sampleY will always be like that and not sampleY-sampleX ==> if the order is not uniform it will be treated as two different comparisons
big_dfs<-big_dfs %>% mutate(replaced = ifelse(sample2>sample1, "yes", "no")) # add a column specifing if the order of sample 1 and 2 should be replaced (for the sake of Grouping correctly in the next lines)

#change the order of sample-specific fields: YOU SHOULD modify them if your metadata file contains a different number of fields
#change the order of sample specific fields
big_organized_dfs<-big_dfs %>%
mutate(temp=ifelse(replaced == "yes", as.character(sample1), "no need"), #if replaced == yes: temp will hold sample1
sample1 = ifelse(replaced == "yes", as.character(sample2), as.character(sample1)), #sample1 will hold sample2
sample2=ifelse(replaced == "yes", temp, as.character(sample2))) #sample2 will hold temp (the original sample1...)

big_organized_dfs<- big_organized_dfs %>% arrange(sample1,sample2,position_counter) #arrange the order of rows in the table: This is done as in different Operating systems the order is different => leads to different subsampling, even if set.seed() is the same

species_temp_folder=paste0(intermediate_file_folder,"/",path_names)
dir.create(species_temp_folder)
Expand Down
13 changes: 0 additions & 13 deletions SynTracker_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,6 @@ synteny_scores<- function(synteny_object) {
"length2" = integer() ,
"overlap" = integer(), #accomulative length of overlapping regions
"Blocks" = integer(), # number of synteny blocks per pairwise comparison
#"GroupA1" = character(),
#"GroupA2" = character(),
#"GroupB1" = character(),
#"GroupB2" = character(),
#"GroupC1" = character(),
#"GroupC2" = character(),
#"GroupD1" = character(),
#"GroupD2" = character(),
#"is.same.GroupA" = logical(),
#"is.same.GroupB" = logical(),
#"is.same.GroupC" = logical(),
#"is.same.GroupD" = logical(),
"synteny_score" = integer(), stringsAsFactors = FALSE)

# for each pair of samples, create the values to be kept in the dataframe (assigned to one row).
Expand Down Expand Up @@ -143,7 +131,6 @@ subsample_regions<-function(big_organized_dfs,subsampling_value) {
set.seed(1) # user should decide whether set.seed should be commented or not: depends if you want to get repreducible subsampling or different in any run
newdf<-big_organized_dfs %>%
# pay attention to the grouping variables below - they should match the groups specified in the "synteny_scores" function
#group_by(sample1, sample2, GroupA1, GroupA2, GroupB1, GroupB2, GroupC1, GroupC2, GroupD1, GroupD2, is.same.GroupA, is.same.GroupB, is.same.GroupC, is.same.GroupD) %>%
group_by(sample1, sample2) %>%
filter(n() > subsampling_value-1) %>%
sample_n(subsampling_value) %>% #subsample "subsampling_value" regions from each group
Expand Down
130 changes: 0 additions & 130 deletions find_overlapping_regions copy.sh

This file was deleted.

Loading

0 comments on commit 3e752b6

Please sign in to comment.