Skip to content

Commit

Permalink
sdm 2.18 (AVITI support), usearch v12 added, LULU fix, -platform AVIT…
Browse files Browse the repository at this point in the history
…I added
  • Loading branch information
hifa committed Aug 4, 2024
1 parent 45f0a45 commit 567eaba
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 71 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
*.sh
runs
runs/*
sqLOGs
myTestRun
myTestRun2
Expand All @@ -16,6 +18,7 @@ configs/sdm_miSeq_t.txt
configs/sdm_PacBio_LSSU_ARC.txt
debug.sh
myTestRun2
bin/usearch12_linux_beta
bin/ncbi-blast-*
bin/vxtr/vxtractor.pl
bin/vxtr/
Expand Down
98 changes: 54 additions & 44 deletions autoInstall.pl
Original file line number Diff line number Diff line change
Expand Up @@ -158,48 +158,7 @@
#die "@txt\n";


if ($usearch_reset==1 && !$condaDBinstall){
print "\n\n#######################################################################################";
print "\nUSEARCH ver 7, 8, 9 or 10 has to be installed manually (due to licensing). Please download & install from http://www.drive5.com/usearch/download.html \n";

# Check if usearch is available in path and store the absolute path in $usearch_path
my $usearch_path = `which usearch`;
chomp($usearch_path);

if ($usearch_path eq "") {
print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";

my $inu = "";
while ( 1){
$inu = <>; chomp $inu;
if ($inu eq "0"){$inu="";last;
} elsif (!-f $inu){print "Not a valid file! try again:\n";
} else {last;}
}
$usearch_path = $inu;
} else {
print "USEARCH was found at: \n";
print $usearch_path;
print "\nIf this is not the right USEARCH version you can change it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2.\n\n";
}

# print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
# #print "Once downloaded, rename the binary userachXXX to usearch_bin, make it executable (chmod +x usearch_bin) and copy/link it to this directory:\n$bdir \n";
# #print "\nLotuS is almost ready to run (usearch).\n\n";
# my $inu = "";
# while ( 1){
# $inu = <>; chomp $inu;
# if ($inu eq "0"){$inu="";last;
# } elsif (!-f $inu){print "Not a valid file! try again:\n";
# } else {last;}
# }
@txt = addInfoLtS("usearch",$usearch_path,\@txt,1) if ($usearch_path ne "");

} else {
print "Found valid usearch bin at $uspath\n";
print "\nLotuS is ready to run.\n\n";

}
getUsearch();


######## get programs ####################
Expand Down Expand Up @@ -1182,7 +1141,58 @@ sub get_DBs{


}


sub getUsearch{
if (1){
print "Downloading usearch v 12 for sequence clustering and tax annotations..\n";
my $tarUTN = "$bdir/usearch12_linux_beta";
getS2("https://github.com/rcedgar/usearch12/releases/download/v12.0-beta1/usearch_linux_x86_12.0-beta",$tarUTN);
@txt = addInfoLtS("usearch",$tarUTN,\@txt,1);

}elsif ($usearch_reset==1 && !$condaDBinstall){ #old install for previous usearch versions..
print "\n\n#######################################################################################";
print "\nUSEARCH ver 7, 8, 9 or 10 has to be installed manually (due to licensing). Please download & install from http://www.drive5.com/usearch/download.html \n";

# Check if usearch is available in path and store the absolute path in $usearch_path
my $usearch_path = `which usearch`;
chomp($usearch_path);

if ($usearch_path eq "") {
print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";

my $inu = "";
while ( 1){
$inu = <>; chomp $inu;
if ($inu eq "0"){$inu="";last;
} elsif (!-f $inu){print "Not a valid file! try again:\n";
} else {last;}
}
$usearch_path = $inu;
} else {
print "USEARCH was found at: \n";
print $usearch_path;
print "\nIf this is not the right USEARCH version you can change it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2.\n\n";
}

# print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
# #print "Once downloaded, rename the binary userachXXX to usearch_bin, make it executable (chmod +x usearch_bin) and copy/link it to this directory:\n$bdir \n";
# #print "\nLotuS is almost ready to run (usearch).\n\n";
# my $inu = "";
# while ( 1){
# $inu = <>; chomp $inu;
# if ($inu eq "0"){$inu="";last;
# } elsif (!-f $inu){print "Not a valid file! try again:\n";
# } else {last;}
# }
@txt = addInfoLtS("usearch",$usearch_path,\@txt,1) if ($usearch_path ne "");

} else {
print "Found valid usearch bin at $uspath\n";
print "\nLotuS is ready to run.\n\n";

}
}


sub get_programs{
Expand Down Expand Up @@ -1614,11 +1624,11 @@ ()
print "Answer:";
while (<>){
chomp($_);
if ($_ == 1 ||$_ == 4 || $_ == 3 ||$_ == 2 ||$_ == 5 ||$_ == 0 ||$_ == 8){
if (defined($_) && $_ ne "" && ($_ == 1 ||$_ == 4 || $_ == 3 ||$_ == 2 ||$_ == 5 ||$_ == 0 ||$_ == 8)){
$refDBinstall[8] = 0;
$refDBinstall[$_] = 1;
last;
}
} else { print "Invalid answer, aborting.."; exit(); }
}
#SILVA license
if (!$skipAll && ($refDBinstall[2] || $refDBinstall[8])){
Expand Down
2 changes: 1 addition & 1 deletion bin/R/LULU.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ lulu = function (otutable, matchlist, minimum_ratio_type = "min", minimum_ratio
curate_index <- row.names(statistics_table) == statistics_table$parent_id
statistics_table$curated[curate_index] <- "parent"
statistics_table <- transform(statistics_table, rank = ave(total, FUN = function(x) rank(-x, ties.method = "first")))
curation_table <- as.data.frame(curation_table %>% group_by(nOTUid) %>% summarise_all(funs(sum)))
curation_table <- as.data.frame(curation_table %>% group_by(nOTUid) %>% summarise_all(list(sum)))
row.names(curation_table) <- as.character(curation_table$nOTUid)
curation_table <- curation_table[, -1]
curated_otus <- names(table(statistics_table$parent_id))
Expand Down
Binary file modified bin/sdm
Binary file not shown.
71 changes: 71 additions & 0 deletions configs/sdm_AVITI.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#sdm options file to control sequence quality filtering, demultiplexing and preparation (can also be used without demultiplexing)
#* indicates alternative quality filtering options, saved in *.add.fna etc. files separately from initial quality filtered dataset
#sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information
#options with a star in front are lenient parameters for mid qual sequences (only used for estimating OTU abundance, not for OTU building itself).
minSeqLength 200
maxSeqLength 1000
minAvgQuality 27
*minSeqLength 200
*minAvgQuality 20
#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
TruncateSequenceLength 200

#Ambiguous bases in Sequence
maxAmbiguousNT 0
*maxAmbiguousNT 1

#sequence is discarded if a homonucleotide run in sequence is longer
maxHomonucleotide 8

#Filter whole sequence if one window of quality scores is below average
QualWindowWidth 50
QualWindowThreshhold 25

#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
TrimWindowWidth 20
TrimWindowThreshhold 25

#Probabilistic max number of accumulated sequencing errors. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.
maxAccumulatedError 0.95
*maxAccumulatedError -1
#Binomial error model of expected errors per sequence (see https://github.com/fpusan/moira), to deactivate, set BinErrorModelAlpha to -1
BinErrorModelMaxExpError 2.7
BinErrorModelAlpha 0.005


#Max Barcode Errors
maxBarcodeErrs 0
maxPrimerErrs 0

#keep Barcode / Primer Sequence in the output fasta file - in a normal 16S analysis this should be deactivated (0) for Barcode and de-activated (0) for primer
keepBarcodeSeq 0
keepPrimerSeq 0


#set fastqVersion to 1 if you use Sanger, Illumina 1.8+ or NCBI SRA files. Set fastqVersion to 2, if you use Illumina 1.3+ - 1.7+ or Solexa fastq files. "auto" will look for typical characteristics of either of these and choose the quality offset score automatically.
fastqVersion auto

#if one or more files have a technical adapter still included (e.g. TCAG 454) this can be removed by setting this option
TechnicalAdapter

#delete X NTs (e.g. if the first 5 bases are known to have strange biases)
TrimStartNTs 0

#correct PE header format (1/2) this is to accomodate the illumina miSeq paired end annotations 2="@XXX 1:0:4" insteand of 1="@XXX/1". Note that the format will be automatically detected
PEheaderPairFmt 1

#sets if sequences without match to reverse primer (ReversePrimer) will be accepted (T=reject ; F=accept all); default=F
RejectSeqWithoutRevPrim T
#*RejectSeqWithoutRevPrim F
#sets if sequences without a forward (LinkerPrimerSequence) primer will be accepted (T=reject ; F=accept all); default=F
RejectSeqWithoutFwdPrim T
#*RejectSeqWithoutFwdPrim F

#this option should be "T" if your amplicons are possibly shorter than a single read in a paired end sequencing run (e.g. if the 16S amplicon length is 200bp in a 250x2 miSeq run, set this to "T"). This option increases runtime by 10%, if in doubt just set to "T". *Requires* LinkerPrimerSequence and ReversePrimer to be defined in mapping file.
AmpliconShortPE F

#options for difficulties during sequencing library construction
#checks if pair1 and pair2 were switched (ignore if single read data)
CheckForMixedPairs T
#checks if whole amplicon was reverse-transcribed sequenced (not switched, just reverse translated)
CheckForReversedSeqs T
8 changes: 4 additions & 4 deletions configs/sdm_PacBio_ITS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
minSeqLength 300
maxSeqLength 2000
minAvgQuality 27
*minSeqLength 200
*minAvgQuality 20
*minSeqLength 300
*minAvgQuality 25
#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
TruncateSequenceLength 1500
TruncateSequenceLength -1

#Ambiguous bases in Sequence
maxAmbiguousNT 2
Expand All @@ -22,7 +22,7 @@ QualWindowThreshhold -1

#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
TrimWindowWidth 20
TrimWindowThreshhold 25
TrimWindowThreshhold -1

#Probabilistic max number of accumulated sequencing errors. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.
maxAccumulatedError -1
Expand Down
8 changes: 4 additions & 4 deletions configs/sdm_PacBio_LSSU.txt
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
#sdm options file to control sequence quality filtering, demultiplexing and preparation (can also be used without demultiplexing)
#* indicates alternative quality filtering options, saved in *.add.fna etc. files separately from initial quality filtered dataset
#sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information
minSeqLength 700
maxSeqLength 2000
minSeqLength 900
maxSeqLength 2300
minAvgQuality 27
*minSeqLength 500
*minAvgQuality 20
#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
TruncateSequenceLength -1

#Ambiguous bases in Sequence
maxAmbiguousNT 3
maxAmbiguousNT 2
*maxAmbiguousNT 3

#sequence is discarded if a homonucleotide run in sequence is longer
maxHomonucleotide 16

#Filter whole sequence if one window of quality scores is below average
QualWindowWidth 50
QualWindowThreshhold 25
QualWindowThreshhold -1

#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
TrimWindowWidth 20
Expand Down
Loading

0 comments on commit 567eaba

Please sign in to comment.