sdm 2.18 (AVITI support), usearch v12 added, LULU fix, -platform AVIT…

…I added
hildebra · Aug 4, 2024 · 567eaba · 567eaba
1 parent 45f0a45
commit 567eaba
Show file tree

Hide file tree

Showing 8 changed files with 172 additions and 71 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 *.sh
+runs
+runs/*
 sqLOGs
 myTestRun
 myTestRun2
@@ -16,6 +18,7 @@ configs/sdm_miSeq_t.txt
 configs/sdm_PacBio_LSSU_ARC.txt
 debug.sh
 myTestRun2
+bin/usearch12_linux_beta
 bin/ncbi-blast-*
 bin/vxtr/vxtractor.pl
 bin/vxtr/

diff --git a/autoInstall.pl b/autoInstall.pl
@@ -158,48 +158,7 @@
 #die "@txt\n";
 
 
-if ($usearch_reset==1 && !$condaDBinstall){
-	print "\n\n#######################################################################################";
-	print "\nUSEARCH ver 7, 8, 9 or 10 has to be installed manually (due to licensing). Please download & install from http://www.drive5.com/usearch/download.html  \n";
-
-    # Check if usearch is available in path and store the absolute path in $usearch_path
-    my $usearch_path = `which usearch`;
-    chomp($usearch_path);
-
-    if ($usearch_path eq "") {
-		print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
-
-		my $inu = "";
-		while ( 1){
-			$inu = <>; chomp $inu;
-			if ($inu eq "0"){$inu="";last;
-			} elsif (!-f $inu){print "Not a valid file! try again:\n";
-			} else {last;}
-		}
-		$usearch_path = $inu;
-    } else {
-    	print "USEARCH was found at: \n";
-    	print $usearch_path;
-    	print "\nIf this is not the right USEARCH version you can change it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2.\n\n";
-    }
-
-	# print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
-	# #print "Once downloaded, rename the binary userachXXX to usearch_bin, make it executable (chmod +x usearch_bin) and copy/link it to this directory:\n$bdir \n";
-	# #print "\nLotuS is almost ready to run (usearch).\n\n";
-	# my $inu = "";
-	# while ( 1){
-	# 	$inu = <>; chomp $inu;
-	# 	if ($inu eq "0"){$inu="";last;
-	# 	} elsif (!-f $inu){print "Not a valid file! try again:\n";
-	# 	} else {last;}
-	# }
-	@txt = addInfoLtS("usearch",$usearch_path,\@txt,1) if ($usearch_path ne "");
-
-} else {
-	print "Found valid usearch bin at $uspath\n";
-	print "\nLotuS is ready to run.\n\n";
-
-}
+getUsearch();
 
 
 ######## get programs ####################
@@ -1182,7 +1141,58 @@ sub get_DBs{
 
 
 }
+
+
+sub getUsearch{
+	if (1){
+		print "Downloading usearch v 12 for sequence clustering and tax annotations..\n";
+		my $tarUTN = "$bdir/usearch12_linux_beta";
+		getS2("https://github.com/rcedgar/usearch12/releases/download/v12.0-beta1/usearch_linux_x86_12.0-beta",$tarUTN);
+		@txt = addInfoLtS("usearch",$tarUTN,\@txt,1);
+
+	}elsif ($usearch_reset==1 && !$condaDBinstall){ #old install for previous usearch versions..
+		print "\n\n#######################################################################################";
+		print "\nUSEARCH ver 7, 8, 9 or 10 has to be installed manually (due to licensing). Please download & install from http://www.drive5.com/usearch/download.html  \n";
+
+		# Check if usearch is available in path and store the absolute path in $usearch_path
+		my $usearch_path = `which usearch`;
+		chomp($usearch_path);
+
+		if ($usearch_path eq "") {
+			print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
+
+			my $inu = "";
+			while ( 1){
+				$inu = <>; chomp $inu;
+				if ($inu eq "0"){$inu="";last;
+				} elsif (!-f $inu){print "Not a valid file! try again:\n";
+				} else {last;}
+			}
+			$usearch_path = $inu;
+		} else {
+			print "USEARCH was found at: \n";
+			print $usearch_path;
+			print "\nIf this is not the right USEARCH version you can change it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2.\n\n";
+		}
 
+		# print "USEARCH was not found on this system. Please enter the absolute path to usearch below.\nOr continue by entering \"0\" \n (you will have to add it later via \"./autoInstall.pl -link_usearch [path to usearch] or use -link_usearch [path to usearch] as an option when running LotuS2 )\"\n\nAnswer:";
+		# #print "Once downloaded, rename the binary userachXXX to usearch_bin, make it executable (chmod +x usearch_bin) and copy/link it to this directory:\n$bdir \n";
+		# #print "\nLotuS is almost ready to run (usearch).\n\n";
+		# my $inu = "";
+		# while ( 1){
+		# 	$inu = <>; chomp $inu;
+		# 	if ($inu eq "0"){$inu="";last;
+		# 	} elsif (!-f $inu){print "Not a valid file! try again:\n";
+		# 	} else {last;}
+		# }
+		@txt = addInfoLtS("usearch",$usearch_path,\@txt,1) if ($usearch_path ne "");
+
+	} else {
+		print "Found valid usearch bin at $uspath\n";
+		print "\nLotuS is ready to run.\n\n";
+
+	}
+}
 
 
 sub get_programs{
@@ -1614,11 +1624,11 @@ ()
 	print "Answer:";
 	while (<>){
 		chomp($_); 
-		if ($_ == 1 ||$_ == 4 || $_ == 3 ||$_ == 2 ||$_ == 5 ||$_ == 0 ||$_ == 8){
+		if (defined($_) && $_ ne "" && ($_ == 1 ||$_ == 4 || $_ == 3 ||$_ == 2 ||$_ == 5 ||$_ == 0 ||$_ == 8)){
 			$refDBinstall[8] = 0;
 			$refDBinstall[$_] = 1;
 			last;
-		}
+		} else { print "Invalid answer, aborting.."; exit(); }
 	}
 	#SILVA license
 	if (!$skipAll && ($refDBinstall[2] || $refDBinstall[8])){

diff --git a/bin/R/LULU.R b/bin/R/LULU.R
@@ -83,7 +83,7 @@ lulu = function (otutable, matchlist, minimum_ratio_type = "min", minimum_ratio
     curate_index <- row.names(statistics_table) == statistics_table$parent_id
     statistics_table$curated[curate_index] <- "parent"
     statistics_table <- transform(statistics_table, rank = ave(total, FUN = function(x) rank(-x, ties.method = "first")))
-    curation_table <- as.data.frame(curation_table %>% group_by(nOTUid) %>% summarise_all(funs(sum)))
+    curation_table <- as.data.frame(curation_table %>% group_by(nOTUid) %>% summarise_all(list(sum)))
     row.names(curation_table) <- as.character(curation_table$nOTUid)
     curation_table <- curation_table[, -1]
     curated_otus <- names(table(statistics_table$parent_id))

diff --git a/bin/sdm b/bin/sdm
diff --git a/configs/sdm_AVITI.txt b/configs/sdm_AVITI.txt
@@ -0,0 +1,71 @@
+#sdm options file to control sequence quality filtering, demultiplexing and preparation (can also be used without demultiplexing)
+#* indicates alternative quality filtering options, saved in *.add.fna etc. files separately from initial quality filtered dataset
+#sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information
+#options with a star in front are lenient parameters for mid qual sequences (only used for estimating OTU abundance, not for OTU building itself).
+minSeqLength	200
+maxSeqLength	1000
+minAvgQuality	27
+*minSeqLength	200
+*minAvgQuality	20
+#truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
+TruncateSequenceLength	200
+
+#Ambiguous bases in Sequence
+maxAmbiguousNT	0
+*maxAmbiguousNT	1
+
+#sequence is discarded if a homonucleotide run in sequence is longer
+maxHomonucleotide	8
+
+#Filter whole sequence if one window of quality scores is below average
+QualWindowWidth	50
+QualWindowThreshhold	25
+
+#Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
+TrimWindowWidth	20
+TrimWindowThreshhold	25
+
+#Probabilistic max number of accumulated sequencing errors. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.
+maxAccumulatedError	0.95
+*maxAccumulatedError	-1
+#Binomial error model of expected errors per sequence (see https://github.com/fpusan/moira), to deactivate, set BinErrorModelAlpha to -1
+BinErrorModelMaxExpError	2.7
+BinErrorModelAlpha	0.005
+
+
+#Max Barcode Errors 
+maxBarcodeErrs	0
+maxPrimerErrs	0
+
+#keep Barcode / Primer Sequence in the output fasta file - in a normal 16S analysis this should be deactivated (0) for Barcode and de-activated (0) for primer
+keepBarcodeSeq	0
+keepPrimerSeq	0
+
+
+#set fastqVersion to 1 if you use Sanger, Illumina 1.8+ or NCBI SRA files. Set fastqVersion to 2, if you use Illumina 1.3+ - 1.7+ or Solexa fastq files.  "auto" will look for typical characteristics of either of these and choose the quality offset score automatically.
+fastqVersion	auto
+
+#if one or more files have a technical adapter still included (e.g. TCAG 454) this can be removed by setting this option
+TechnicalAdapter	
+
+#delete X NTs (e.g. if the first 5 bases are known to have strange biases)
+TrimStartNTs	0
+
+#correct PE header format (1/2) this is to accomodate the illumina miSeq paired end annotations 2="@XXX 1:0:4" insteand of 1="@XXX/1". Note that the format will be automatically detected
+PEheaderPairFmt	1
+
+#sets if sequences without match to reverse primer (ReversePrimer) will be accepted (T=reject ; F=accept all); default=F
+RejectSeqWithoutRevPrim	T
+#*RejectSeqWithoutRevPrim	F
+#sets if sequences without a forward (LinkerPrimerSequence) primer will be accepted (T=reject ; F=accept all); default=F
+RejectSeqWithoutFwdPrim	T
+#*RejectSeqWithoutFwdPrim	F
+
+#this option should be "T" if your amplicons are possibly shorter than a single read in a paired end sequencing run (e.g. if the 16S amplicon length is 200bp in a 250x2 miSeq run, set this to "T"). This option increases runtime by 10%, if in doubt just set to "T". *Requires* LinkerPrimerSequence and ReversePrimer to be defined in mapping file.
+AmpliconShortPE	F
+
+#options for difficulties during sequencing library construction
+#checks if pair1 and pair2 were switched (ignore if single read data)
+CheckForMixedPairs	T
+#checks if whole amplicon was reverse-transcribed sequenced (not switched, just reverse translated)
+CheckForReversedSeqs	T
diff --git a/configs/sdm_PacBio_ITS.txt b/configs/sdm_PacBio_ITS.txt
@@ -4,10 +4,10 @@
 minSeqLength	300
 maxSeqLength	2000
 minAvgQuality	27
-*minSeqLength	200
-*minAvgQuality	20
+*minSeqLength	300
+*minAvgQuality	25
 #truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
-TruncateSequenceLength	1500
+TruncateSequenceLength	-1
 
 #Ambiguous bases in Sequence
 maxAmbiguousNT	2
@@ -22,7 +22,7 @@ QualWindowThreshhold	-1
 
 #Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
 TrimWindowWidth	20
-TrimWindowThreshhold	25
+TrimWindowThreshhold	-1
 
 #Probabilistic max number of accumulated sequencing errors. After this length, the rest of the sequence will be deleted. Complimentary to TrimWindowThreshhold. (-1) deactivates this option.
 maxAccumulatedError	-1

diff --git a/configs/sdm_PacBio_LSSU.txt b/configs/sdm_PacBio_LSSU.txt
@@ -1,24 +1,24 @@
 #sdm options file to control sequence quality filtering, demultiplexing and preparation (can also be used without demultiplexing)
 #* indicates alternative quality filtering options, saved in *.add.fna etc. files separately from initial quality filtered dataset
 #sequence length refers to sequence length AFTER removal of Primers, Barcodes and trimming. this ensures that downstream analyis tools will have appropiate sequence information
-minSeqLength	700
-maxSeqLength	2000
+minSeqLength	900
+maxSeqLength	2300
 minAvgQuality	27
 *minSeqLength	500
 *minAvgQuality	20
 #truncate total Sequence length to X (length after Barcode, Adapter and Primer removals, set to -1 to deactivate)
 TruncateSequenceLength	-1
 
 #Ambiguous bases in Sequence
-maxAmbiguousNT	3
+maxAmbiguousNT	2
 *maxAmbiguousNT	3
 
 #sequence is discarded if a homonucleotide run in sequence is longer
 maxHomonucleotide	16
 
 #Filter whole sequence if one window of quality scores is below average
 QualWindowWidth	50
-QualWindowThreshhold	25
+QualWindowThreshhold	-1
 
 #Trim the end of a sequence if a window falls below quality threshhold. Useful for removing low qulaity trailing ends of sequence
 TrimWindowWidth	20