Merge pull request #32 from Eco-Flow/nf-test-dev

Nf test dev
Eco-Flow · Feb 19, 2024 · 87488d8 · 87488d8
2 parents aff25a3 + 4d38bd3
commit 87488d8
Show file tree

Hide file tree

Showing 190 changed files with 626,706 additions and 259,948 deletions.
diff --git a/.gitattributes b/.gitattributes
diff --git a/.github/workflows/test-pipeline.yml b/.github/workflows/test-pipeline.yml
@@ -6,6 +6,10 @@ on:
 env:
   NEXTFLOW_VERSION: 23.10.1
   NF_TEST_VERSION: 0.8.3
+  AWS_CLI_VERSION: 2.15.19
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PUBLIC_KEY }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY }}
+  AWS_DEFAULT_REGION: 'us-east-1'
 
 jobs:
 
@@ -31,6 +35,9 @@ jobs:
     - name: Install nf-test
       run: sudo bash; cd /opt; wget "https://github.com/askimed/nf-test/releases/download/v${NF_TEST_VERSION}/nf-test-${NF_TEST_VERSION}.tar.gz"; tar -xvf "nf-test-${NF_TEST_VERSION}.tar.gz"; chmod +x nf-test; rm "/opt/nf-test-${NF_TEST_VERSION}.tar.gz"
 
+    - name: Install aws cli
+      run: curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64-${AWS_CLI_VERSION}.zip" -o "awscliv2.zip"; unzip awscliv2.zip; sudo ./aws/install --update
+
     - name: Add software to path
       run: echo "/opt" >> $GITHUB_PATH;
 
@@ -43,6 +50,12 @@ jobs:
     - name: Run jcvi test
       run: nf-test test tests/modules/jcvi.nf.test
 
+    - name: Download synteny module test cds files
+      run:  aws s3 cp s3://synteny-test-data/data/synteny_input/Drosophila_santomea.cds data/synteny_input/Drosophila_santomea.cds; aws s3 cp s3://synteny-test-data/data/synteny_input/Drosophila_simulans.cds data/synteny_input/Drosophila_simulans.cds
+
+    - name: Run synteny test
+      run: nf-test test tests/modules/synteny.nf.test
+
     - name: Run chromopaint test
       run: nf-test test tests/modules/chromo.nf.test
 
@@ -55,5 +68,8 @@ jobs:
     - name: Run go test
       run: nf-test test tests/modules/go.nf.test
 
+    - name: Run go summarise test
+      run: nf-test test tests/modules/go_summarise.nf.test
+
     - name: Clean workspace after finish
       run: rm -rf $GITHUB_WORKSPACE/*
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,6 @@ Results
 Go
 .nextflow*
 .DS_Store
-.nf-test
-.nf-test.log
-test-results
+.nf-test*
+nextflow
+nf-test
diff --git a/.gitpod.yml b/.gitpod.yml
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -0,0 +1 @@
+repository_type: pipeline
diff --git a/README.md b/README.md
diff --git a/bin/Best_synteny_classifier_v6.classify.pl b/bin/Best_synteny_classifier_v6.classify.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl
+use warnings;
+use strict;
+use List::Util qw(max);
+use List::Util qw(min);
+
+# Classify the break junctions listed in every *Break_junction_information.txt
+# file of the current directory.
+#
+# For each <species1>.<species2> comparison the script reads
+#   <species1>.<species2>.Break_junction_information.txt  (junctions, header row first)
+#   <species1>.<species2>.Sp2_synteny_order.txt           (gene order of each block in species 2)
+# and writes
+#   <species1>.<species2>.Classification_summary.tsv      (one row per junction)
+#   Trans_Inversion_junction_count.txt                    (one row per comparison)
+# A short tally per comparison is also printed to standard output.
+
+# glob() avoids spawning a shell and yields an empty list when nothing matches.
+my @breaks = glob('*Break_junction_information.txt');
+
+# Summary of all runs: header now, one row per comparison at the end of each loop pass.
+my $outname = "Trans_Inversion_junction_count.txt";
+open(my $OUT, ">", $outname) or die "Could not open $outname: $!\n";
+print $OUT "Comparison\tTranslocation_junctions\tInversion_junctions\tSame_direction_duplication_junctions\tLoop_direction_duplication_junctions\n";
+
+foreach my $file (@breaks){
+	# Input names look like <species1>.<species2>.Break_junction_information.txt;
+	# the first two dot-separated fields identify the comparison.
+	my ($species1, $species2) = split(/\./, $file);
+	my $combName = "$species1.$species2";
+
+	# Per-comparison output: one classified junction per row.
+	my $outfile = "$combName.Classification_summary.tsv";
+	open(my $out, ">", $outfile) or die "Could not open $outfile: $!\n";
+
+	# Break-junction table for this comparison.
+	my $in_break_file = "$combName.Break_junction_information.txt";
+	open(my $in_break, "<", $in_break_file) or die "Could not open $in_break_file: $!\n";
+
+	# Gene order of each syntenic block in species 2.
+	my $in_sp2_order = "$combName.Sp2_synteny_order.txt";
+	open(my $in_order, "<", $in_sp2_order) or die "Could not open $in_sp2_order: $!\n";
+
+	# Lowest and highest position of every block, keyed by chromosome and block id.
+	# Order-file columns: chromosome, block id, comma-separated positions.
+	my %gene_order_max;
+	my %gene_order_min;
+	while ( my $line1 = <$in_order> ){
+		chomp $line1;
+		# Skip the header before parsing so it never reaches the lookup tables.
+		next if $line1 =~ m/Synteny_block_number/;
+		my ($chr, $syn, $list) = split("\t", $line1);
+		my @array_list = split(",", $list);
+		$gene_order_max{$chr}{$syn} = max(@array_list);
+		$gene_order_min{$chr}{$syn} = min(@array_list);
+	}
+
+	# Discard the header line of the break file.
+	<$in_break>;
+
+	# Junction tallies for this comparison.
+	my $odd_junction = 0;
+	my $same_direction = 0;
+	my $inversions = 0;
+	my $translocations = 0;
+
+	# Break-file columns: chromosome, the two block ids, start and end of block 1,
+	# start and end of block 2, and the junction type assigned upstream.
+	while ( my $line2 = <$in_break> ){
+		chomp $line2;
+		my ($chr, $syn1, $syn2, $sta_1, $end_1, $sta_2, $end_2, $type_original) = split("\t", $line2);
+
+		# Orientation of each block: 1 ascending, -1 descending, 0 if start equals end.
+		my $direction_1 = $end_1 <=> $sta_1;
+		my $direction_2 = $end_2 <=> $sta_2;
+
+		# Position range of block 1, reported next to the junction. (Scanning that range
+		# for genes of other blocks was tried and dropped: many blocks contain genes from
+		# other chromosomes or from elsewhere on the same one. Could be TEs, or indels?)
+		my $min_block1 = $gene_order_min{$chr}{$syn1};
+		my $max_block1 = $gene_order_max{$chr}{$syn1};
+
+		# Signed distance from the upper end of block 2 to the lower end of block 1.
+		# NOTE(review): the former "smaller of low1-high2 and high1-low2" always picked this
+		# term; it is negative whenever block 1 begins below the top of block 2 -- confirm
+		# that this is the intended gap.
+		my $gap = min($sta_1, $end_1) - max($sta_2, $end_2);
+
+		my $junction_type;
+		if ($type_original ne "INVER"){
+			# Not flagged INVER upstream: a translocation, for which no gap can be calculated.
+			$junction_type = "Translocation";
+			$gap = "NA";
+			$translocations++;
+		}
+		elsif ($direction_1 == 0 || $direction_2 == 0){
+			# One block starts and ends on the same gene: likely a duplicate.
+			$junction_type = "Odd-duplicate";
+			$odd_junction++;
+		}
+		elsif ($direction_1 == $direction_2){
+			# Both ascending or both descending. The descending pair used to be tested
+			# as 0/0, which the branch above already caught, so it was counted as an inversion.
+			$junction_type = "Same_direction";
+			$same_direction++;
+		}
+		else{
+			# Opposite orientations: an inversion.
+			$junction_type = "Inversion";
+			$inversions++;
+		}
+		print $out "$chr\t$syn1\t$syn2\t$min_block1\t$max_block1\t$sta_1\t$end_1\t$sta_2\t$end_2\t$gap\t$junction_type\n";
+	}
+
+	print $OUT "$combName\t$translocations\t$inversions\t$same_direction\t$odd_junction\n";
+	print "Translocations: $translocations\nInversions: $inversions\nSame_direction_likely_duplicate: $same_direction\nOdd_direction_likely_duplicates : $odd_junction   \n\#(likely duplicated, one of the syntenic block starts and ends with the same gene)\nSame_direction $same_direction    \n\#Gene order in same direction (so not an inversion [or translocation]), could be caused by duplications\n\n";
+	close $out or die "Could not close $outfile: $!\n";
+	close $in_break;
+	close $in_order;
+}
+
+# Close the summary explicitly so a failed buffered write is reported.
+close $OUT or die "Could not close $outname: $!\n";