Merge branch 'release/2.1.0'

Improved handling of variants immediatly around the start and stop codons
cancerit · Apr 10, 2015 · 9ebab4b · 9ebab4b
2 parents 11db6aa + 9b84208
commit 9ebab4b
Show file tree

Hide file tree

Showing 14 changed files with 1,693 additions and 54 deletions.
diff --git a/bin/AnnotateVcf.pl b/bin/AnnotateVcf.pl
@@ -33,6 +33,8 @@
 use Data::Dumper;
 
 use List::Util qw(first);
+use File::Temp qw(tempfile);
+use Try::Tiny qw(try catch);
 
 use FindBin qw($Bin);
 use lib "$Bin/../lib";
@@ -76,6 +78,10 @@
 const my $REPRE_BM => Sanger::CGP::Vagrent::Bookmarkers::RepresentativeTranscriptBookmarker->new();
 const my $WORST_BM => Sanger::CGP::Vagrent::Bookmarkers::MostDeleteriousBookmarker->new();
 
+const my $SORT_CMD => 'cat %s | vcf-sort > %s';
+const my $BGZIP_CMD => 'bgzip %s';
+const my $TABIX_CMB => 'tabix -p vcf %s';
+
 
 my $header_already_parsed = 0;
 
@@ -86,12 +92,22 @@
   unless(defined $options->{'species'} && defined $options->{'assembly'}) {
     croak 'unable to determine species and assembly from VCF file, please specify on command line' unless find_species_in_vcf($vcf_in,$options);
   }
-  open my $OUT_FH, '>', $options->{'output'} or croak 'Failed to create: '.$options->{'output'};
+  my $output = $options->{'output'};
+  if($options->{'tabix'}){
+    (undef,$output) = tempfile('vagrentXXXXXXX', OPEN => 0, SUFFIX => '.vcf');
+  }
+
+  open my $OUT_FH, '>', $output or croak 'Failed to create: '.$output;
   my $annotator = get_annotator($options);
 
   process_data($vcf_in,$OUT_FH,$annotator,$options);
-  close $OUT_FH or croak 'Failed to close: '.$options->{'output'};
-  Vcf::validate($options->{'output'});
+  close $OUT_FH or croak 'Failed to close: '.$output;
+  Vcf::validate($output);
+
+  if($options->{'tabix'}){
+    compressAndIndex($options,$output);
+  }
+
   1;
 } or do {
   warn "EVAL_ERROR: $EVAL_ERROR\n" if($EVAL_ERROR);
@@ -100,6 +116,44 @@
   croak 'A problem occurred';
 };
 
+sub compressAndIndex {
+  my ($options, $tmpfile) = @_;
+
+  my $sort_cmd = sprintf $SORT_CMD, $tmpfile, $options->{'output'};
+  my $bgzip_cmd = sprintf $BGZIP_CMD, $options->{'output'};
+  my $totabix = $options->{'output'} .'.gz';
+  my $tabix_cmd = sprintf $TABIX_CMB, $totabix;
+
+  try {
+    my $tabix_in = $options->{'input'}.'.tbx';
+    unless(-e $tabix_in){
+      # If the input has a tabix index it must have already been sorted, 
+      # we haven't changed the order of the file so we can skip this sort
+      system($sort_cmd);
+    }
+
+  } catch {
+    warn "EXECUTION ERROR: $sort_cmd\n";
+    die $_;
+  };
+
+  try {
+    system($bgzip_cmd);
+  } catch {
+    warn "EXECUTION ERROR: $bgzip_cmd\n";
+    die $_;
+  };
+
+  try {
+    system($tabix_cmd);
+  } catch {
+    warn "EXECUTION ERROR: $tabix_cmd\n";
+    die $_;
+  };
+
+  unlink $tmpfile;
+}
+
 sub process_data {
   my ($in,$out,$anno,$opts) = @_;
   print $out generate_header($in,$opts);
@@ -407,6 +461,7 @@ sub option_builder {
     'i|input=s' => \$opts{'input'},
     'o|output=s' => \$opts{'output'},
     'c|cache=s' => \$opts{'cache'},
+    't|tabix' => \$opts{'tabix'},
     'p|process=n' => \$opts{'process'},
     'sp|species=s' => \$opts{'species'},
     'as|assembly=s' => \$opts{'assembly'},
@@ -443,15 +498,15 @@ =head1 NAME
 
 =head1 SYNOPSIS
 
-AnnotateVcf.pl [-h] -i <IN_FILE> -o <OUT_FILE> -c <VAGRENT_CACHE_FILE>
+AnnotateVcf.pl [-h] [-t] -i <IN_FILE> -o <OUT_FILE> -c <VAGRENT_CACHE_FILE> [-sp <SPECIES> -as <GENOME_VERSON>]
 
   General Options:
 
     --help      (-h)      Brief documentation
 
     --input     (-i)      Input vcf file (expects *.bgz)
 
-    --output    (-o)      Output vcf
+    --output    (-o)      Output vcf file (plain text, add -t for zip and index)
 
     --cache     (-c)      Vagrent reference data cache file
 
@@ -467,4 +522,6 @@ =head1 SYNOPSIS
 
     --process   (-p)      ID_PROCESS that generated this file
 
+    --tabix     (-t)      bgzip and tabix index the output file (will generate the .gz version of the -o option)
+
 =cut
diff --git a/docs.tar.gz b/docs.tar.gz
diff --git a/lib/Sanger/CGP/Vagrent.pm b/lib/Sanger/CGP/Vagrent.pm
@@ -26,7 +26,7 @@ use strict;
 use Const::Fast qw(const);
 
 use base 'Exporter';
-our $VERSION = '2.0';
+our $VERSION = '2.1.0';
 our @EXPORT = qw($VERSION);
 
 1;
diff --git a/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm b/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm
@@ -462,18 +462,19 @@ sub _buildProteinAnnotation {
  		# something has gone wrong
  		return undef;
  	}
+
 	my $mtDna = $self->_getMutatedCdsSequence($wtDna,$cdsMinPos,$cdsMaxPos,$cAnnot->getMt());
 	my $mtProt = Bio::Seq->new(-seq => $prePad . $mtDna . $postPad)->translate->seq(); # mutant protein sequence
 	my $maxMtProt = Bio::Seq->new(-seq => $prePad . $mtDna . substr($tran->getcDNASeq,$tran->getCdsMaxPos()))->translate->seq(); # maximised protein sequence, overruns the natural stop and translates to the end of the transcript
 	if($wtProt eq $mtProt){
 		# wt and mt protein sequences are the same, its silent
 		$mutProtMin = ceil(($cAnnot->getMinPos / 3));
-  		$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
-  		$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
-  	  	$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
-  	  	if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
-			$desc = 'p.'.$wt.$mutProtMin.$mt;
-  	  	} else {
+  	$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
+  	$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
+  	$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
+  	if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
+		  $desc = 'p.'.$wt.$mutProtMin.$mt;
+  	} else {
 			$desc = 'p.(=)';
 		}
 		$type = $self->_getDefaultProteinAnnotationType();
@@ -495,10 +496,10 @@ sub _buildProteinAnnotation {
  		if($mutProtMin == 1){
  	  		# its frame shifted the start codon, no idea what this is going to cause.
  	  		push(@classes,$self->getStartLostVariantClass);
- 	  		return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
- 	    }
+	  		return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
+    }
  		$type = Sanger::CGP::Vagrent::Data::Annotation::getFrameShiftAnnotationType();
- 	    push(@classes,$self->getFrameShiftVariantClass);
+    push(@classes,$self->getFrameShiftVariantClass);
 	} else {
 		$wt = $wtProt;
 		$mt = $mtProt;
@@ -512,7 +513,7 @@ sub _buildProteinAnnotation {
 			substr($wt,-1,1,'');
 			substr($mt,-1,1,'');
 		}
-
+    
 		#warn "|$wt| to |$mt|\n";
 		if($wt ne ''){
 			# wild type residue has been changed
@@ -617,8 +618,6 @@ sub _buildProteinAnnotation {
 															subtype => $subtype);
   	$anno->addClassification(@classes);
   	return $anno;
-
-	return undef;
 }
 
 sub _getMutatedCdsSequence: Abstract;
@@ -637,7 +636,6 @@ sub _buildCDSAnnotation {
 		return $self->_buildUnknownCDSAnnotation($var,$tran,$rAnnot,@classes);
 	}
 	my ($cdsMin,$cdsMinOffset,$cdsMax,$cdsMaxOffset) = (undef,undef,undef,undef);
-
 	if($rAnnot->getMinPos < $tran->getCdsMinPos){
 		$cdsMin = 1;
 		$cdsMinOffset = 0;
@@ -668,6 +666,8 @@ sub _buildCDSAnnotation {
 		$cdsMaxOffset = $rAnnot->getMaxOffset();
   }
 
+  print "CDS: $cdsMin , $cdsMinOffset - $cdsMax, $cdsMaxOffset\n" if $self->_debug();
+
 	my $wt = $self->_getWildTypeStringForCDSAnno($var,$tran,$rAnnot);
 	my $mt = $self->_getMutantStringForCDSAnno($var,$tran,$rAnnot);
 	my $desc = $self->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt);
@@ -959,21 +959,6 @@ sub _coversStopCodon {
       return 1;
     }
   }
-
-
-
-# 	if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
-# 		if($anno->getMinPos <= $tran->getCdsMaxPos && $anno->getMaxPos >= $tran->getCdsMaxPos - 2){
-# 			return 1;
-# 		}
-# 	} elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
-# 		if($anno->getMinPos <= $tran->getCdsLength && $anno->getMaxPos >= $tran->getCdsLength - 2){
-# 			return 1;
-# 		}
-# 	} else {
-# 		# don't know, assume no
-# 		return 0;
-# 	}
 	return 0;
 }
 
@@ -1025,10 +1010,25 @@ sub _canAnnotateToCDS {
 		if($anno->hasClassification($self->getInsertionClass)){
 			# insertions are a special case.
 			# Coordinates are the last WT positions, and not the first variant ones like everything else
-			if($anno->getMaxPos <= $tran->getCdsMinPos || $anno->getMinPos >= $tran->getCdsMaxPos){
-				# its outside the CDS
-				return 0;
-			}
+
+      print 'ANNO POS: '.$anno->getMinPos.' , '.$anno->getMinOffset.' - '.$anno->getMaxPos.' , '.$anno->getMaxOffset."\n" if $self->_debug();
+      print 'CDS POS: '.$tran->getCdsMinPos.' , '.$tran->getCdsMaxPos."\n" if $self->_debug();
+
+      if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
+        # ends before CDS or starts afterwards
+        return 0;
+      } elsif($anno->getMaxPos == $tran->getCdsMinPos) {
+        # potential start codon issues
+        if($anno->getMinPos == $anno->getMaxPos && $anno->getMinPos == $tran->getCdsMinPos && abs($anno->getMinOffset) + abs($anno->getMaxOffset) > 0){
+          # probably start coordinate issues
+          unless($anno->getMaxOffset <= 0 && $self->_isIntronicOffsetDistance($anno->getMaxOffset) == 0){
+            # or not
+            return 0;
+          }
+        } else {
+          return 0;
+        }
+      }
 		} else {
 			if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
 				# its outside the CDS
@@ -1050,6 +1050,9 @@ sub _canAnnotateToCDS {
 			return 0;
 		} elsif($anno->hasClassification($self->getUnknownVariantClass)){
 			return 0;
+    } elsif($anno->hasClassification($self->getInsertionClass) && $anno->hasClassification($self->get5PrimeUtrVariantClass)){
+      # odd case, insertions close to the start codons can be described on the CDS even though they don't change it. 
+      return 1;
 		} else {
 			my $msg = "Unable to calculate CDS relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications);
 			$self->addMessage($msg);
@@ -1066,6 +1069,7 @@ sub _canAnnotateToCDS {
 
 sub _canAnnotateToProtein {
 	my ($self,$tran,$anno) = @_;
+
 	unless($tran->isProteinCoding){
 		# if the transcript isn't protein coding it can't be a coding change
 		return 0;

diff --git a/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm b/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm
@@ -190,18 +190,14 @@ sub _buildRNAAnnotation {
 	}
 
 	if($tran->isProteinCoding){
-    #print "HERE\n";
     if(($pos > $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset >= 0)) && 
        ($pos < $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset <= 0))){
-#		if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){
 			# coding change
 			push(@groupClasses,$self->getCDSClass);
 		} elsif($pos < $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset < 0)){
-#    } elsif($pos < $tran->getCdsMinPos){
 			# 5prime UTR
 			push(@groupClasses,$self->get5PrimeUtrClass);
 		} elsif($pos > $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset > 0)){
-#    } elsif($pos > $tran->getCdsMaxPos){
 			# 3prime UTR
 			push(@groupClasses,$self->get3PrimeUtrClass);
 		} else {

diff --git a/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm b/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm
@@ -100,14 +100,15 @@ const my $SO_NON_PROTEIN_CODING_CLASS => 'SO:0000011:non_protein_coding';
 
 const my $TERM_SUMMARY_INI => 'SequenceOntologySummary.ini';
 
-#sub DESTROY {
-#  my $self = shift;
-#  if(defined $self->{'_SOsum'}){
-#    foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
-#      print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 1;
-#    }
-#  }
-#}
+# sub DESTROY {
+# ##### Handy DESTROY function that will print ontology combinations that don't exist in the summary lookup at program termination.
+#   my $self = shift;
+#   if(defined $self->{'_SOsum'}){
+#     foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
+#       print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 0;
+#     }
+#   }
+# }
 
 sub _ontologyInit {
   my $self = shift;