Skip to content

Commit

Permalink
Merge branch 'release/2.1.0'
Browse files Browse the repository at this point in the history
Improved handling of variants immediatly around the start and stop codons
  • Loading branch information
AndyMenzies committed Apr 10, 2015
2 parents 11db6aa + 9b84208 commit 9ebab4b
Show file tree
Hide file tree
Showing 14 changed files with 1,693 additions and 54 deletions.
67 changes: 62 additions & 5 deletions bin/AnnotateVcf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
use Data::Dumper;

use List::Util qw(first);
use File::Temp qw(tempfile);
use Try::Tiny qw(try catch);

use FindBin qw($Bin);
use lib "$Bin/../lib";
Expand Down Expand Up @@ -76,6 +78,10 @@
const my $REPRE_BM => Sanger::CGP::Vagrent::Bookmarkers::RepresentativeTranscriptBookmarker->new();
const my $WORST_BM => Sanger::CGP::Vagrent::Bookmarkers::MostDeleteriousBookmarker->new();

const my $SORT_CMD => 'cat %s | vcf-sort > %s';
const my $BGZIP_CMD => 'bgzip %s';
const my $TABIX_CMB => 'tabix -p vcf %s';


my $header_already_parsed = 0;

Expand All @@ -86,12 +92,22 @@
unless(defined $options->{'species'} && defined $options->{'assembly'}) {
croak 'unable to determine species and assembly from VCF file, please specify on command line' unless find_species_in_vcf($vcf_in,$options);
}
open my $OUT_FH, '>', $options->{'output'} or croak 'Failed to create: '.$options->{'output'};
my $output = $options->{'output'};
if($options->{'tabix'}){
(undef,$output) = tempfile('vagrentXXXXXXX', OPEN => 0, SUFFIX => '.vcf');
}

open my $OUT_FH, '>', $output or croak 'Failed to create: '.$output;
my $annotator = get_annotator($options);

process_data($vcf_in,$OUT_FH,$annotator,$options);
close $OUT_FH or croak 'Failed to close: '.$options->{'output'};
Vcf::validate($options->{'output'});
close $OUT_FH or croak 'Failed to close: '.$output;
Vcf::validate($output);

if($options->{'tabix'}){
compressAndIndex($options,$output);
}

1;
} or do {
warn "EVAL_ERROR: $EVAL_ERROR\n" if($EVAL_ERROR);
Expand All @@ -100,6 +116,44 @@
croak 'A problem occurred';
};

sub compressAndIndex {
my ($options, $tmpfile) = @_;

my $sort_cmd = sprintf $SORT_CMD, $tmpfile, $options->{'output'};
my $bgzip_cmd = sprintf $BGZIP_CMD, $options->{'output'};
my $totabix = $options->{'output'} .'.gz';
my $tabix_cmd = sprintf $TABIX_CMB, $totabix;

try {
my $tabix_in = $options->{'input'}.'.tbx';
unless(-e $tabix_in){
# If the input has a tabix index it must have already been sorted,
# we haven't changed the order of the file so we can skip this sort
system($sort_cmd);
}

} catch {
warn "EXECUTION ERROR: $sort_cmd\n";
die $_;
};

try {
system($bgzip_cmd);
} catch {
warn "EXECUTION ERROR: $bgzip_cmd\n";
die $_;
};

try {
system($tabix_cmd);
} catch {
warn "EXECUTION ERROR: $tabix_cmd\n";
die $_;
};

unlink $tmpfile;
}

sub process_data {
my ($in,$out,$anno,$opts) = @_;
print $out generate_header($in,$opts);
Expand Down Expand Up @@ -407,6 +461,7 @@ sub option_builder {
'i|input=s' => \$opts{'input'},
'o|output=s' => \$opts{'output'},
'c|cache=s' => \$opts{'cache'},
't|tabix' => \$opts{'tabix'},
'p|process=n' => \$opts{'process'},
'sp|species=s' => \$opts{'species'},
'as|assembly=s' => \$opts{'assembly'},
Expand Down Expand Up @@ -443,15 +498,15 @@ =head1 NAME
=head1 SYNOPSIS
AnnotateVcf.pl [-h] -i <IN_FILE> -o <OUT_FILE> -c <VAGRENT_CACHE_FILE>
AnnotateVcf.pl [-h] [-t] -i <IN_FILE> -o <OUT_FILE> -c <VAGRENT_CACHE_FILE> [-sp <SPECIES> -as <GENOME_VERSON>]
General Options:
--help (-h) Brief documentation
--input (-i) Input vcf file (expects *.bgz)
--output (-o) Output vcf
--output (-o) Output vcf file (plain text, add -t for zip and index)
--cache (-c) Vagrent reference data cache file
Expand All @@ -467,4 +522,6 @@ =head1 SYNOPSIS
--process (-p) ID_PROCESS that generated this file
--tabix (-t) bgzip and tabix index the output file (will generate the .gz version of the -o option)
=cut
Binary file modified docs.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion lib/Sanger/CGP/Vagrent.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use strict;
use Const::Fast qw(const);

use base 'Exporter';
our $VERSION = '2.0';
our $VERSION = '2.1.0';
our @EXPORT = qw($VERSION);

1;
68 changes: 36 additions & 32 deletions lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm
Original file line number Diff line number Diff line change
Expand Up @@ -462,18 +462,19 @@ sub _buildProteinAnnotation {
# something has gone wrong
return undef;
}

my $mtDna = $self->_getMutatedCdsSequence($wtDna,$cdsMinPos,$cdsMaxPos,$cAnnot->getMt());
my $mtProt = Bio::Seq->new(-seq => $prePad . $mtDna . $postPad)->translate->seq(); # mutant protein sequence
my $maxMtProt = Bio::Seq->new(-seq => $prePad . $mtDna . substr($tran->getcDNASeq,$tran->getCdsMaxPos()))->translate->seq(); # maximised protein sequence, overruns the natural stop and translates to the end of the transcript
if($wtProt eq $mtProt){
# wt and mt protein sequences are the same, its silent
$mutProtMin = ceil(($cAnnot->getMinPos / 3));
$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
$desc = 'p.'.$wt.$mutProtMin.$mt;
} else {
$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
$desc = 'p.'.$wt.$mutProtMin.$mt;
} else {
$desc = 'p.(=)';
}
$type = $self->_getDefaultProteinAnnotationType();
Expand All @@ -495,10 +496,10 @@ sub _buildProteinAnnotation {
if($mutProtMin == 1){
# its frame shifted the start codon, no idea what this is going to cause.
push(@classes,$self->getStartLostVariantClass);
return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
}
return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
}
$type = Sanger::CGP::Vagrent::Data::Annotation::getFrameShiftAnnotationType();
push(@classes,$self->getFrameShiftVariantClass);
push(@classes,$self->getFrameShiftVariantClass);
} else {
$wt = $wtProt;
$mt = $mtProt;
Expand All @@ -512,7 +513,7 @@ sub _buildProteinAnnotation {
substr($wt,-1,1,'');
substr($mt,-1,1,'');
}

#warn "|$wt| to |$mt|\n";
if($wt ne ''){
# wild type residue has been changed
Expand Down Expand Up @@ -617,8 +618,6 @@ sub _buildProteinAnnotation {
subtype => $subtype);
$anno->addClassification(@classes);
return $anno;

return undef;
}

sub _getMutatedCdsSequence: Abstract;
Expand All @@ -637,7 +636,6 @@ sub _buildCDSAnnotation {
return $self->_buildUnknownCDSAnnotation($var,$tran,$rAnnot,@classes);
}
my ($cdsMin,$cdsMinOffset,$cdsMax,$cdsMaxOffset) = (undef,undef,undef,undef);

if($rAnnot->getMinPos < $tran->getCdsMinPos){
$cdsMin = 1;
$cdsMinOffset = 0;
Expand Down Expand Up @@ -668,6 +666,8 @@ sub _buildCDSAnnotation {
$cdsMaxOffset = $rAnnot->getMaxOffset();
}

print "CDS: $cdsMin , $cdsMinOffset - $cdsMax, $cdsMaxOffset\n" if $self->_debug();

my $wt = $self->_getWildTypeStringForCDSAnno($var,$tran,$rAnnot);
my $mt = $self->_getMutantStringForCDSAnno($var,$tran,$rAnnot);
my $desc = $self->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt);
Expand Down Expand Up @@ -959,21 +959,6 @@ sub _coversStopCodon {
return 1;
}
}



# if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
# if($anno->getMinPos <= $tran->getCdsMaxPos && $anno->getMaxPos >= $tran->getCdsMaxPos - 2){
# return 1;
# }
# } elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
# if($anno->getMinPos <= $tran->getCdsLength && $anno->getMaxPos >= $tran->getCdsLength - 2){
# return 1;
# }
# } else {
# # don't know, assume no
# return 0;
# }
return 0;
}

Expand Down Expand Up @@ -1025,10 +1010,25 @@ sub _canAnnotateToCDS {
if($anno->hasClassification($self->getInsertionClass)){
# insertions are a special case.
# Coordinates are the last WT positions, and not the first variant ones like everything else
if($anno->getMaxPos <= $tran->getCdsMinPos || $anno->getMinPos >= $tran->getCdsMaxPos){
# its outside the CDS
return 0;
}

print 'ANNO POS: '.$anno->getMinPos.' , '.$anno->getMinOffset.' - '.$anno->getMaxPos.' , '.$anno->getMaxOffset."\n" if $self->_debug();
print 'CDS POS: '.$tran->getCdsMinPos.' , '.$tran->getCdsMaxPos."\n" if $self->_debug();

if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
# ends before CDS or starts afterwards
return 0;
} elsif($anno->getMaxPos == $tran->getCdsMinPos) {
# potential start codon issues
if($anno->getMinPos == $anno->getMaxPos && $anno->getMinPos == $tran->getCdsMinPos && abs($anno->getMinOffset) + abs($anno->getMaxOffset) > 0){
# probably start coordinate issues
unless($anno->getMaxOffset <= 0 && $self->_isIntronicOffsetDistance($anno->getMaxOffset) == 0){
# or not
return 0;
}
} else {
return 0;
}
}
} else {
if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
# its outside the CDS
Expand All @@ -1050,6 +1050,9 @@ sub _canAnnotateToCDS {
return 0;
} elsif($anno->hasClassification($self->getUnknownVariantClass)){
return 0;
} elsif($anno->hasClassification($self->getInsertionClass) && $anno->hasClassification($self->get5PrimeUtrVariantClass)){
# odd case, insertions close to the start codons can be described on the CDS even though they don't change it.
return 1;
} else {
my $msg = "Unable to calculate CDS relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications);
$self->addMessage($msg);
Expand All @@ -1066,6 +1069,7 @@ sub _canAnnotateToCDS {

sub _canAnnotateToProtein {
my ($self,$tran,$anno) = @_;

unless($tran->isProteinCoding){
# if the transcript isn't protein coding it can't be a coding change
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,14 @@ sub _buildRNAAnnotation {
}

if($tran->isProteinCoding){
#print "HERE\n";
if(($pos > $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset >= 0)) &&
($pos < $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset <= 0))){
# if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){
# coding change
push(@groupClasses,$self->getCDSClass);
} elsif($pos < $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset < 0)){
# } elsif($pos < $tran->getCdsMinPos){
# 5prime UTR
push(@groupClasses,$self->get5PrimeUtrClass);
} elsif($pos > $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset > 0)){
# } elsif($pos > $tran->getCdsMaxPos){
# 3prime UTR
push(@groupClasses,$self->get3PrimeUtrClass);
} else {
Expand Down
17 changes: 9 additions & 8 deletions lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,15 @@ const my $SO_NON_PROTEIN_CODING_CLASS => 'SO:0000011:non_protein_coding';

const my $TERM_SUMMARY_INI => 'SequenceOntologySummary.ini';

#sub DESTROY {
# my $self = shift;
# if(defined $self->{'_SOsum'}){
# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 1;
# }
# }
#}
# sub DESTROY {
# ##### Handy DESTROY function that will print ontology combinations that don't exist in the summary lookup at program termination.
# my $self = shift;
# if(defined $self->{'_SOsum'}){
# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 0;
# }
# }
# }

sub _ontologyInit {
my $self = shift;
Expand Down
Loading

0 comments on commit 9ebab4b

Please sign in to comment.