Skip to content

Commit

Permalink
adding better handling for variants adjacent to or in the start and s…
Browse files Browse the repository at this point in the history
…top codons
  • Loading branch information
AndyMenzies committed Apr 10, 2015
1 parent 80c0a50 commit 38008d5
Show file tree
Hide file tree
Showing 6 changed files with 1,597 additions and 48 deletions.
68 changes: 36 additions & 32 deletions lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm
Original file line number Diff line number Diff line change
Expand Up @@ -462,18 +462,19 @@ sub _buildProteinAnnotation {
# something has gone wrong
return undef;
}

my $mtDna = $self->_getMutatedCdsSequence($wtDna,$cdsMinPos,$cdsMaxPos,$cAnnot->getMt());
my $mtProt = Bio::Seq->new(-seq => $prePad . $mtDna . $postPad)->translate->seq(); # mutant protein sequence
my $maxMtProt = Bio::Seq->new(-seq => $prePad . $mtDna . substr($tran->getcDNASeq,$tran->getCdsMaxPos()))->translate->seq(); # maximised protein sequence, overruns the natural stop and translates to the end of the transcript
if($wtProt eq $mtProt){
# wt and mt protein sequences are the same, its silent
$mutProtMin = ceil(($cAnnot->getMinPos / 3));
$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
$desc = 'p.'.$wt.$mutProtMin.$mt;
} else {
$mutProtMax = ceil(($cAnnot->getMaxPos / 3));
$wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
$mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1));
if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){
$desc = 'p.'.$wt.$mutProtMin.$mt;
} else {
$desc = 'p.(=)';
}
$type = $self->_getDefaultProteinAnnotationType();
Expand All @@ -495,10 +496,10 @@ sub _buildProteinAnnotation {
if($mutProtMin == 1){
# its frame shifted the start codon, no idea what this is going to cause.
push(@classes,$self->getStartLostVariantClass);
return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
}
return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes);
}
$type = Sanger::CGP::Vagrent::Data::Annotation::getFrameShiftAnnotationType();
push(@classes,$self->getFrameShiftVariantClass);
push(@classes,$self->getFrameShiftVariantClass);
} else {
$wt = $wtProt;
$mt = $mtProt;
Expand All @@ -512,7 +513,7 @@ sub _buildProteinAnnotation {
substr($wt,-1,1,'');
substr($mt,-1,1,'');
}

#warn "|$wt| to |$mt|\n";
if($wt ne ''){
# wild type residue has been changed
Expand Down Expand Up @@ -617,8 +618,6 @@ sub _buildProteinAnnotation {
subtype => $subtype);
$anno->addClassification(@classes);
return $anno;

return undef;
}

sub _getMutatedCdsSequence: Abstract;
Expand All @@ -637,7 +636,6 @@ sub _buildCDSAnnotation {
return $self->_buildUnknownCDSAnnotation($var,$tran,$rAnnot,@classes);
}
my ($cdsMin,$cdsMinOffset,$cdsMax,$cdsMaxOffset) = (undef,undef,undef,undef);

if($rAnnot->getMinPos < $tran->getCdsMinPos){
$cdsMin = 1;
$cdsMinOffset = 0;
Expand Down Expand Up @@ -668,6 +666,8 @@ sub _buildCDSAnnotation {
$cdsMaxOffset = $rAnnot->getMaxOffset();
}

print "CDS: $cdsMin , $cdsMinOffset - $cdsMax, $cdsMaxOffset\n" if $self->_debug();

my $wt = $self->_getWildTypeStringForCDSAnno($var,$tran,$rAnnot);
my $mt = $self->_getMutantStringForCDSAnno($var,$tran,$rAnnot);
my $desc = $self->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt);
Expand Down Expand Up @@ -959,21 +959,6 @@ sub _coversStopCodon {
return 1;
}
}



# if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){
# if($anno->getMinPos <= $tran->getCdsMaxPos && $anno->getMaxPos >= $tran->getCdsMaxPos - 2){
# return 1;
# }
# } elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){
# if($anno->getMinPos <= $tran->getCdsLength && $anno->getMaxPos >= $tran->getCdsLength - 2){
# return 1;
# }
# } else {
# # don't know, assume no
# return 0;
# }
return 0;
}

Expand Down Expand Up @@ -1025,10 +1010,25 @@ sub _canAnnotateToCDS {
if($anno->hasClassification($self->getInsertionClass)){
# insertions are a special case.
# Coordinates are the last WT positions, and not the first variant ones like everything else
if($anno->getMaxPos <= $tran->getCdsMinPos || $anno->getMinPos >= $tran->getCdsMaxPos){
# its outside the CDS
return 0;
}

print 'ANNO POS: '.$anno->getMinPos.' , '.$anno->getMinOffset.' - '.$anno->getMaxPos.' , '.$anno->getMaxOffset."\n" if $self->_debug();
print 'CDS POS: '.$tran->getCdsMinPos.' , '.$tran->getCdsMaxPos."\n" if $self->_debug();

if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
# ends before CDS or starts afterwards
return 0;
} elsif($anno->getMaxPos == $tran->getCdsMinPos) {
# potential start codon issues
if($anno->getMinPos == $anno->getMaxPos && $anno->getMinPos == $tran->getCdsMinPos && abs($anno->getMinOffset) + abs($anno->getMaxOffset) > 0){
# probably start coordinate issues
unless($anno->getMaxOffset <= 0 && $self->_isIntronicOffsetDistance($anno->getMaxOffset) == 0){
# or not
return 0;
}
} else {
return 0;
}
}
} else {
if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){
# its outside the CDS
Expand All @@ -1050,6 +1050,9 @@ sub _canAnnotateToCDS {
return 0;
} elsif($anno->hasClassification($self->getUnknownVariantClass)){
return 0;
} elsif($anno->hasClassification($self->getInsertionClass) && $anno->hasClassification($self->get5PrimeUtrVariantClass)){
# odd case, insertions close to the start codons can be described on the CDS even though they don't change it.
return 1;
} else {
my $msg = "Unable to calculate CDS relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications);
$self->addMessage($msg);
Expand All @@ -1066,6 +1069,7 @@ sub _canAnnotateToCDS {

sub _canAnnotateToProtein {
my ($self,$tran,$anno) = @_;

unless($tran->isProteinCoding){
# if the transcript isn't protein coding it can't be a coding change
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,14 @@ sub _buildRNAAnnotation {
}

if($tran->isProteinCoding){
#print "HERE\n";
if(($pos > $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset >= 0)) &&
($pos < $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset <= 0))){
# if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){
# coding change
push(@groupClasses,$self->getCDSClass);
} elsif($pos < $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset < 0)){
# } elsif($pos < $tran->getCdsMinPos){
# 5prime UTR
push(@groupClasses,$self->get5PrimeUtrClass);
} elsif($pos > $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset > 0)){
# } elsif($pos > $tran->getCdsMaxPos){
# 3prime UTR
push(@groupClasses,$self->get3PrimeUtrClass);
} else {
Expand Down
17 changes: 9 additions & 8 deletions lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,15 @@ const my $SO_NON_PROTEIN_CODING_CLASS => 'SO:0000011:non_protein_coding';

const my $TERM_SUMMARY_INI => 'SequenceOntologySummary.ini';

#sub DESTROY {
# my $self = shift;
# if(defined $self->{'_SOsum'}){
# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 1;
# }
# }
#}
# sub DESTROY {
# ##### Handy DESTROY function that will print ontology combinations that don't exist in the summary lookup at program termination.
# my $self = shift;
# if(defined $self->{'_SOsum'}){
# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){
# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 0;
# }
# }
# }

sub _ontologyInit {
my $self = shift;
Expand Down
Loading

0 comments on commit 38008d5

Please sign in to comment.