diff --git a/bin/AnnotateVcf.pl b/bin/AnnotateVcf.pl index 36c000f..7bd2083 100755 --- a/bin/AnnotateVcf.pl +++ b/bin/AnnotateVcf.pl @@ -33,6 +33,8 @@ use Data::Dumper; use List::Util qw(first); +use File::Temp qw(tempfile); +use Try::Tiny qw(try catch); use FindBin qw($Bin); use lib "$Bin/../lib"; @@ -76,6 +78,10 @@ const my $REPRE_BM => Sanger::CGP::Vagrent::Bookmarkers::RepresentativeTranscriptBookmarker->new(); const my $WORST_BM => Sanger::CGP::Vagrent::Bookmarkers::MostDeleteriousBookmarker->new(); +const my $SORT_CMD => 'cat %s | vcf-sort > %s'; +const my $BGZIP_CMD => 'bgzip %s'; +const my $TABIX_CMD => 'tabix -p vcf %s'; + my $header_already_parsed = 0; @@ -86,12 +92,22 @@ unless(defined $options->{'species'} && defined $options->{'assembly'}) { croak 'unable to determine species and assembly from VCF file, please specify on command line' unless find_species_in_vcf($vcf_in,$options); } - open my $OUT_FH, '>', $options->{'output'} or croak 'Failed to create: '.$options->{'output'}; + my $output = $options->{'output'}; + if($options->{'tabix'}){ + (undef,$output) = tempfile('vagrentXXXXXXX', OPEN => 0, SUFFIX => '.vcf'); + } + + open my $OUT_FH, '>', $output or croak 'Failed to create: '.$output; my $annotator = get_annotator($options); process_data($vcf_in,$OUT_FH,$annotator,$options); - close $OUT_FH or croak 'Failed to close: '.$options->{'output'}; - Vcf::validate($options->{'output'}); + close $OUT_FH or croak 'Failed to close: '.$output; + Vcf::validate($output); + + if($options->{'tabix'}){ + compressAndIndex($options,$output); + } + 1; } or do { warn "EVAL_ERROR: $EVAL_ERROR\n" if($EVAL_ERROR); @@ -100,6 +116,44 @@ croak 'A problem occurred'; };
+# Sort (when needed), bgzip and tabix-index the annotated VCF.
+#   $options - option hash; its 'input' and 'output' paths are read
+#   $tmpfile - plain VCF written by the main block, removed before returning
+# Leaves <output>.gz plus its index behind; dies if any step fails.
+sub compressAndIndex {
+  my ($options, $tmpfile) = @_;
+  my $out = $options->{'output'};
+
+  # system() signals failure through its return value and never dies, so
+  # every status is checked here rather than relying on try/catch alone.
+  my $run = sub {
+    my ($cmd) = @_;
+    system($cmd) == 0 or die "EXECUTION ERROR: $cmd (wait status $?)\n";
+  };
+
+  my $error;
+  try {
+    # tabix writes its index as <file>.tbi; '.tbx' is still honoured
+    # because the first version of this check looked for that suffix.
+    if(first { -e "$options->{'input'}$_" } qw(.tbi .tbx)){
+      # If the input has a tabix index it must have already been sorted,
+      # we haven't changed the order of the file so we can skip the sort,
+      # but bgzip still expects the data at the output path: move it there.
+      require File::Copy;
+      File::Copy::move($tmpfile, $out) or die "Failed to move $tmpfile to $out: $!\n";
+    } else {
+      $run->(sprintf $SORT_CMD, $tmpfile, $out);
+    }
+    $run->(sprintf $BGZIP_CMD, $out);
+    $run->(sprintf $TABIX_CMD, $out.'.gz');
+  } catch {
+    $error = $_;
+  };
+
+  unlink $tmpfile; # already gone when it was moved; unlink then just returns 0
+  die $error if defined $error;
+}
+
 sub process_data { my ($in,$out,$anno,$opts) = @_; print $out generate_header($in,$opts); @@ -407,6 +461,7 @@ sub option_builder { 'i|input=s' => \$opts{'input'}, 'o|output=s' => \$opts{'output'}, 'c|cache=s' => \$opts{'cache'}, + 't|tabix' => \$opts{'tabix'}, 'p|process=n' => \$opts{'process'}, 'sp|species=s' => \$opts{'species'}, 'as|assembly=s' => \$opts{'assembly'}, @@ -443,7 +498,7 @@ =head1 NAME =head1 SYNOPSIS -AnnotateVcf.pl [-h] -i -o -c +AnnotateVcf.pl [-h] [-t] -i -o -c [-sp -as ] General Options: @@ -451,7 +506,7 @@ =head1 SYNOPSIS --input (-i) Input vcf file (expects *.bgz) - --output (-o) Output vcf + --output (-o) Output vcf file (plain text, add -t for zip and index) --cache (-c) Vagrent reference data cache file @@ -467,4 +522,6 @@ =head1 SYNOPSIS --process (-p) ID_PROCESS that generated this file + --tabix (-t) bgzip and tabix index the output file (will generate the .gz version of the -o option) + =cut diff --git a/docs.tar.gz b/docs.tar.gz index 14787bb..1ce30a4 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/Sanger/CGP/Vagrent.pm b/lib/Sanger/CGP/Vagrent.pm index f378213..752f7ec 100644 --- a/lib/Sanger/CGP/Vagrent.pm +++ b/lib/Sanger/CGP/Vagrent.pm @@ -26,7 +26,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '2.0'; +our $VERSION = '2.1.0'; our @EXPORT = qw($VERSION); 1; diff --git a/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm
b/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm index 48cde27..b039a6a 100644 --- a/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm +++ b/lib/Sanger/CGP/Vagrent/Annotators/AbstractAnnotator.pm @@ -462,18 +462,19 @@ sub _buildProteinAnnotation { # something has gone wrong return undef; } + my $mtDna = $self->_getMutatedCdsSequence($wtDna,$cdsMinPos,$cdsMaxPos,$cAnnot->getMt()); my $mtProt = Bio::Seq->new(-seq => $prePad . $mtDna . $postPad)->translate->seq(); # mutant protein sequence my $maxMtProt = Bio::Seq->new(-seq => $prePad . $mtDna . substr($tran->getcDNASeq,$tran->getCdsMaxPos()))->translate->seq(); # maximised protein sequence, overruns the natural stop and translates to the end of the transcript if($wtProt eq $mtProt){ # wt and mt protein sequences are the same, its silent $mutProtMin = ceil(($cAnnot->getMinPos / 3)); - $mutProtMax = ceil(($cAnnot->getMaxPos / 3)); - $wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1)); - $mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1)); - if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){ - $desc = 'p.'.$wt.$mutProtMin.$mt; - } else { + $mutProtMax = ceil(($cAnnot->getMaxPos / 3)); + $wt = substr($wtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1)); + $mt = substr($mtProt,($mutProtMin - 1),(($mutProtMax - $mutProtMin) + 1)); + if(length($wt) == 1 && length($mt) == 1 && $mutProtMin == $mutProtMax){ + $desc = 'p.'.$wt.$mutProtMin.$mt; + } else { $desc = 'p.(=)'; } $type = $self->_getDefaultProteinAnnotationType(); @@ -495,10 +496,10 @@ sub _buildProteinAnnotation { if($mutProtMin == 1){ # its frame shifted the start codon, no idea what this is going to cause. 
push(@classes,$self->getStartLostVariantClass); - return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes); - } + return $self->_buildUnknownProteinAnnotation($var,$tran,$cAnnot,length($wtProt),@classes); + } $type = Sanger::CGP::Vagrent::Data::Annotation::getFrameShiftAnnotationType(); - push(@classes,$self->getFrameShiftVariantClass); + push(@classes,$self->getFrameShiftVariantClass); } else { $wt = $wtProt; $mt = $mtProt; @@ -512,7 +513,7 @@ sub _buildProteinAnnotation { substr($wt,-1,1,''); substr($mt,-1,1,''); } - + #warn "|$wt| to |$mt|\n"; if($wt ne ''){ # wild type residue has been changed @@ -617,8 +618,6 @@ sub _buildProteinAnnotation { subtype => $subtype); $anno->addClassification(@classes); return $anno; - - return undef; } sub _getMutatedCdsSequence: Abstract; @@ -637,7 +636,6 @@ sub _buildCDSAnnotation { return $self->_buildUnknownCDSAnnotation($var,$tran,$rAnnot,@classes); } my ($cdsMin,$cdsMinOffset,$cdsMax,$cdsMaxOffset) = (undef,undef,undef,undef); - if($rAnnot->getMinPos < $tran->getCdsMinPos){ $cdsMin = 1; $cdsMinOffset = 0; @@ -668,6 +666,8 @@ sub _buildCDSAnnotation { $cdsMaxOffset = $rAnnot->getMaxOffset(); } + print "CDS: $cdsMin , $cdsMinOffset - $cdsMax, $cdsMaxOffset\n" if $self->_debug(); + my $wt = $self->_getWildTypeStringForCDSAnno($var,$tran,$rAnnot); my $mt = $self->_getMutantStringForCDSAnno($var,$tran,$rAnnot); my $desc = $self->_getCDSDescriptionString($tran,$cdsMin,$cdsMax,$cdsMinOffset,$cdsMaxOffset,$wt,$mt); @@ -959,21 +959,6 @@ sub _coversStopCodon { return 1; } } - - - -# if($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()){ -# if($anno->getMinPos <= $tran->getCdsMaxPos && $anno->getMaxPos >= $tran->getCdsMaxPos - 2){ -# return 1; -# } -# } elsif($anno->getContext eq Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()){ -# if($anno->getMinPos <= $tran->getCdsLength && $anno->getMaxPos >= $tran->getCdsLength - 2){ -# return 1; -# } -# } 
else { -# # don't know, assume no -# return 0; -# } return 0; } @@ -1025,10 +1010,25 @@ sub _canAnnotateToCDS { if($anno->hasClassification($self->getInsertionClass)){ # insertions are a special case. # Coordinates are the last WT positions, and not the first variant ones like everything else - if($anno->getMaxPos <= $tran->getCdsMinPos || $anno->getMinPos >= $tran->getCdsMaxPos){ - # its outside the CDS - return 0; - } + + print 'ANNO POS: '.$anno->getMinPos.' , '.$anno->getMinOffset.' - '.$anno->getMaxPos.' , '.$anno->getMaxOffset."\n" if $self->_debug(); + print 'CDS POS: '.$tran->getCdsMinPos.' , '.$tran->getCdsMaxPos."\n" if $self->_debug(); + + if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){ + # ends before CDS or starts afterwards + return 0; + } elsif($anno->getMaxPos == $tran->getCdsMinPos) { + # potential start codon issues + if($anno->getMinPos == $anno->getMaxPos && $anno->getMinPos == $tran->getCdsMinPos && abs($anno->getMinOffset) + abs($anno->getMaxOffset) > 0){ + # probably start coordinate issues + unless($anno->getMaxOffset <= 0 && $self->_isIntronicOffsetDistance($anno->getMaxOffset) == 0){ + # or not + return 0; + } + } else { + return 0; + } + } } else { if($anno->getMaxPos < $tran->getCdsMinPos || $anno->getMinPos > $tran->getCdsMaxPos){ # its outside the CDS @@ -1050,6 +1050,9 @@ sub _canAnnotateToCDS { return 0; } elsif($anno->hasClassification($self->getUnknownVariantClass)){ return 0; + } elsif($anno->hasClassification($self->getInsertionClass) && $anno->hasClassification($self->get5PrimeUtrVariantClass)){ + # odd case, insertions close to the start codons can be described on the CDS even though they don't change it. 
+ return 1; } else { my $msg = "Unable to calculate CDS relevance - UNKNOWN CLASSIFICATION: ".join(' ',$anno->getClassifications); $self->addMessage($msg); @@ -1066,6 +1069,7 @@ sub _canAnnotateToCDS { sub _canAnnotateToProtein { my ($self,$tran,$anno) = @_; + unless($tran->isProteinCoding){ # if the transcript isn't protein coding it can't be a coding change return 0; diff --git a/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm b/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm index b272240..4c5552d 100644 --- a/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm +++ b/lib/Sanger/CGP/Vagrent/Annotators/SimpleSubstitutionAnnotator.pm @@ -190,18 +190,14 @@ sub _buildRNAAnnotation { } if($tran->isProteinCoding){ - #print "HERE\n"; if(($pos > $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset >= 0)) && ($pos < $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset <= 0))){ -# if($pos >= $tran->getCdsMinPos && $pos <= $tran->getCdsMaxPos){ # coding change push(@groupClasses,$self->getCDSClass); } elsif($pos < $tran->getCdsMinPos || ($pos == $tran->getCdsMinPos && $offset < 0)){ -# } elsif($pos < $tran->getCdsMinPos){ # 5prime UTR push(@groupClasses,$self->get5PrimeUtrClass); } elsif($pos > $tran->getCdsMaxPos || ($pos == $tran->getCdsMaxPos && $offset > 0)){ -# } elsif($pos > $tran->getCdsMaxPos){ # 3prime UTR push(@groupClasses,$self->get3PrimeUtrClass); } else { diff --git a/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm b/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm index b14e928..9562655 100644 --- a/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm +++ b/lib/Sanger/CGP/Vagrent/Ontology/SequenceOntologyClassifier.pm @@ -100,14 +100,15 @@ const my $SO_NON_PROTEIN_CODING_CLASS => 'SO:0000011:non_protein_coding'; const my $TERM_SUMMARY_INI => 'SequenceOntologySummary.ini'; -#sub DESTROY { -# my $self = shift; -# if(defined $self->{'_SOsum'}){ -# foreach my 
$k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){ -# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 1; -# } -# } -#} +# sub DESTROY { +# ##### Handy DESTROY function that will print ontology combinations that don't exist in the summary lookup at program termination. +# my $self = shift; +# if(defined $self->{'_SOsum'}){ +# foreach my $k( sort {$self->{'_notSummary'}->{$b} <=> $self->{'_notSummary'}->{$a}} keys %{$self->{'_notSummary'}}){ +# print $self->{'_notSummary'}->{$k},' - ',$k,"\n" unless $self->{'_notSummary'}->{$k} == 0; +# } +# } +# } sub _ontologyInit { my $self = shift; diff --git a/share/SequenceOntologySummary.ini b/share/SequenceOntologySummary.ini index 3a3d058..15a936b 100644 --- a/share/SequenceOntologySummary.ini +++ b/share/SequenceOntologySummary.ini @@ -62,8 +62,10 @@ SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:1000002:s SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:1000002:substitution,SO:0001988:5_prime_UTR_premature_start_codon_gain_variant,SO:0001576:transcript_variant=5prime_UTR_variant SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0000159:deletion,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant +SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0000159:deletion,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0000667:insertion,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant 
SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:1000032:indel,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant +SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_variant SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:1000002:substitution,SO:0001624:3_prime_UTR_variant,SO:0001576:transcript_variant=3prime_UTR_variant SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0000159:deletion,SO:0001624:3_prime_UTR_variant,SO:0001576:transcript_variant=3prime_UTR_variant @@ -79,9 +81,12 @@ SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001619:nc_ SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001634:500B_downstream_variant,SO:0001619:nc_transcript_variant=nc_variant SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001619:nc_transcript_variant=nc_variant SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001619:nc_transcript_variant=nc_variant +SO:0000011:non_protein_coding,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0000159:deletion,SO:0001577:complex_change_in_transcript,SO:0001619:nc_transcript_variant=nc_variant SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000667:insertion,SO:0001619:nc_transcript_variant=nc_variant SO:0000011:non_protein_coding,SO:0000147:exon,SO:1000032:indel,SO:0001619:nc_transcript_variant=nc_variant SO:0000011:non_protein_coding,SO:0000147:exon,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001619:nc_transcript_variant=nc_variant 
+SO:0000011:non_protein_coding,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001619:nc_transcript_variant=nc_variant +SO:0000011:non_protein_coding,SO:0000147:exon,SO:1000032:indel,SO:0001634:500B_downstream_variant,SO:0001619:nc_transcript_variant=nc_variant SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:1000002:substitution,SO:0001581:codon_variant,SO:0001583:non_synonymous_codon=missense SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:1000032:indel,SO:0001650:inframe_variant,SO:1000002:substitution,SO:0001583:non_synonymous_codon=missense @@ -120,16 +125,21 @@ SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0000159:deletion,SO: SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0000159:deletion,SO:0001589:frameshift_variant,SO:0001576:transcript_variant=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0000159:deletion,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift +SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift 
SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0000667:insertion,SO:0001589:frameshift_variant=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:1000032:indel,SO:0001589:frameshift_variant=frameshift SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift +SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift +SO:0000010:protein_coding,SO:0000316:CDS,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001578:stop_lost=frameshift 
SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted +SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted 
+SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant,SO:0001582:initiator_codon_change=cds_disrupted SO:0000011:non_protein_coding,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001577:complex_change_in_transcript,SO:0001619:nc_transcript_variant=nc_transcript_disrupted @@ -138,6 +148,8 @@ SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:th SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript=cds_deleted SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript=cds_deleted SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript=cds_deleted 
+SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0000159:deletion=cds_deleted +SO:0000010:protein_coding,SO:0000316:CDS,SO:0000204:five_prime_UTR,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001577:complex_change_in_transcript,SO:0000159:deletion=cds_deleted SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001619:nc_transcript_variant=nc_transcript_deleted SO:0000011:non_protein_coding,SO:0000147:exon,SO:0000159:deletion,SO:0001635:5KB_upstream_variant,SO:0001636:2KB_upstream_variant,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001619:nc_transcript_variant=nc_transcript_deleted @@ -177,6 +189,7 @@ SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0001993:extended_cis SO:0000010:protein_coding,SO:0000316:CDS,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:1000032:indel,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant=ess_splice SO:0000010:protein_coding,SO:0000316:CDS,SO:0001993:extended_cis_splice_site,SO:1000032:indel,SO:0001629:splice_site_variant,SO:0001576:transcript_variant=ess_splice SO:0000010:protein_coding,SO:0000316:CDS,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:1000032:indel,SO:0001995:extended_intronic_splice_region_variant,SO:0001629:splice_site_variant,SO:0001576:transcript_variant=ess_splice 
+SO:0000010:protein_coding,SO:0000316:CDS,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:1000032:indel,SO:0001995:extended_intronic_splice_region_variant,SO:0001629:splice_site_variant,SO:0001627:intron_variant,SO:0001576:transcript_variant=ess_splice SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0001993:extended_cis_splice_site,SO:1000002:substitution,SO:0001629:splice_site_variant,SO:0001623:5_prime_UTR_variant,SO:0001576:transcript_variant=5prime_UTR_ess_splice SO:0000010:protein_coding,SO:0000204:five_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001636:2KB_upstream_variant,SO:0001623:5_prime_UTR_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant=5prime_UTR_ess_splice @@ -195,6 +208,7 @@ SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0001993:extended_cis_spl SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001624:3_prime_UTR_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant=3prime_UTR_ess_splice SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001633:5KB_downstream_variant,SO:0001634:500B_downstream_variant,SO:0001624:3_prime_UTR_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant=3prime_UTR_ess_splice SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0001993:extended_cis_splice_site,SO:0000159:deletion,SO:0001624:3_prime_UTR_variant,SO:0001629:splice_site_variant,SO:0001576:transcript_variant=3prime_UTR_ess_splice 
+SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0001993:extended_cis_splice_site,SO:0001996:extended_intronic_splice_region,SO:0000188:intron,SO:0000159:deletion,SO:0001624:3_prime_UTR_variant,SO:0001995:extended_intronic_splice_region_variant,SO:0001629:splice_site_variant,SO:0001627:intron_variant,SO:0001576:transcript_variant=3prime_UTR_ess_splice SO:0000010:protein_coding,SO:0000205:three_prime_UTR,SO:0000147:exon,SO:0001993:extended_cis_splice_site,SO:1000032:indel,SO:0001624:3_prime_UTR_variant,SO:0001577:complex_change_in_transcript,SO:0001576:transcript_variant=3prime_UTR_ess_splice SO:0000011:non_protein_coding,SO:0001993:extended_cis_splice_site,SO:1000002:substitution,SO:0001629:splice_site_variant=nc_ess_splice diff --git a/t/deletion.t b/t/deletion.t index a0b3072..aca81c0 100644 --- a/t/deletion.t +++ b/t/deletion.t @@ -45,6 +45,8 @@ testSplice(); testExonic(); testComplexCases(); testUpStreamDownStream(); +testCdsBoundry(); + done_testing(); sub testUpStreamDownStream { @@ -331,6 +333,7 @@ sub testSplice { } sub testExonic { + #5 PRIME UTR EXON 1bp DEL test5PrimeUTR_1bp_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); test5PrimeUTR_1bp_TOR1AIP2(AnnotationTestUtils::TOR1AIP2_TRANSCRIPT); @@ -415,7 +418,6 @@ sub testExonic { testCDSExon_StartCodon_2bp_3_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); testCDSExon_StartCodon_4bp_1_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); - #NON-CODING TRANSCRIPT testExon_1bp_1_AC068831(AnnotationTestUtils::AC068831_TRANSCRIPT); testExon_1bp_2_AC068831(AnnotationTestUtils::AC068831_TRANSCRIPT); @@ -425,6 +427,497 @@ sub testExonic { } +sub testCdsBoundry{ + + testStartUpstream_OR4F5(); + testStartEndsUpsteam1bp_OR4F5(); + testStartEndsUpsteam0bp_OR4F5(); + testEndStarts0bp_OR4F5(); + testEndStarts1bp_OR4F5(); + testEndDownstream_OR4F5(); + + testStartUpstream_GABPB2(); + testStartIntronic_GABPB2(); + testStartSpliceRegion_GABPB2(); + testStartEssSplice_GABPB2(); + +} + +# OR4F5 protein coding gene - single 
exon, no UTRs, has both start and stop codons, + strand (probably wrong but great for testing) + +sub testStartUpstream_OR4F5 { + subtest 'Testing OR4F5 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 68091, + 'maxpos' => 68091, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 
0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartEndsUpsteam1bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Ends Upstream 1 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69089, # one base further upstream than the 0bp test, which uses 69090 + 'maxpos' => 69089, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + 
Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get2KBUpStreamVariantClass); + + done_testing(); + }; +} +sub testStartEndsUpsteam0bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Ends Upstream 0 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69090, + 'maxpos' => 69090, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + 
$res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get2KBUpStreamVariantClass); + + done_testing(); + }; +} +sub testEndStarts0bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream 0 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70009, + 'maxpos' => 70009, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + 
$t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get500BPDownStreamVariantClass); + + done_testing(); + }; + + + +} +sub testEndStarts1bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream 1 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70010, + 'maxpos' => 70010, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + 
ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get500BPDownStreamVariantClass); + + done_testing(); + }; + + + +} +sub testEndDownstream_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 71010, + 'maxpos' => 71010, + 'delseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + 
ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get5KBDownStreamVariantClass); + + done_testing(); + }; + + + +} + +# GABPB2 protein coding gene with both utrs on + strand of genome, start codon is at the start of an exon + +sub testStartUpstream_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151042080, + 'maxpos' => 151042080, + 'delseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation 
group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartIntronic_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Intronic + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060460, + 'maxpos' => 151060460, + 'delseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + 
is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getIntronClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->getIntronVariantClass); + done_testing(); + }; + +} +sub testStartSpliceRegion_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Splice Region + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060660, + 'maxpos' => 151060660, + 'delseq' => 
'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getSpliceRegionClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-6,332,-6,'C','-','r.332-6delc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + 
Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-6,1,-6,'C','-','c.1-6delc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} +sub testStartEssSplice_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Ess Splice + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Deletion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060665, + 'maxpos' => 151060665, + 'delseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::DeletionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + 
ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->getEssentialSpliceSiteClass,$a->get5PrimeUtrClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-1,332,-1,'C','-','r.332-1delc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get5PrimeUtrVariantClass,$a->getEssentialSpliceSiteVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getDeletionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-1,1,-1,'C','-','c.1-1delc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getDeletionClass,$a->get5PrimeUtrVariantClass,$a->getEssentialSpliceSiteVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + 
Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} + + # CEP350 protein coding gene with 5 prime utr exons on + strand of genome sub testUpsteamMilesAway_1bp_CEP350 { diff --git a/t/insertion.t b/t/insertion.t index 2b70daf..307bbd8 100644 --- a/t/insertion.t +++ b/t/insertion.t @@ -45,8 +45,27 @@ testSplice(); testExonic(); testUpStreamDownStream(); testStrangeCases(); +testCdsBoundry(); done_testing(); + +sub testCdsBoundry { + + testStartUpstream_OR4F5(); + testStartEndsUpsteam1bp_OR4F5(); + testStartEndsUpsteam0bp_OR4F5(); + testEndStarts0bp_OR4F5(); + testEndStarts1bp_OR4F5(); + testEndDownstream_OR4F5(); + + testStartUpstream_GABPB2(); + testStartIntronic_GABPB2(); + testStartSpliceRegion_GABPB2(); + testStartEssSplice_GABPB2(); + testStartEssSplice2_GABPB2(); + +} + sub testUpStreamDownStream { testUpsteamMilesAway_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); testEndsUpsteam5001bp_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); @@ -85,9 +104,7 @@ sub testUpStreamDownStream { testStartsDownstreamMilesAway_TOR1AIP2(AnnotationTestUtils::TOR1AIP2_TRANSCRIPT); } - sub testStrangeCases { -#CENTRE OF INTRONS ' => sub { testIntronic_DeadCenterOfEvenSizedIntron_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); testIntronic_StartingDeadCenterOfOddSizedIntron_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); testIntronic_EndingDeadCenterOfOddSizedIntron_CEP350(AnnotationTestUtils::CEP350_TRANSCRIPT); @@ -319,6 +336,541 @@ sub testExonic { } +# OR4F5 protein coding gene - single exon, no UTRs, has both start and stop codons, + strand (probably wrong but great for testing) + +sub testStartUpstream_OR4F5 { + subtest 'Testing 
OR4F5 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 68091, + 'maxpos' => 68092, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + 
$a->getInsertionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartEndsUpsteam1bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Ends Upstream 1 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69089, + 'maxpos' => 69090, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + 
Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get2KBUpStreamVariantClass); + + done_testing(); + }; +} +sub testStartEndsUpsteam0bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Ends Upstream 0 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69090, + 'maxpos' => 69091, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + 
Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get2KBUpStreamVariantClass); + + done_testing(); + }; +} +sub testEndStarts0bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream 0 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70008, + 'maxpos' => 70009, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA 
annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get500BPDownStreamVariantClass); + + done_testing(); + }; + + + +} +sub testEndStarts1bp_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream 1 + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70009, + 'maxpos' => 70010, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + 
$t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get500BPDownStreamVariantClass); + + done_testing(); + }; + + + +} +sub testEndDownstream_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Starts Downstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 71009, + 'maxpos' => 71010, + 'insseq' => 'C'); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + 
ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get5KBDownStreamVariantClass); + + done_testing(); + }; + + + +} + + +# GABPB2 protein coding gene with both utrs on + strand of genome, start codon is at the start of an exon + +sub testStartUpstream_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151042080, + 'maxpos' => 151042081, + 'insseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + 
ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartIntronic_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Intronic + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060460, + 'maxpos' => 151060461, + 'insseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + 
is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getIntronClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->getIntronVariantClass); + done_testing(); + }; + +} +sub testStartSpliceRegion_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Splice Region + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060660, + 'maxpos' => 151060661, + 'insseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + 
my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getSpliceRegionClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-6,332,-5,'-','C','r.332-6_332-5insc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + 
Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-6,1,-5,'-','C','c.1-6_1-5insC',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} +sub testStartEssSplice_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Ess Splice + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060664, + 'maxpos' => 151060665, + 'insseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + 
ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->getEssentialSpliceSiteClass,$a->get5PrimeUtrClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-2,332,-1,'-','C','r.332-2_332-1insc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get5PrimeUtrVariantClass,$a->getEssentialSpliceSiteVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-2,1,-1,'-','C','c.1-2_1-1insC',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get5PrimeUtrVariantClass,$a->getEssentialSpliceSiteVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + 
Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} +sub testStartEssSplice2_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Ess Splice + strand 2' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Insertion->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060665, + 'maxpos' => 151060666, + 'insseq' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::InsertionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->getExonClass,$a->get5PrimeUtrClass); + + 
AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-1,332,0,'-','C','r.332-1_332insc',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getInsertionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-1,1,0,'-','C','c.1-1_1insC',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getInsertionClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} + + # CEP350 protein coding gene with 5 prime utr exons on + strand of genome @@ -5062,7 +5614,7 @@ sub testCDSStartAdjacent_1bp_TOR1AIP2 { sub testCDSStartAdjacent_3bp_TOR1AIP2 { my $file = shift; - subtest 'Testing TOR1AIP2 5 prime UTR 1bp - strand CDS 
start adjacent' => sub { + subtest 'Testing TOR1AIP2 5 prime UTR 3bp - strand CDS start adjacent' => sub { my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); diff --git a/t/substitution.t b/t/substitution.t index 4aa0344..8f16b4d 100755 --- a/t/substitution.t +++ b/t/substitution.t @@ -45,6 +45,7 @@ testIntronic(); testSplice(); testExonic(); testUpStreamDownStream(); +testCDSBoundry(); done_testing(); @@ -261,6 +262,508 @@ sub testExonic { done_testing(); }; } +sub testCDSBoundry { + subtest 'Testing Start ' => sub { + testStartUpstream_GABPB2(); + testStartIntronic_GABPB2(); + testStartSpliceRegion_GABPB2(); + testStartEssSplice_GABPB2(); + + testStartUpstream_OR4F5(); + testStartSpliceRegion_OR4F5(); + testStartEssSplice_OR4F5(); + + done_testing(); + }; + + subtest 'Testing End ' => sub { + + testEndEssSplice_OR4F5(); + testEndSpliceRegion_OR4F5(); + testEndDownstream_OR4F5(); + + done_testing(); + }; +} + +# GABPB2 protein coding gene with both utrs on + strand of genome, start codon is at the start of an exon + +sub testStartUpstream_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151042080, + 'maxpos' => 151042080, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation 
count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartIntronic_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Intronic + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060460, + 'maxpos' => 151060460, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + 
is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getIntronClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->getIntronVariantClass); + done_testing(); + }; + +} +sub testStartSpliceRegion_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Splice Region + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060660, + 'maxpos' => 151060660, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = 
$ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getSpliceRegionClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-6,332,-6,'U','C','r.332-6u>c',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + 
Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-6,1,-6,'T','C','c.1-6T>C',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->getSpliceRegionVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} +sub testStartEssSplice_GABPB2 { + my $file = shift; + + subtest 'Testing GABPB2 5 Prime UTR Ess Splice + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 151060665, + 'maxpos' => 151060665, + 'wt' => 'G', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),3,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context 
annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'has have CDS context annotation'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'has have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass,$a->get5PrimeUtrClass,$a->getEssentialSpliceSiteClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 332,-1,332,-1,'G','C','r.332-1g>c',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->getEssentialSpliceSiteVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine CDS annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getSubstitutionAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffsetSubtype(), + 1,-1,1,-1,'G','C','c.1-1G>C',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->getEssentialSpliceSiteVariantClass,$a->get5PrimeUtrVariantClass); + + AnnotationTestUtils::checkAnnotation('examine Protein annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext()), + 
Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','p.?',$t[0]->getProteinAccession,$t[0]->getProteinAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getUnknownVariantClass); + done_testing(); + }; + +} + +# OR4F5 protein coding gene - single exon, no UTRs, has both start and stop codons, + strand (probably wrong but great for testing) + +sub testStartUpstream_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 5 Prime UTR Upstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 68091, + 'maxpos' => 68091, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + 
$t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartSpliceRegion_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 5 Prime UTR Splice Region + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69085, + 'maxpos' => 69085, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + 
ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testStartEssSplice_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 5 Prime UTR Ess Splice + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 69090, + 'maxpos' => 69090, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context 
annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get2KBUpStreamVariantClass); + done_testing(); + }; + +} +sub testEndEssSplice_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 3 Prime UTR Ess Splice + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70009, + 'maxpos' => 70009, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + 
is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get500BPDownStreamVariantClass); + done_testing(); + }; + +} +sub testEndSpliceRegion_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 3 Prime UTR Splice Region + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 70015, + 'maxpos' => 70015, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); + + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + 
is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get500BPDownStreamVariantClass); + done_testing(); + }; + +} +sub testEndDownstream_OR4F5 { + my $file = shift; + + subtest 'Testing OR4F5 Downstream + strand 1' => sub { + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => AnnotationTestUtils::TRANSCRIPT_CACHE); + + + my $sub = Sanger::CGP::Vagrent::Data::Substitution->new( + 'species' => 'human', + 'genomeVersion' => 'GRCh37', + 'chr' => 1, + 'minpos' => 72015, + 'maxpos' => 72015, + 'wt' => 'T', + 'mt' => 'C',); + + my @t = $ts->getTranscripts($sub); 
+ + my $a = Sanger::CGP::Vagrent::Annotators::SimpleSubstitutionAnnotator->new(transcriptSource => $ts); + + my @res = $a->getAnnotation($sub); + + is(scalar(@res),1,'annotation group count'); + is($res[0]->getType,Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(),'annotation group type - proteincoding'); + is(scalar(@{$res[0]->getAllAnnotations}),1,'annotation count for group'); + ok(defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext())),'has mRNA context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getCDSAnnotationContext())),'doesnt have CDS context annotation'); + ok(!defined($res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getProteinAnnotationContext())),'doesnt have protein context annotation'); + + AnnotationTestUtils::checkAnnotationGroup('examine annotation group in detail',$res[0], + $t[0]->getGeneName,$t[0]->getCCDS,$t[0]->getAccession,$t[0]->getGeneType, + $a->getProteinCodingClass); + + AnnotationTestUtils::checkAnnotation('examine mRNA annotation in detail', + $res[0]->getAnnotationByContext(Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext()), + Sanger::CGP::Vagrent::Data::Annotation::getmRNAAnnotationContext(), + Sanger::CGP::Vagrent::Data::Annotation::getUnknownAnnotationType(), + Sanger::CGP::Vagrent::Data::Annotation::getPositionOffSequenceSubtype(), + 0,0,0,0,'?','?','r.?',$t[0]->getAccession,$t[0]->getAccessionVersion,$t[0]->getDatabase,$t[0]->getDatabaseVersion, + $a->getSubstitutionClass,$a->get5KBDownStreamVariantClass); + done_testing(); + }; + +} + # CEP350 protein coding gene with 5 prime utr exons on + strand of genome diff --git a/testData/test_transcript.cache.gz b/testData/test_transcript.cache.gz index 4588b22..19b7b74 100644 Binary files a/testData/test_transcript.cache.gz and b/testData/test_transcript.cache.gz differ diff --git a/testData/test_transcript.cache.gz.tbi 
b/testData/test_transcript.cache.gz.tbi index 486a61b..ac66ee2 100644 Binary files a/testData/test_transcript.cache.gz.tbi and b/testData/test_transcript.cache.gz.tbi differ diff --git a/testData/test_transcript.fa b/testData/test_transcript.fa index 04aa39a..e9679bb 100644 --- a/testData/test_transcript.fa +++ b/testData/test_transcript.fa @@ -847,3 +847,21 @@ TCTGACAGCTTTATGTACAGCGTATTTTTAGAAAAACTTAAATATACTTCTTTATTTAGG GTTTTATTCTGATGAGCAAGTTTGTGTGTATATGTGTGTATGAGCATTTGTATGTATATA TACTTATACAGATCTATATTATATATACAGTTTTTGTACTATCATTTAAAATAAAAATGT TTCTCAATAAAATGTCAAAGCCGA +>ENST00000335137 +ATGGTGACTGAATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCTA +TTTATGTTGTTTTTTGTATTCTATGGAGGAATCGTGTTTGGAAACCTTCTTATTGTCATA +ACAGTGGTATCTGACTCCCACCTTCACTCTCCCATGTACTTCCTGCTAGCCAACCTCTCA +CTCATTGATCTGTCTCTGTCTTCAGTCACAGCCCCCAAGATGATTACTGACTTTTTCAGC +CAGCGCAAAGTCATCTCTTTCAAGGGCTGCCTTGTTCAGATATTTCTCCTTCACTTCTTT +GGTGGGAGTGAGATGGTGATCCTCATAGCCATGGGCTTTGACAGATATATAGCAATATGC +AAGCCCCTACACTACACTACAATTATGTGTGGCAACGCATGTGTCGGCATTATGGCTGTC +ACATGGGGAATTGGCTTTCTCCATTCGGTGAGCCAGTTGGCGTTTGCCGTGCACTTACTC +TTCTGTGGTCCCAATGAGGTCGATAGTTTTTATTGTGACCTTCCTAGGGTAATCAAACTT +GCCTGTACAGATACCTACAGGCTAGATATTATGGTCATTGCTAACAGTGGTGTGCTCACT +GTGTGTTCTTTTGTTCTTCTAATCATCTCATACACTATCATCCTAATGACCATCCAGCAT +CGCCCTTTAGATAAGTCGTCCAAAGCTCTGTCCACTTTGACTGCTCACATTACAGTAGTT +CTTTTGTTCTTTGGACCATGTGTCTTTATTTATGCCTGGCCATTCCCCATCAAGTCATTA +GATAAATTCCTTGCTGTATTTTATTCTGTGATCACCCCTCTCTTGAACCCAATTATATAC +ACACTGAGGAACAAAGACATGAAGACGGCAATAAGACAGCTGAGAAAATGGGATGCACAT +TCTAGTGTAAAGTTTTAG + diff --git a/testData/test_transcript.fa.fai b/testData/test_transcript.fa.fai index a97b9fb..b1b450e 100644 --- a/testData/test_transcript.fa.fai +++ b/testData/test_transcript.fa.fai @@ -7,3 +7,4 @@ ENST00000339290 3241 21445 60 61 ENST0000037195 9027 24757 60 61 ENST00000367612 7905 33952 60 61 ENST00000368918 8964 42006 60 61 +ENST00000335137 918 51137 60 68