diff --git a/CHANGES.md b/CHANGES.md index b710544..374e0dd 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,11 @@ # CHANGES +## 3.5.0 + +* Loads vagrent cache into IntervalTree to speed up processing by: + * reducing redundant/random disk access + * reducing repeated decompression of same data when events are local to each other + ## 3.4.0 * Add Dockerfile and supporting scripts diff --git a/Dockerfile b/Dockerfile index 49d3142..9661a31 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,7 +52,7 @@ FROM ubuntu:16.04 LABEL maintainer="cgphelp@sanger.ac.uk" \ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \ - version="3.4.0" \ + version="3.5.0" \ description="VAGrENT genome annotation docker" RUN apt-get -yq update diff --git a/Makefile.PL b/Makefile.PL index bf98a2f..c2318d4 100755 --- a/Makefile.PL +++ b/Makefile.PL @@ -55,6 +55,7 @@ WriteMakefile( 'Sort::Key' => '1.33', 'TAP::Harness' => '3.33', 'Try::Tiny' => '0.22', + 'Set::IntervalTree' => '0.12', } ); diff --git a/bin/AnnotateVcf.pl b/bin/AnnotateVcf.pl index e86aa63..ebe5dd8 100755 --- a/bin/AnnotateVcf.pl +++ b/bin/AnnotateVcf.pl @@ -404,9 +404,11 @@ sub make_process_log { sub get_annotator { my $options = shift; + my $sorted = $options->{'sorted'}; + $sorted = 1 if(-e $options->{'input'}.'.tbi'); # creating an EnsemblTranscriptSource using the Ensembl registry - my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}); + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}, 'sorted' => $sorted); # creating an AnnotatorCollection my $annotator = Sanger::CGP::Vagrent::Annotators::AnnotatorCollection->new( @@ -473,7 +475,7 @@ sub option_builder { 'p|process=n' => \$opts{'process'}, 'sp|species=s' => \$opts{'species'}, 'as|assembly=s' => \$opts{'assembly'}, - + 'u|sorted' => \$opts{'sorted'}, ); pod2usage() if($opts{'help'}); @@ -537,4 +539,6 @@ =head1 SYNOPSIS --tabix (-t) bgzip and tabix index the output file (will generate the .gz version of the -o option) + --sorted (-s) Input is sorted - lower memory requirement, automatic if *.tbi found + =cut diff --git a/lib/Sanger/CGP/Vagrent.pm b/lib/Sanger/CGP/Vagrent.pm index 1691f7d..b52c619 100644 --- a/lib/Sanger/CGP/Vagrent.pm +++ b/lib/Sanger/CGP/Vagrent.pm @@ -26,7 +26,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '3.4.0'; +our $VERSION = '3.5.0'; our @EXPORT = qw($VERSION); 1; diff --git a/lib/Sanger/CGP/Vagrent/TranscriptSource/FileBasedTranscriptSource.pm b/lib/Sanger/CGP/Vagrent/TranscriptSource/FileBasedTranscriptSource.pm index d198336..d8a4e4a 100644 --- a/lib/Sanger/CGP/Vagrent/TranscriptSource/FileBasedTranscriptSource.pm +++ b/lib/Sanger/CGP/Vagrent/TranscriptSource/FileBasedTranscriptSource.pm @@ -31,6 +31,7 @@ use Const::Fast qw(const); use Bio::DB::HTS; use Bio::DB::HTS::Tabix; +use Set::IntervalTree; use Sanger::CGP::Vagrent::Data::Transcript; use Sanger::CGP::Vagrent::Data::Exon; @@ -118,9 +119,43 @@ sub _generateLocationString { return $gr->getChr.':'.$gr->getMinPos.'-'.$gr->getMaxPos; } +sub _tabix_to_interval_tree { + my ($self, $chr) = @_; + return 1 if defined $self->{_cache_iTree}->{$chr}; + + my $full_tree = {}; + if ($self->{_sorted}) { + $self->{_cache_iTree} = $full_tree; + } + else { + $full_tree = $self->{_cache_iTree}; + } + + my %collated; + $self->{_cache_tbx} = Bio::DB::HTS::Tabix->new(filename => $self->{_cache}) unless defined $self->{_cache_tbx}; + my $iter = $self->{_cache_tbx}->query_full($chr); + return 1 unless defined $iter; + while(my $line = $iter->next) { + my ($chr, $s, $e, $object) = (split /\t/, $line)[0,1,2,6]; + # +1 on end to convert to 1 bases, tabix module would handle this + my $this_loci = sprintf '%s:%d:%d', $chr, $s, $e+1; + push @{$collated{sprintf '%s:%d:%d', $chr, $s, $e+1}}, $object; + } + + my $chr_tree = Set::IntervalTree->new(); + for my $loci(keys %collated) { + my ($chr, $s, $e) = split ':', $loci; + $chr_tree->insert($collated{$loci}, $s, $e); + delete $collated{$loci}; + } + $full_tree->{$chr} = $chr_tree; + return 1; +} + sub _getTranscriptsFromCache { my ($self,$gp) = @_; - $self->{_cache_tbx} = Bio::DB::HTS::Tabix->new(filename => $self->{_cache}) unless defined $self->{_cache_tbx}; + my $chr = $gp->getChr(); + $self->_tabix_to_interval_tree($chr); my $min; my $max = $gp->getMaxPos + $SEARCH_BUFFER; if($gp->getMinPos() < $SEARCH_BUFFER){ @@ -128,27 +163,39 @@ sub _getTranscriptsFromCache { } else { $min = ($gp->getMinPos - $SEARCH_BUFFER); } - my $iter = $self->{_cache_tbx}->query_full($gp->getChr(),$min,$max); - return undef unless defined $iter; - my $out = undef; - while(my $ret = $iter->next){ - my $raw = (split("\t",$ret))[6]; - my $VAR1; - eval $raw; - $VAR1->{_cdnaseq} = $self->_getTranscriptSeq($VAR1); - push @$out, $VAR1; + my @data = (); + @data = @{$self->{_cache_iTree}->{$chr}->fetch($min,$max)}; + return undef unless(@data); + my @out; + for my $overlap(@data){ + for my $item(@{$overlap}) { + unless(ref $item) { # turn string into object + my $VAR1; + eval $item; + $VAR1->{_cdnaseq} = $self->_getTranscriptSeq($VAR1); + $item = $VAR1; + } + push @out, $item; + } } - return $out; + @out = sort _sort_itree @out; + return \@out; } -sub _init { - my $self = shift; - my %vars = @_; - foreach my $k(keys(%vars)){ - if($k eq 'cache'){ - $self->_setCacheFile($vars{$k}); - } +sub _sort_itree { + if($a->{_genomicminpos} != $b->{_genomicminpos}) { + return $a->{_genomicminpos} <=> $b->{_genomicminpos}; + } + if($a->{_genomicmaxpos} != $b->{_genomicmaxpos}) { + return $a->{_genomicmaxpos} <=> $b->{_genomicmaxpos}; } + return 0; +} + +sub _init { + my ($self, %vars) = @_; + $self->_setCacheFile($vars{'cache'}); + $self->{_sorted} = $vars{'sorted'}; } sub _setCacheFile {