diff --git a/CHANGES.md b/CHANGES.md index a3af7bf..ea4a833 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,6 @@ +### 3.3.0 +* Complete re-write of the reference generation code + ### 3.2.2 * Add bedtools2 to `setup.sh` * Added bedtools2 to `README.md` diff --git a/MANIFEST b/MANIFEST index 954da43..c7d9ac4 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,7 +1,7 @@ .travis.yml -bin/Admin_EnsemblGtf2CacheConverter.pl +bin/Admin_CacheFileBuilder.pl bin/Admin_EnsemblReferenceFileGenerator.pl -bin/Admin_EnsemblTranscriptFastaFilter.pl +bin/Admin_EnsemblTranscriptFilter.pl bin/Admin_GeneRegionBedDumper.pl bin/AnnotateVcf.pl bin/cpanm diff --git a/Makefile.PL b/Makefile.PL index a6042f9..bf98a2f 100755 --- a/Makefile.PL +++ b/Makefile.PL @@ -31,9 +31,9 @@ WriteMakefile( NAME => 'Vagrent', LICENSE => 'agpl_3', # http://search.cpan.org/~dagolden/CPAN-Meta-2.142690/lib/CPAN/Meta/Spec.pm#license VERSION_FROM => 'lib/Sanger/CGP/Vagrent.pm', - EXE_FILES => [qw( bin/Admin_EnsemblGtf2CacheConverter.pl + EXE_FILES => [qw( bin/Admin_CacheFileBuilder.pl bin/Admin_EnsemblReferenceFileGenerator.pl - bin/Admin_EnsemblTranscriptFastaFilter.pl + bin/Admin_EnsemblTranscriptFilter.pl bin/Admin_GeneRegionBedDumper.pl bin/AnnotateVcf.pl)], PREREQ_PM => { diff --git a/README.md b/README.md index 639ebc6..4d650f8 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,9 @@ Please be aware that this expects basic C compilation libraries and tools to be LICENCE ======= -Copyright (c) 2014-2017 Genome Research Ltd. +Copyright (c) 2014-2018 Genome Research Ltd. -Author: Cancer Genome Project +Author: CASM/Cancer IT This file is part of VAGrENT. diff --git a/bin/Admin_CacheFileBuilder.pl b/bin/Admin_CacheFileBuilder.pl new file mode 100644 index 0000000..ddbef18 --- /dev/null +++ b/bin/Admin_CacheFileBuilder.pl @@ -0,0 +1,852 @@ +#!/usr/bin/perl + +##########LICENCE########## +# Copyright (c) 2018 Genome Research Ltd. +# +# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk +# +# This file is part of VAGrENT. +# +# VAGrENT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +##########LICENCE########## + +use strict; +use English qw(-no_match_vars); +use warnings FATAL => 'all'; +use Carp; + +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +use Getopt::Long; +use Pod::Usage; +use Try::Tiny; +use Capture::Tiny qw(capture capture_stderr); +use List::Util qw(first); +use Scalar::Util qw(looks_like_number); + +use Data::Dumper; +use File::Type; +use File::Temp qw(tempdir tempfile); +use File::Spec; +use File::Copy qw(copy); +use Const::Fast qw(const); + +use Bio::DB::HTS::Faidx; +use Bio::SeqIO; +use Bio::Seq; +use Bio::Tools::GFF; + +use Sanger::CGP::Vagrent::Data::Transcript; +use Sanger::CGP::Vagrent::Data::Exon; + +$Data::Dumper::Indent = 0; + +const my @TEXT_TYPES => qw(text/plain application/octet-stream); + +const my @ZIP_TYPES => qw(application/x-gzip); + +const my $TAG_GENE_ID => 'gene_id'; +const my $TAG_TRANSCRIPT_ID => 'transcript_id'; +const my $TAG_PROTEIN_ID => 'protein_id'; +const my $TAG_VERSION => 'version'; +const my $TAG_PARENT => 'Parent'; +const my $TAG_EXON_ORDER_GFF => 'rank'; +const my $TAG_EXON_ORDER_GTF => 'exon_number'; +const my @TAG_EXON_ORDER => ($TAG_EXON_ORDER_GTF,$TAG_EXON_ORDER_GFF); +const my $TAG_GENE_BIOTYPE => 'gene_biotype'; +const my $TAG_BIOTYPE => 'biotype'; +const my @TAG_BIOTYPE => ($TAG_BIOTYPE,$TAG_GENE_BIOTYPE); + +const my $TAG_GENE_NAME_GFF => 'Name'; +const my $TAG_GENE_NAME_GTF => 'gene_name'; +const my @TAG_GENE_NAME => ($TAG_GENE_NAME_GFF,$TAG_GENE_NAME_GTF); + +const my @TAG_CCDS => qw(ccds_id ccdsid); + +const my $PRIMARY_UTR5P => 'five_prime_UTR'; +const my $PRIMARY_CDS => 'CDS'; +const my $PRIMARY_UTR3P => 'three_prime_UTR'; +const my $PRIMARY_STOP => 'stop_codon'; + +const my @MITO_ALIASES => qw(M MT Mito MtDNA); + +const my @PRIMARY_EXON => qw(exon); +const my @PRIMARY_REGIONS => ($PRIMARY_UTR5P,$PRIMARY_CDS,$PRIMARY_STOP,$PRIMARY_UTR3P); + +const my $PROTEIN_CODING => 'protein_coding'; +const my $EXON_TYPE => 'exon'; +const my $CDS_START_TYPE => 'start_codon'; +const my $CDS_STOP_TYPE => 'stop_codon'; +const my $CDS_TYPE => 'CDS'; + +const my $UNZIP => 'gunzip -c %s > %s'; +const my $BGZIP => 'bgzip -c %s > %s'; +const my $SORT_N_BGZIP => 'sort -k 1,1 -k 2n,2 -k 3n,3 %s | bgzip > %s'; +const my $TABIX => 'tabix -p bed %s'; + +const my $SEQ_LOCATION => '%s:%s-%s'; + +const my $OUT_CACHE_FILE => 'vagrent.%s.%s.%s.cache.gz'; +const my $OUT_SEQ_FILE => 'vagrent.%s.%s.%s.fa'; + +my $tmpDir = tempdir("VagrentBuildCacheXXXXX", TMPDIR => 1, CLEANUP => 1); + +try { + my $opts = option_builder(); + my $transList = parseTranscriptList($opts->{'transcripts'}); + my $seqFiles = openSeqFiles($tmpDir,@{$opts->{'f'}}); + my $chrList = parseFaiFile($opts->{'fai'}); + my $ccdsList = parseCCDSfile($opts->{'c'}); + my ($tmp_cache_file,$tmp_seq_file) = processFeatureFile($opts,$tmpDir,$chrList,$seqFiles,$transList,$ccdsList); + finaliseOutputFiles($opts,$tmp_cache_file,$tmp_seq_file); + +} catch { + die "An error occurred while building reference support files\:\n\t$_"; # not $@ +}; + +sub finaliseOutputFiles { + my ($opts,$tmp_cache,$tmp_seq) = @_; + my $cache_file = makeCacheFilePath($opts); + my $seq_file = makeSeqFilePath($opts); + + copy($tmp_seq,$seq_file) or croak("unable to copy sequence file to desination: $!"); + capture_stderr {my $fa_index = Bio::DB::HTS::Faidx->new($seq_file)}; + + + my $snz_cmd = sprintf($SORT_N_BGZIP, $tmp_cache, $cache_file); + my ($snz_stdout, $snz_stderr, $snz_exit) = capture {system($snz_cmd);}; + croak('Error in sorting and bgzipping the cache file: '.$snz_stderr) if $snz_exit > 0; + my $tabix_cmd = sprintf $TABIX, $cache_file; + my ($tbx_stdout, $tbx_stderr, $tbx_exit) = capture {system($tabix_cmd);}; + croak('Error in indexing the cache file: '.$tbx_stderr) if $tbx_exit > 0; +} + +sub makeCacheFilePath { + my $opts = shift; + my $file = sprintf $OUT_CACHE_FILE, $opts->{'sp'},$opts->{'as'},$opts->{'d'}; + return File::Spec->catfile($opts->{'o'},$file); +} + +sub makeSeqFilePath { + my $opts = shift; + my $file = sprintf $OUT_SEQ_FILE, $opts->{'sp'},$opts->{'as'},$opts->{'d'}; + return File::Spec->catfile($opts->{'o'},$file); +} + +sub processFeatureFile { + my ($opts,$tmpDir,$chrList,$seqFiles,$transList,$ccdsList) = @_; + my ($cache_fh, $cache_tmp_file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, SUFFIX => '.cache'); + my ($seq_tmp_fh, $fa_tmp_file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, SUFFIX => '.fa'); + my $seq_fh = Bio::SeqIO->new(-fh => $seq_tmp_fh, -format => 'fasta'); + my $features = openFeatureFile($opts->{'features'}); + my $c = 0; + my $store = undef; + my $gene_names = undef; + my $current_chr = undef; + my $current_chr_translation = undef; + while(my $feat = $features->next_feature()) { + my $chr = $feat->seq_id(); + if(!defined $current_chr){ + $current_chr = $chr; + $current_chr_translation = checkChromosome($current_chr,$chrList); + if (!defined $current_chr_translation){ + print "Sequence $current_chr not found in fai file: no alternative found, skipping\n" if $opts->{'debug'}; + } elsif ($current_chr ne $current_chr_translation){ + print "Sequence $current_chr not found in fai file: using $current_chr_translation as alternative\n" if $opts->{'debug'}; + } + } + next if $current_chr eq $chr && !defined $current_chr_translation; + my $tid = undef; + if($feat->has_tag($TAG_GENE_ID)){ + my ($gene_id) = $feat->get_tag_values($TAG_GENE_ID); + unless(exists $gene_names->{$gene_id}){ + foreach my $tag (@TAG_GENE_NAME){ + next unless $feat->has_tag($tag); + my ($gn) = $feat->get_tag_values($tag); + $gene_names->{$gene_id} = $gn; + last; + } + } + } + if($feat->has_tag($TAG_TRANSCRIPT_ID)){ + ($tid) = $feat->get_tag_values($TAG_TRANSCRIPT_ID); + } else { + if($feat->has_tag($TAG_PARENT)){ + my ($parent) = $feat->get_tag_values($TAG_PARENT); + if($parent =~ m/^transcript:(.+)$/){ + $tid = $1; + } else { + next; + } + } else { + next; + } + } + next unless defined $tid; + if($current_chr ne $chr) { + # chromosome has ended, have to process transcripts + if(scalar(keys %$store) > 0){ + foreach my $obj (@{processCompleteTranscripts($opts,$store,$current_chr_translation,$gene_names,$chrList,$ccdsList,$seqFiles)}){ + writeOutput($obj,$cache_fh,$seq_fh); + } + $store = undef; + } + $current_chr = $chr; + $current_chr_translation = checkChromosome($current_chr,$chrList); + if (!defined $current_chr_translation){ + print "Sequence $current_chr not found in fai file: no alternative found, skipping\n" if $opts->{'debug'}; + } elsif ($current_chr ne $current_chr_translation){ + print "Sequence $current_chr not found in fai file: using $current_chr_translation as alternative\n" if $opts->{'debug'}; + } + next if !defined $current_chr_translation; + } + + if(exists $store->{$tid}){ + push @{$store->{$tid}}, $feat; + } elsif(exists $transList->{$tid}) { + push @{$store->{$tid}}, $feat; + $c++; + } else { + next; + } + } + if(scalar(keys %$store) > 0){ + foreach my $obj (@{processCompleteTranscripts($opts,$store,$current_chr_translation,$gene_names,$chrList,$ccdsList,$seqFiles)}){ + writeOutput($obj,$cache_fh,$seq_fh); + } + } + close $cache_fh; + close $seq_tmp_fh; + return ($cache_tmp_file,$fa_tmp_file); +} + +sub writeOutput { + my ($trans,$cache,$fa) = @_; + my $seq = Bio::Seq->new(-display_id => $trans->getAccession(),-seq => $trans->getcDNASeq); + my ($e) = $trans->getExons(); + my $cache_row; + try{ + $cache_row = join("\t",$e->getChr,$trans->getGenomicMinPos - 1,$trans->getGenomicMaxPos,$trans->getAccession,$trans->getGeneName,length $trans->getcDNASeq); + } catch { + croak('An error occured outputting transcript '.$trans->getAccession.', please check input data'); + }; + $trans->{_cdnaseq} = undef; + $cache_row .= "\t".Dumper($trans)."\n"; + + $fa->write_seq($seq); + print $cache $cache_row; +} + +sub processCompleteTranscripts { + my ($opts,$store,$chr,$gene_names,$chrList, $ccdsList,$seqFiles) = @_; + my @out = (); + foreach my $trans_id (keys %$store){ + my $converted = convertTranscript($opts,$store->{$trans_id},$trans_id,$gene_names,$ccdsList,$seqFiles,$chr); + push(@out,$converted) if defined $converted; + } + return \@out; +} + +sub convertTranscript { + my ($opts,$raw_transcript,$transcript_id,$gene_names,$ccdsList,$seqFiles,$chr) = @_; + my $type = getGeneTypeForTranscript($opts,$raw_transcript); + return undef unless defined $type; + my $rnaLengthSum = 0; + my $strand = undef; + my @exons; + + my @raw_exons; + my @raw_cds; + my $gene_id = undef; + my $transcript_version = 1; + foreach my $e(@$raw_transcript){ + if($e->has_tag($TAG_TRANSCRIPT_ID) && $e->has_tag($TAG_VERSION)){ + ($transcript_version) = $e->get_tag_values($TAG_VERSION); + } + unless(defined $gene_id){ + if($e->has_tag($TAG_GENE_ID)){ + ($gene_id) = $e->get_tag_values($TAG_GENE_ID); + } elsif ($e->has_tag($TAG_PARENT)){ + my ($parent) = $e->get_tag_values($TAG_PARENT); + my ($pt,$pv) = split ':',$parent; + if($pt eq 'gene'){ + $gene_id = $pv; + } + } + } + push @raw_exons, $e if first { $_ eq $e->primary_tag } @PRIMARY_EXON; + push @raw_cds, $e if first { $_ eq $e->primary_tag } @PRIMARY_REGIONS; + } + return undef if scalar(@raw_exons) == 0; + return undef if scalar(@raw_cds) == 0 && $type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(); + my $gene_name; + if(exists $gene_names->{$gene_id} && defined $gene_names->{$gene_id}){ + $gene_name = $gene_names->{$gene_id}; + } else { + $gene_name = $gene_id + } + @raw_exons = sort featureOrderSortFunction @raw_exons; # sorting exons into transcript order + @raw_cds = sort featureOrderSortFunction @raw_cds; # sorting CDS fragments into transcript order + foreach my $e (@raw_exons){ + my $rmin = $rnaLengthSum + 1; + my $rmax = $rnaLengthSum + $e->length; + $rnaLengthSum += $e->length; + my $convE = Sanger::CGP::Vagrent::Data::Exon->new( + species => $opts->{'sp'}, + genomeVersion => $opts->{'as'}, + chr => $chr, + minpos => $e->start, + maxpos => $e->end, + rnaminpos => $rmin, + rnamaxpos => $rmax,); + + if(defined $strand){ + croak('Inconsistant strand in transcript') unless $strand == $e->strand; + } else { + $strand = $e->strand; + } + push @exons, $convE; + } + my ($protAcc,$protAccVer,$cdsMin,$cdsMax,$cdsPhase,$ccds); + if($type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType()){ + ($cdsMin,$cdsMax,$cdsPhase) = calculateCDS($opts,\@raw_cds,\@raw_exons, $rnaLengthSum, $strand,$transcript_id); + return undef unless(defined $cdsMin && defined $cdsMax && defined $cdsPhase); + $ccds = getCCDS($raw_transcript,$transcript_id,$ccdsList); + foreach my $cds(@raw_cds){ + if($cds->has_tag($TAG_PROTEIN_ID)){ + ($protAcc) = $cds->get_tag_values($TAG_PROTEIN_ID); + last; + } + } + $protAccVer = 1; + + } else { + # non-coding + $protAcc = undef; + $protAccVer = undef; + $cdsMin = undef; + $cdsMax = undef; + $cdsPhase = -1; + } + + my @sortedExons = sort {$a->getMinPos <=> $b->getMinPos} @exons; + my $transcript = Sanger::CGP::Vagrent::Data::Transcript->new( + db => 'Ensembl', + dbversion => $opts->{'d'}, + acc => $transcript_id, + accversion => $transcript_version, + proteinacc => $protAcc, + proteinaccversion => $protAccVer, + ccds => $ccds, + genename => $gene_name, + genetype => $type, + strand => $strand, + cdnaseq => getSequence($transcript_id,$transcript_version,$seqFiles), + cdsminpos => $cdsMin, + cdsmaxpos => $cdsMax, + cdsphase => $cdsPhase, + genomicminpos => $sortedExons[0]->getMinPos, + genomicmaxpos => $sortedExons[-1]->getMaxPos, + exons => \@exons,); + + return $transcript; +} + +sub getSequence { + my ($id,$version,$seqFiles) = @_; + my $out = undef; + my $idv = join('.',$id,$version); + foreach my $fai(@$seqFiles){ + my $loc; + if($fai->has_sequence($id)){ + my $length = $fai->length($id); + $loc = sprintf($SEQ_LOCATION,$id,1,$length); + } elsif ($fai->has_sequence($idv)){ + my $length = $fai->length($idv); + $loc = sprintf($SEQ_LOCATION,$idv,1,$length); + } else { + next; + } + $out = $fai->get_sequence_no_length($loc); + } + return $out; +} + +sub getCCDS { + my ($raw_transcript,$transcript_id,$ccdsList) = @_; + my $ccds = undef; + foreach my $feat(@$raw_transcript){ + foreach my $tag(@TAG_CCDS){ + if($feat->has_tag($tag)){ + ($ccds) = $feat->get_tag_values($tag); + last; + } + } + last if defined $ccds; + } + unless(defined $ccds){ + if(defined $ccdsList){ + if(exists $ccdsList->{$transcript_id}){ + $ccds = $ccdsList->{$transcript_id}; + } + } + } + return $ccds; +} + +sub calculateCDS { + my ($opts,$feats,$exons,$rnaLength, $strand,$transcript_id) = @_; + my $utr5Length = 0; + my $cdsLength = 0; + my $utr3Length = 0; + my $stopCodonLength = 0; + my @cds; + foreach my $f (@$feats){ + if($f->primary_tag eq $PRIMARY_UTR5P){ + $utr5Length += $f->length; + } elsif($f->primary_tag eq $PRIMARY_CDS){ + $cdsLength += $f->length; + push @cds, $f; + } elsif($f->primary_tag eq $PRIMARY_UTR3P){ + $utr3Length += $f->length; + } elsif($f->primary_tag eq $PRIMARY_STOP){ + $stopCodonLength += $f->length; + } + } + my $utr5Length_e = 0; + my $cdsLength_e = 0; + my $utr3Length_e = 0; + + my $cds_first = $cds[0]; + my $cds_last = $cds[-1]; + + my $break = 0; + my $mess = ''; + + foreach my $e(@$exons){ + croak('Recieved undefined exon: '.Dumper($exons)) unless defined($e); + # remember start = min, end = max. start and end DO NOT infer orientation in BIOPERL + if($strand > 0){ + # forward strand start = min = start of feature, end = max = end of feature + if($e->end < $cds_first->start){ + # exon ends before the CDS starts + # 5 prime UTR exon + $utr5Length_e += $e->length; + } elsif ($e->end < $cds_last->start && $e->start <= $cds_first->start && $e->end >= $cds_first->start){ + # exon ends before last CDS fragment starts + # and first CDS fragment starts within this exon. + $utr5Length_e += $cds_first->start - $e->start; + $cdsLength_e += $e->length - ($cds_first->start - $e->start); + } elsif ($e->start >= $cds_first->start && $e->end <= $cds_last->end){ + # exon starts after the first CDS fragment starts + # and exon ends before the last CDS fragment ends + # exon is entirely within the CDS + $cdsLength_e += $e->length; + } elsif ($cds_first->start == $cds_last->start && $cds_first->end == $cds_last->end && $e->start <= $cds_first->start && $e->end >= $cds_last->end){ + # single CDS fragment contained entirely within this exon + $utr5Length_e += $cds_first->start - $e->start; + $cdsLength_e += $e->length - (($cds_first->start - $e->start) + ($e->end - $cds_last->end)); + $utr3Length_e += $e->end - $cds_last->end; + } elsif ($e->start > $cds_first->end && $e->start <= $cds_last->end && $e->end >= $cds_last->end){ + # exon starts after the first CDS fragment ends + # and the last CDS fragment ends within this exon + $cdsLength_e += $e->length - ($e->end - $cds_last->end); + $utr3Length_e += $e->end - $cds_last->end; + } elsif ($e->start > $cds_last->end){ + # exon starts after the CDS ends + $utr3Length_e += $e->length; + } else { + warn "\n",join(' - ',$utr5Length,$cdsLength,$stopCodonLength,$utr3Length),"\n"; + warn join(' - ',$utr5Length_e,$cdsLength_e,0,$utr3Length_e),"\n"; + warn "\n"; + foreach (@cds){warn "CDS : ",Dumper($_),"\n"}; + warn "\n"; + foreach (@$exons){warn "EXON : ",Dumper($_),"\n"}; + warn "\n"; + foreach ($cds_first,$cds_last){warn "CDS BOUNDRIES : ",Dumper($_),"\n"}; + warn "\n"; + warn "PROC : ",Dumper($e),"\n"; + croak('unhandled exon'); + } + + + } elsif($strand < 0){ + # reverse strand start = min = end of feature, end = max = start of feature + if($e->start > $cds_first->end){ + # exon ends (start) before the CDS starts (end) + # 5 prime UTR exon + $utr5Length_e += $e->length; + } elsif ($e->start > $cds_last->end && $e->end >= $cds_first->end && $e->start <= $cds_first->end){ + # exon ends (start) before last CDS fragment starts (end) + # and first CDS fragment starts (end) within this exon + $utr5Length_e += $e->end - $cds_first->end; + $cdsLength_e += $e->length - ($e->end - $cds_first->end); + } elsif ($e->end <= $cds_first->end && $e->start >= $cds_last->start){ + # exon starts (end) after the first CDS fragment starts (end) + # and exon ends (start) before the last CDS fragment ends (start) + # exon is entirely within the CDS + $cdsLength_e += $e->length; + } elsif ($cds_first->end == $cds_last->end && $cds_first->start == $cds_last->start && $e->end >= $cds_first->end && $e->start <= $cds_last->start){ + # single CDS fragment contained entirely within this exon + $utr5Length_e += $e->end - $cds_first->end; + $cdsLength_e += $e->length - (($e->end - $cds_first->end) + ($cds_last->start - $e->start)); + $utr3Length_e += $cds_last->start - $e->start; + } elsif ($e->end < $cds_first->start && $e->end >= $cds_last->start && $e->start <= $cds_last->start){ + # exon starts (end) after the first CDS fragment ends (start) + # and the last CDS fragment ends in this exon + $cdsLength_e += $e->length - ($cds_last->start - $e->start); + $utr3Length_e += $cds_last->start - $e->start; + } elsif ($e->end < $cds_last->start){ + # exons starts (end) after CDS ends (start) + $utr3Length_e += $e->length; + } else { + warn "\n",join(' - ',$utr5Length,$cdsLength,$stopCodonLength,$utr3Length),"\n"; + warn join(' - ',$utr5Length_e,$cdsLength_e,0,$utr3Length_e),"\n"; + warn "\n"; + foreach (@cds){warn "CDS : ",Dumper($_),"\n"}; + warn "\n"; + foreach (@$exons){warn "EXON : ",Dumper($_),"\n"}; + warn "\n"; + foreach ($cds_first,$cds_last){warn "CDS BOUNDRIES : ",Dumper($_),"\n"}; + warn "\n"; + warn "PROC : ",Dumper($e),"\n"; + croak('unhandled exon'); + } + } else { + croak('Cannot process strand: ',$strand); + } + } + + # length calculation double check + + if($utr5Length + $utr3Length > 0){ + # UTR features present in input file, check all 3 values, skip if data is inconsistant + return (undef,undef,undef) unless($utr5Length == $utr5Length_e && $cdsLength == $cdsLength_e && $utr3Length == $utr3Length_e); + } else { + # only compare the CDS length, skip if data is inconsistant + return (undef,undef,undef) unless($cdsLength == $cdsLength_e); + } + + my $cds_min = $utr5Length_e + 1; + my $cds_max = $utr5Length_e + $cdsLength_e + $stopCodonLength; + my $cds_phase = undef; + if(looks_like_number($cds_first->frame)){ + if($cds_first->frame > 0){ + $cds_phase = 3 - $cds_first->frame; + } else { + $cds_phase = 0; + } + } else { + return (undef,undef,undef); + } + return ($cds_min,$cds_max,$cds_phase); +} + +sub getGeneTypeForTranscript { + my ($opts,$t) = @_; + + my $type = undef; + my $type_tag = undef; + + foreach my $f(@$t){ + foreach my $tag(@TAG_BIOTYPE){ + if($f->has_tag($tag)){ + my ($raw_type) = $f->get_tag_values($tag); + if(!defined $type){ + $type = $raw_type; + $type_tag = $tag; + } elsif($type ne $raw_type){ + croak('Transcript has inconsistant biotype'); + } + } + } + } + if($type eq 'protein_coding'){ + return Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(); + } elsif($type eq 'miRNA'){ + return Sanger::CGP::Vagrent::Data::Transcript::getMicroRnaType(); + } elsif($type eq 'lincRNA'){ + return Sanger::CGP::Vagrent::Data::Transcript::getLincRnaType(); + } elsif($type eq 'snoRNA'){ + return Sanger::CGP::Vagrent::Data::Transcript::getSnoRnaType(); + } elsif($type eq 'snRNA'){ + return Sanger::CGP::Vagrent::Data::Transcript::getSnRnaType(); + } elsif($type eq 'rRNA'){ + return Sanger::CGP::Vagrent::Data::Transcript::getRRnaType(); + } else { + if($type_tag eq $TAG_GENE_BIOTYPE){ + # this can happen, there can be a discrepancy between gene and transcript biotype in ensembl. + # If they are only reporting the gene biotype and its not one we can handle we have to skip the transcript. + return undef; + } else { + if($opts->{'debug'}){ + foreach (@$t){ + warn Dumper($_),"\n"; + } + } + croak("Unhandled biotype: $type"); + } + } +} + +sub featureOrderSortFunction { + my $tag = undef; + + if($a->strand != $b->strand){ + croak('Features strands not consistent'); + } + + if($a->strand == 1 && $a->strand == $b->strand){ + return $a->start <=> $b->start; + } elsif($a->strand == -1 && $a->strand == $b->strand){ + return $b->start <=> $a->start; + } else { + croak('unknown strand type: ',$a->strand); + } +} + +sub checkChromosome { + my ($chr,$chrList) = @_; + my $use_chr = undef; + if(first { $_ eq $chr } @$chrList){ + $use_chr = $chr; + } else { + # chromosome not in fai file + # lets try chr prefix discrepancies + if($chr =~ m/^chr(.+)$/){ + # chromosome starts with chr, try removing and search again. + $use_chr = $1 if first { $_ eq $1 } @$chrList; + } elsif($chr !~ m/^chr/){ + # chromosome doesn't start with chr, try adding + $use_chr = 'chr'.$chr if first { $_ eq 'chr'.$chr } @$chrList; + } + if(!defined $use_chr){ + # still not found a translation + # check for mitochondria, its a special case + my $mito_check; + if($chr =~ m/^chr(.+)$/){ + $mito_check = $1; + } else { + $mito_check = $chr; + } + if(first { $_ eq $mito_check } @MITO_ALIASES){ + # this is mitochondria + # check fai file for aliases, with and without chr prefixes. + foreach my $m(@MITO_ALIASES){ + my $chrM = 'chr'.$m; + if(first { $_ eq $m } @$chrList){ + $use_chr = $m; + last; + } elsif (first { $_ eq $chrM } @$chrList){ + $use_chr = $chrM; + last; + } + } + } + } + } + return $use_chr; +} + +sub openFeatureFile { + my $file = shift; + my $fileType = File::Type->new->checktype_filename($file); + my $fh; + if(first { $_ eq $fileType } @ZIP_TYPES){ + # feature file is zipped + open $fh, "zcat $file |" or croak("unable to open feature input file: $file"); + } elsif(first { $_ eq $fileType } @TEXT_TYPES){ + # feature file is plain text + open $fh, "<$file" or croak("unable to open feature input file: $file"); + } else { + croak('Unsupported file type: '.$fileType); + } + my $gff_version; + if($file =~ m/\.gtf(?:\.gz)?/){ + $gff_version = 2.5; + } elsif($file =~ m/\.gff3(?:\.gz)?/){ + $gff_version = 3; + } else { + croak('Unable to determine format of '.$file); + } + + return Bio::Tools::GFF->new(-fh => $fh, -gff_version => $gff_version); +} + +sub openSeqFiles { + my ($tmpDir,@files) = @_; + my $out; + foreach my $file (@files){ + my $fileType = File::Type->new->checktype_filename($file); + if(first { $_ eq $fileType } @ZIP_TYPES){ + # File zipped, going to have to re-zip it with bgzip + my $unzipped = makeTmpFilePath($tmpDir,'.fa'); + my $zipped = makeTmpFilePath($tmpDir,'.fa.gz'); + my $unzip_cmd = sprintf($UNZIP, $file, $unzipped); + my $bgzip_cmd = sprintf($BGZIP, $unzipped, $zipped); + my ($uz_stdout, $uz_stderr, $uz_exit) = capture {system($unzip_cmd);}; + croak($uz_stderr) if $uz_exit > 0; + my ($bgz_stdout, $bgz_stderr, $bgz_exit) = capture {system($bgzip_cmd);}; + croak($bgz_stderr) if $bgz_exit > 0; + + $file = $zipped; + } elsif(first { $_ eq $fileType } @TEXT_TYPES){ + # plain text file, needs zipping + my $zipped = makeTmpFilePath($tmpDir,'.fa.gz'); + my $bgzip_cmd = sprintf($BGZIP, $file, $zipped); + my ($bgz_stdout, $bgz_stderr, $bgz_exit) = capture {system($bgzip_cmd);}; + croak($bgz_stderr) if $bgz_exit > 0; + $file = $zipped; + } else { + croak('Unsupported file type: '.$fileType) + } + # HTS call prints to stderr if it has to generate an fai file, which it will here + # feels dirty but this makes it quiet. + capture_stderr {push @$out, Bio::DB::HTS::Faidx->new($file)}; + } + return $out; +} + +sub parseCCDSfile { + my $file = shift; + return undef unless defined $file; + open my $fh, "<$file" or croak("unable to open CCDS file: ".$file); + my @data = <$fh>; + close $fh; + my $out; + foreach my $d(@data){ + next if $d !~ m/^CCDS\d/; + my @cols = split /\s+/,$d; + my $ccds = $cols[0]; + my $transcript_id = $cols[4]; + $out->{$transcript_id} = $ccds; + } + return $out; +} + +sub parseFaiFile { + my $fai = shift; + return undef unless defined $fai; + open my $fh, "<$fai" or croak("unable to open fai file: ".$fai); + my @data = <$fh>; + close $fh; + my @out; + foreach my $d(@data){ + my ($name) = split /\s+/,$d; + push @out, $name; + } + return \@out; +} + +sub parseTranscriptList { + my $file = shift; + open my $fh, "<$file" or croak('Unable to open transcript list file: '.$file); + my $out; + while(<$fh>){ + my $line = $_; + chomp $line; + $out->{$line} = 1; + } + close $fh; + return $out; +} + +sub makeTmpFilePath { + my ($tmpDir,$ext) = @_; + my $file; + (undef, $file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, OPEN => 0, SUFFIX => $ext); + return $file; +} + +sub option_builder { + my ($factory) = @_; + my %opts = (); + my $result = &GetOptions ( + 'h|help' => \$opts{'h'}, + 'f|fasta=s@' => \$opts{'f'}, + 'gf|features=s' => \$opts{'features'}, + 't|transcripts=s' => \$opts{'transcripts'}, + 'o|output=s' => \$opts{'o'}, + 'sp|species=s' => \$opts{'sp'}, + 'as|assembly=s' => \$opts{'as'}, + 'd|database=s' => \$opts{'d'}, + 'c|ccds=s' => \$opts{'c'}, + 'fai|fai=s' => \$opts{'fai'}, + 'debug|debug' => \$opts{'debug'}, + ); + + pod2usage() if($opts{'h'}); + if(scalar(@{$opts{'f'}}) > 0){ + foreach my $f(@{$opts{'f'}}){ + pod2usage("Unable to read the transcript sequence file: $f") unless -e $f && -r $f; + } + } else { + pod2usage('Must specify one or more fasta sequence file as input'); + } + + pod2usage('Must specify the input gtf/gff3 reference file') unless defined $opts{'features'} && -e $opts{'features'}; + pod2usage('Must specify transcript list file') unless defined $opts{'transcripts'} && -e $opts{'transcripts'}; + pod2usage('Must specify the output directory') unless defined $opts{'o'} && -e $opts{'o'} && -d $opts{'o'} ; + pod2usage('Must specify the species') unless defined $opts{'sp'}; + pod2usage('Must specify the genome version') unless defined $opts{'as'}; + pod2usage('Must specify the data version') unless defined $opts{'d'}; + if(defined($opts{'c'})){ + pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'}; + } + if(defined($opts{'fai'})){ + pod2usage('Unable to read the fai file: '.$opts{'fai'}) unless -e $opts{'fai'} && -r $opts{'fai'}; + } + return \%opts; +} + + + +__END__ + +=head1 NAME + +Admin_CacheFileBuilder.pl - Generates the Vagrent reference data set from the supplied GFF£/GTF and Fasta files + +=head1 SYNOPSIS + +Admin_CacheFileBuilder.pl [-h] [-f /path/to/sequence.fa] [-gff /path/to/annotation.gff] [-sp human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-c /path/to/CCDS2Sequence.version.txt] + + Required Options: + + --output (-o) Output directory + + --fasta (-f) Fasta file containing Transcript sequences, can be specified multiple times. + + --features (-gf) GFF3 or GTF file containing genomic annotation + + --transcripts (-t) Transcript accession file, (simple list, one accession per line) + + --species (-sp) Species (ie human, mouse) + + --assembly (-as) Assembly version (ie GRCh37, GRCm38) + + --data_version (-d) Reference version number (Ensemble example: homo_sapiens_core_74_37p) + + Other Options: + + --help (-h) Brief documentation + + --ccds (-c) (Recommended) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS + + --fai (-fai) (Recommended) The samtools fasts index file (.fai) for your reference genome + This is the reference genome that your bam and vcf files will be mapped to + + --debug (-debug) This turns on the debug output, useful if you are having issues with a specific Ensembl build +=cut \ No newline at end of file diff --git a/bin/Admin_EnsemblGtf2CacheConverter.pl b/bin/Admin_EnsemblGtf2CacheConverter.pl deleted file mode 100755 index 83d9e3c..0000000 --- a/bin/Admin_EnsemblGtf2CacheConverter.pl +++ /dev/null @@ -1,400 +0,0 @@ -#!/usr/bin/perl - -##########LICENCE########## -# Copyright (c) 2014 Genome Research Ltd. -# -# Author: Cancer Genome Project cgpit@sanger.ac.uk -# -# This file is part of VAGrENT. -# -# VAGrENT is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -##########LICENCE########## - - -use strict; -use English qw(-no_match_vars); -use warnings FATAL => 'all'; -use Carp; - -use FindBin qw($Bin); -use lib "$Bin/../lib"; - -use Getopt::Long; -use Pod::Usage; -use Try::Tiny; - -use Data::Dumper; -use File::Type; -use Const::Fast qw(const); - -use Bio::DB::HTS; - -use Sanger::CGP::Vagrent::Data::Transcript; -use Sanger::CGP::Vagrent::Data::Exon; - -$Data::Dumper::Indent = 0; - -const my @TEXT_TYPES => qw(application/octet-stream); - -const my @ZIP_TYPES => qw(application/x-gzip); - -const my $PROTEIN_CODING => 'protein_coding'; -const my $EXON_TYPE => 'exon'; -const my $CDS_START_TYPE => 'start_codon'; -const my $CDS_STOP_TYPE => 'stop_codon'; -const my $CDS_TYPE => 'CDS'; - -try { - my $opts = option_builder(); - my $ccds = undef; - my $lookup = parseFaiFile($opts->{'fai'}); - $ccds = parseCCDSFile($opts->{'c'}) if defined $opts->{'c'}; - convertGtf($opts,$lookup,$ccds); -} catch { - die "An error occurred while building reference support files\:\n\t$_"; # not $@ -}; - -sub convertGtf { - my ($opts,$lookup,$ccds) = @_; - my $type = getInputFileType($opts->{'g'}); - my $fasta = Bio::DB::HTS::Fai->load($opts->{'f'}); - my ($inFh,$outFh); - if($type eq 'text') { - open $inFh, '<'.$opts->{'g'} || croak("unable to open gtf input file:".$opts->{'g'}); - } elsif($type eq 'zip'){ - open $inFh, 'zcat '.$opts->{'g'}.' |' || croak("unable to open gtf input file:".$opts->{'g'}); - } - open $outFh, ">".$opts->{'o'} || croak("unable to open output file:".$opts->{'o'}); - my $c = 0; - my $wip; - my $currentChr = undef; - my $lc = 0; - while(<$inFh>){ - $lc++; - next if m/^#/; - my $line = $_; - chomp $line; - - my $acc; - - my ($chr,$bioType,$lineType, $start, $end, $strand, $frame, $tmpAttr, %attr); - - ($chr,$bioType,$lineType, $start, $end, undef, $strand, $frame, $tmpAttr) = split /\t/, $line; - - foreach my $t(split /\; /, $tmpAttr){ - my ($k,$v) = split /\s/,$t,2; - $attr{$k} = $v; - } - - if(exists $attr{'transcript_biotype'} && defined $attr{'transcript_biotype'} ){ - $bioType = unquoteValue($attr{'transcript_biotype'}); - } - - if(exists $attr{'transcript_id'} && defined $attr{'transcript_id'}){ - $acc = unquoteValue($attr{'transcript_id'}); - unless(exists $lookup->{$acc}) { - next unless(exists $attr{'transcript_version'}); - $acc .= '.'.unquoteValue($attr{'transcript_version'}); - next unless exists $lookup->{$acc}; - } - } else { - next; - } - $currentChr = $chr unless defined $currentChr; - if($currentChr ne $chr) { - # changed chr, any transcript on previous chr must be finished - foreach my $t (processCompleted($opts,$fasta,$lookup,$ccds,$wip,$currentChr)){ - writeTranscript($outFh,$t,$wip->{$t->getAccession}); - delete $wip->{$t->getAccession}; - } - $currentChr = $chr; - } - my @store = ($chr,$start,$end,$strand,$frame,unquoteValue($attr{'exon_number'})); - push(@{$wip->{$acc}->{'lines'}->{$lineType}},\@store); - unless(exists $wip->{$acc}->{'type'}){ - $c++; - $wip->{$acc}->{'type'} = $bioType; - $wip->{$acc}->{'acc'} = $acc; - if(exists $attr{'gene_name'}) { - $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_name'}); - } - elsif(exists $attr{'gene_id'}) { - $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_id'}); - } - else { - croak "Cannot identify gene name or ID for structure: ".Dumper(\%attr); - } - $wip->{$acc}->{'CCDS'} = unquoteValue($attr{'ccds_id'}) if exists $attr{'ccds_id'}; - } - if($lineType eq $CDS_TYPE && !defined $wip->{$acc}->{'protacc'}){ - $wip->{$acc}->{'protacc'} = unquoteValue($attr{'protein_id'}); - } - } - foreach my $t (processCompleted($opts,$fasta,$lookup,$ccds,$wip,$currentChr)){ - writeTranscript($outFh,$t,$wip->{$t->getAccession}); - delete $wip->{$t->getAccession}; - } - close $outFh; - close $inFh; -} - -sub processCompleted { - my ($opts,$fasta,$lookup,$ccds,$wip,$currentChr) = @_; - my @trans; - foreach my $acc (keys %$wip){ - #warn Dumper($wip->{$acc}); - if(exists $wip->{$acc}->{'lines'}->{$EXON_TYPE} && $wip->{$acc}->{'lines'}->{$EXON_TYPE}->[0]->[0] eq $currentChr){ - my $ccdsId = undef; - if(exists $ccds->{$acc} && defined $ccds->{$acc}){ - $ccdsId = $ccds->{$acc}; - } elsif(exists $wip->{$acc}->{'CCDS'} && defined $wip->{$acc}->{'CCDS'}){ - $ccdsId = $wip->{$acc}->{'CCDS'}; - } - my $t = convertTranscript($opts,$fasta,$lookup->{$acc},$ccdsId,$wip->{$acc}); - push @trans, $t; - } - } - return @trans; -} - -sub convertTranscript { - my ($opts,$fasta,$tlength,$ccds,$data) = @_; - my $type = getGeneTypeForTranscript($data->{'type'}); - my @exons; - my $strand = undef; - my $rnaLengthSum = 0; - my @sortedExonLines = sort {$a->[5] <=> $b->[5]} @{$data->{'lines'}->{$EXON_TYPE}}; - foreach my $rawE (@sortedExonLines){ - my $elength = ($rawE->[2] - $rawE->[1]) + 1; - my $rmin = $rnaLengthSum + 1; - my $rmax = $rnaLengthSum + $elength; - $rnaLengthSum += $elength; - my $convE = Sanger::CGP::Vagrent::Data::Exon->new( - species => $opts->{'sp'}, - genomeVersion => $opts->{'as'}, - chr => $rawE->[0], - minpos => $rawE->[1], - maxpos => $rawE->[2], - rnaminpos => $rmin, - rnamaxpos => $rmax,); - - unless(defined $strand){ - if($rawE->[3] eq '+'){ - $strand = 1; - } elsif($rawE->[3] eq '-'){ - $strand = -1; - } else { - croak "Expecting strand of + or -, recieved: ".$rawE->[3]; - } - } - push(@exons,$convE); - } - - my ($protAcc,$protAccVer,$cdsMin,$cdsMax,$cdsPhase); - - if($type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType()){ - # protein coding - $protAcc = $data->{'protacc'}; - $protAccVer = 1; - my $cdsLength = 0; - my @sortedCdsLines = sort {$a->[5] <=> $b->[5]} @{$data->{'lines'}->{$CDS_TYPE}}; - foreach my $cds (@sortedCdsLines){ - $cdsLength += ($cds->[2] - $cds->[1]) + 1; - } - # find CDS start - my $fivePrimeUtrExonLength = 0; - foreach my $e (@sortedExonLines){ - if($sortedCdsLines[0]->[5] eq $e->[5]){ - if($strand > 0){ - $cdsMin = $fivePrimeUtrExonLength + (($sortedCdsLines[0]->[1] - $e->[1]) + 1); - $cdsMax = ($cdsMin + $cdsLength) - 1; - } else { - $cdsMin = $fivePrimeUtrExonLength + (($e->[2] - $sortedCdsLines[0]->[2]) + 1); - $cdsMax = ($cdsMin + $cdsLength) - 1; - } - last; - } else { - $fivePrimeUtrExonLength += ($e->[2] - $e->[1]) + 1; - } - } - $cdsMax += 3 if exists $data->{'lines'}->{$CDS_STOP_TYPE}; - $cdsPhase = 0; - if($sortedCdsLines[0]->[4] > 0){ - $cdsPhase = 3 - $sortedCdsLines[0]->[4]; - } - } else { - # non-coding - $protAcc = undef; - $protAccVer = undef; - $cdsMin = undef; - $cdsMax = undef; - $cdsPhase = -1; - } - - my @sortedExons = sort {$a->getMinPos <=> $b->getMinPos} @exons; - my $convT = Sanger::CGP::Vagrent::Data::Transcript->new( - db => 'Ensembl', - dbversion => $opts->{'d'}, - acc => $data->{'acc'}, - accversion => 1, - proteinacc => $protAcc, - proteinaccversion => $protAccVer, - ccds => $ccds, - genename => $data->{'gene'}, - genetype => $type, - strand => $strand, - cdnaseq => $fasta->seq($data->{'acc'},1,$tlength), - cdsminpos => $cdsMin, - cdsmaxpos => $cdsMax, - cdsphase => $cdsPhase, - genomicminpos => $sortedExons[0]->getMinPos, - genomicmaxpos => $sortedExons[-1]->getMaxPos, - exons => \@exons,); - return $convT; -} - -sub writeTranscript { - my ($fh,$t,$rawT) = @_; - eval { - print $fh join("\t",$rawT->{'lines'}->{$EXON_TYPE}->[0]->[0],$t->getGenomicMinPos - 1, - $t->getGenomicMaxPos,$t->getAccession,$t->getGeneName,length $t->getcDNASeq); - 1; - }; - if($@) { - die "\nTranscript Object: ".Dumper($t)."\n\nExon_Type layer: ".Dumper($rawT->{'lines'}->{$EXON_TYPE})."\n\nERROR: Abandon hope, Ensemble structure has changed\n"; - } - $t->{_cdnaseq} = undef; - print $fh "\t",Dumper($t),"\n"; -} - -sub getGeneTypeForTranscript { - my ($t) = @_; - if($t eq 'protein_coding'){ - return Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType(); - } elsif($t eq 'miRNA'){ - return Sanger::CGP::Vagrent::Data::Transcript::getMicroRnaType(); - } elsif($t eq 'lincRNA'){ - return Sanger::CGP::Vagrent::Data::Transcript::getLincRnaType(); - } elsif($t eq 'snoRNA'){ - return Sanger::CGP::Vagrent::Data::Transcript::getSnoRnaType(); - } elsif($t eq 'snRNA'){ - return Sanger::CGP::Vagrent::Data::Transcript::getSnRnaType(); - } elsif($t eq 'rRNA'){ - return Sanger::CGP::Vagrent::Data::Transcript::getRRnaType(); - } else { - croak("Unhandled type: $t"); - } -} - -sub unquoteValue { - my $val = shift; - $val =~ s/[\";]//g if defined $val; - return $val; -} - -sub parseCCDSFile { - my $file = shift; - my $out = undef; - open my $fh, "<$file" || croak ("unable to open $file for reading"); - while(<$fh>){ - my @data = split /\s+/, $_; - next unless $data[4] =~ m/^ENST/; - next unless $data[2] == 1; - $out->{$data[4]} = $data[0]; - } - close $fh; - return $out; -} - -sub parseFaiFile { - my $fai = shift; - open my $fh, "<$fai" || croak("unable to open fai file: $fai"); - my $out; - while (<$fh>) { - my ($enst,$length) = split /\s+/; - $out->{$enst} = $length; - } - return $out; -} - -sub getInputFileType { - my $infile = shift; - my $fileType = File::Type->new->checktype_filename($infile); - foreach my $t (@TEXT_TYPES){ - return 'text' if($t eq $fileType); - } - foreach my $t (@ZIP_TYPES) { - return 'zip' if($t eq $fileType); - } - return $fileType; -} - -sub option_builder { - my ($factory) = @_; - my %opts = (); - my $result = &GetOptions ( - 'h|help' => \$opts{'h'}, - 'f|fasta=s' => \$opts{'f'}, - 'g|gtf=s' => \$opts{'g'}, - 'o|output=s' => \$opts{'o'}, - 'sp|species=s' => \$opts{'sp'}, - 'as|assembly=s' => \$opts{'as'}, - 'd|database=s' => \$opts{'d'}, - 'c|ccds=s' => \$opts{'c'}, - ); - - pod2usage() if($opts{'h'}); - pod2usage('Must specify the input fasta reference file') unless defined $opts{'f'} && -e $opts{'f'}; - my $fai = $opts{'f'} .'.fai'; - pod2usage("fasta index reference file not found: $fai") unless -e $fai; - $opts{'fai'} = $fai; - pod2usage('Must specify the input gtf reference file') unless defined $opts{'g'} && -e $opts{'g'}; - pod2usage('Must specify the output file to use') unless defined $opts{'o'}; - pod2usage('Must specify the species') unless defined $opts{'sp'}; - pod2usage('Must specify the genome version') unless defined $opts{'as'}; - pod2usage('Must specify the ensembl core database version') unless defined $opts{'d'}; - if(defined($opts{'c'})){ - pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'}; - } - return \%opts; -} - -__END__ - -=head1 NAME - -Admin_EnsemblGtf2CacheConverter.pl - Generates the Vagrent cache file from the ensembl GTF file - -=head1 SYNOPSIS - -Admin_EnsemblGtf2CacheConverter.pl [-h] [-f /path/to/ensembl.fa] [-g /path/to/ensembl.gtf] [-o /path/to/output.file] [-sp human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-c /path/to/CCDS2Sequence.version.txt] - - General Options: - - --help (-h) Brief documentation - - --fasta (-f) Fasta file containing the transcripts to be imported, fai file must also be present. - - --gtf (-g) Ensembl GTF file for converting - - --output (-o) Output file - - --species (-sp) Species (ie human, mouse) - - --assembly (-as) Assembly version (ie GRCh37, GRCm38) - - --database (-d) Ensembl core database version number (ie homo_sapiens_core_74_37p) - - --ccds (-c) (Optional, but strongly advised) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS -=cut diff --git a/bin/Admin_EnsemblReferenceFileGenerator.pl b/bin/Admin_EnsemblReferenceFileGenerator.pl old mode 100755 new mode 100644 index 074cba0..aa6a4c0 --- a/bin/Admin_EnsemblReferenceFileGenerator.pl +++ b/bin/Admin_EnsemblReferenceFileGenerator.pl @@ -1,9 +1,9 @@ #!/usr/bin/perl ##########LICENCE########## -# Copyright (c) 2014 Genome Research Ltd. +# Copyright (c) 2018 Genome Research Ltd. # -# Author: Cancer Genome Project cgpit@sanger.ac.uk +# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk # # This file is part of VAGrENT. # @@ -21,7 +21,6 @@ # along with this program. If not, see . ##########LICENCE########## - use strict; use English qw(-no_match_vars); use warnings FATAL => 'all'; @@ -34,6 +33,7 @@ use Getopt::Long; use Pod::Usage; use Try::Tiny; +use Capture::Tiny qw(capture); use LWP::Simple; use IPC::System::Simple qw(run); use Net::FTP; @@ -46,8 +46,8 @@ use Data::Dumper; const my @ENSMBL_REF_FILE_EXTENTIONS => qw(cdna.all.fa.gz ncrna.fa.gz); -const my $FASTA_FILTER_SCRIPT => 'Admin_EnsemblTranscriptFastaFilter.pl'; -const my $GTF_CONVERSION_SCRIPT => 'Admin_EnsemblGtf2CacheConverter.pl'; +const my $TRANSCRIPT_FILTER_SCRIPT => 'Admin_EnsemblTranscriptFilter.pl'; +const my $BUILDER_SCRIPT => 'Admin_CacheFileBuilder.pl'; const my $FILTERED_FASTA_SUFFIX => 'vagrent.fa'; const my $CACHE_SUFFIX_GZ => 'vagrent.cache.gz'; const my $CACHE_SUFFIX_RAW => 'vagrent.cache.raw'; @@ -55,153 +55,162 @@ const my $ENSEMBL_SPECIES_ASSEMBLY => qr/([^\.]+?)\.(.+?)\./; const my $ENSEMBL_VERSION_PATTERN => qr/^ftp\:\/\/ftp\.ensembl(?:genomes)?\.org\/pub\/release\-(\d+?)\//; +my $tmpDir = tempdir("VagrentEnsemblRefFileGenXXXXX", TMPDIR => 1, CLEANUP => 1); + try { my $opts = option_builder(); - my $urls = getFileUrlsForRetrival($opts); - - print "Downloading Files -------- "; - my ($codFasta, $ncFasta, $gtf) = downloadFiles($urls); + my ($codFasta, $ncFasta, $features,$transList); + + print "Downloading Files -------- "; + if(defined $opts->{'f'}){ + my $urls = getFileUrlsForRetrival($opts); + ($codFasta, $ncFasta, $features) = downloadFiles($tmpDir, $urls); + print "Done\n"; + } else { + $codFasta = $opts->{'cdna_fa'}; + $ncFasta = $opts->{'ncrna_fa'}; + $features = $opts->{'features'}; + print "Skipped, files locally supplied\n"; + } + + print "Obtaining Filtered Transcript List ----- "; + unless(defined $opts->{'trans_list'}){ + $transList = generateTranscriptListFile($tmpDir,$features, $codFasta, $ncFasta); + print "Done\n"; + } else { + $transList = $opts->{'trans_list'}; + print "Skipped, files locally supplied\n"; + } + + print "Building Cache Files ----- "; + buildCacheFiles($tmpDir, $opts, $features, $transList, $codFasta, $ncFasta); print "Done\n"; - # make fasta file of selected cDNA sequences - print "Building Fasta Files ----- "; - my $fasta = generateFilteredFasta($opts, $codFasta); - if(defined $opts->{'n'}){ - generateFilteredFasta($opts, $ncFasta, $fasta, $opts->{'n'}); - } else { - generateFilteredFasta($opts, $ncFasta, $fasta); - } - print "Done\n"; - # Create fai index file - print "Building Fasta Index ----- "; - my $fai = createFaiIndex($fasta); - print "Done\n"; - # Create transcript object cache file - print "Building Vagrent Cache --- "; - my $cache = generateCacheFile($opts,$fasta,$gtf,$codFasta); - print "Done\n"; + } catch { croak "An error occurred while building reference support files\:\n\t$_"; # not $@ }; -sub generateCacheFile{ - my ($opts,$fasta,$gtf,$codFasta) = @_; - my $rawCache = createFilePathFromFasta($opts,$codFasta,$CACHE_SUFFIX_RAW); - my $cache = createFilePathFromFasta($opts,$codFasta,$CACHE_SUFFIX_GZ); - my $cmd = $^X.' '.getCacheFileScript(); - $cmd .= " -sp ".$opts->{'sp'}; - $cmd .= " -as ".$opts->{'as'}; - $cmd .= " -d ".$opts->{'d'}; - $cmd .= " -c ".$opts->{'c'} if defined $opts->{'c'}; - $cmd .= " -f $fasta"; - $cmd .= " -g $gtf"; - $cmd .= " -o $rawCache"; - - system($cmd) == 0 || croak "unable to create cache file: $rawCache"; - system("sort -k 1,1 -k 2n,2 -k 3n,3 $rawCache | bgzip > $cache") == 0 || croak "unable to sort and zip cache file: $cache"; - unlink $rawCache; - system("tabix -p bed $cache") == 0 || croak "unable to tabix index cache file: $cache"; - return $cache; -} - -sub createFaiIndex { - my $fa = shift; - my $cmd = "samtools faidx $fa"; - system($cmd) == 0 || croak "unable to index $fa: $?"; - return $fa.'.fai'; +sub buildCacheFiles { + my ($tmpDir, $opts, $features, $transList, @seq_files) = @_; + my $cmd = $^X.' '.getBuilderScript(); + foreach my $s(@seq_files){ + $cmd .= " -f $s"; + } + $cmd .= ' -o '.$opts->{'o'}; + $cmd .= " -gf $features"; + $cmd .= " -t $transList"; + $cmd .= ' -sp '.$opts->{'sp'}; + $cmd .= ' -as '.$opts->{'as'}; + $cmd .= ' -d '.$opts->{'d'}; + $cmd .= ' -fai '.$opts->{'fai'} if defined $opts->{'fai'}; + my ($stdout,$stderr,$exit) = capture {system($cmd)}; + croak('Unable to generate cache files list:- '.$stderr) if $exit; + return; } -sub generateFilteredFasta { - my ($opts, $inFa, $outFa, $statusLookupFile) = @_; - my $app = 0; - if(defined $outFa){ - $app = 1; - } else { - $outFa = createFilePathFromFasta($opts,$inFa,$FILTERED_FASTA_SUFFIX); - } - my $cmd = $^X.' '.getFastaFilterScript(); - $cmd .= " -f $inFa"; - $cmd .= " -o $outFa"; - $cmd .= ' -b '.join(' -b ',@TRANSCRIPT_BIOTYPES); - $cmd .= ' -a' if $app; - $cmd .= " -s $statusLookupFile" if defined $statusLookupFile; - system($cmd) == 0 || croak "unable to filter $inFa: $?"; - return $outFa; +sub getBuilderScript { + my $progPath = abs_path($0); + my ($vol,$dirs,$file) = File::Spec->splitpath($progPath); + return "$dirs".$BUILDER_SCRIPT; } -sub createFilePathFromFasta { - my ($opts, $inFa, $suffix) = @_; - my ($vol,$dirs,$file) = File::Spec->splitpath($inFa); - my $outFa; - if($file =~ m/$ENSEMBL_SPECIES_ASSEMBLY/){ - $outFa = File::Spec->catfile($opts->{'o'},"$1.$2.".$opts->{'e_version'}.".$suffix"); - } else { - croak("unable to match species and assembly from file name: $inFa"); +sub generateTranscriptListFile { + my ($tmpDir,$features, @seq_files) = @_; + my $cmd = $^X.' '.getFilterScript(); + my $transListFile = makeTmpFilePath($tmpDir,'.translist'); + foreach my $s(@seq_files){ + $cmd .= " -f $s"; } - return $outFa; + $cmd .= " -o $transListFile"; + $cmd .= ' -b '.join(' -b ',@TRANSCRIPT_BIOTYPES); + my ($stdout,$stderr,$exit) = capture {system($cmd)}; + croak('Unable to generate transcript list:- '.$stderr) if $exit; + return $transListFile; } -sub downloadFiles { - my $urls = shift; - my @out; - my $tmpDir = tempdir("VagrentEnsemblRefFileGenXXXXX", TMPDIR => 1, CLEANUP => 1); - foreach my $url(@$urls){ - my $file = $tmpDir.'/'.(split /\//, $url)[-1]; - push @out, $file; - my $rc = getstore($url, $file); - croak "An error occured when retrieving $url\n" if(is_error($rc)); - } - return @out; +sub getFilterScript { + my $progPath = abs_path($0); + my ($vol,$dirs,$file) = File::Spec->splitpath($progPath); + return "$dirs".$TRANSCRIPT_FILTER_SCRIPT; } sub getFileUrlsForRetrival { my $opts = shift; my @out; - # ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/cdna - # ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/ncrna - # ftp://ftp.ensembl.org/pub/release-74/gtf/homo_sapiens + # ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cdna + # ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/ncrna + # ftp://ftp.ensembl.org/pub/release-90/gff3/homo_sapiens + + # ftp://ftp.ensembl.org/pub/release-70/fasta/homo_sapiens/cdna + # ftp://ftp.ensembl.org/pub/release-70/fasta/homo_sapiens/ncrna + # ftp://ftp.ensembl.org/pub/release-70/gtf/homo_sapiens # ftp://ftp.ensembl.org/pub/release-74/fasta/danio_rerio/cdna # ftp://ftp.ensembl.org/pub/release-74/fasta/danio_rerio/ncrna # ftp://ftp.ensembl.org/pub/release-74/gtf/danio_rerio + $opts->{'f'} =~ s|/$||; # clean training '/' off url my $ncFaDir = $opts->{'f'}; $ncFaDir =~ s|/cdna$|/ncrna|; + my $gffDir = $opts->{'f'}; + $gffDir =~ s|/cdna$||; + $gffDir =~ s|/fasta/|/gff3/|; my $gtfDir = $opts->{'f'}; $gtfDir =~ s|/cdna$||; $gtfDir =~ s|/fasta/|/gtf/|; - foreach my $dir($opts->{'f'}, $ncFaDir, $gtfDir) { + my $ftp = undef; + foreach my $dir($opts->{'f'}, $ncFaDir, $gffDir, $gtfDir) { my @comps = split /\/+/, $dir; my $root = shift @comps; # remove ftp: my $host = shift @comps; my $path = join '/', @comps; - my $ftp = Net::FTP->new($host, Debug => 0) or croak "Failed to connect (ftp) to $host\n\t$EVAL_ERROR"; - $ftp->login or croak "Failed to login (ftp) to $host\n\t", $ftp->message; - my @files = $ftp->ls($path) or croak "Failed to list directory $path on $host (ftp)\n\t", $ftp->message; - $ftp->quit; + my $have_feature_file = 0; + unless(defined $ftp){ + $ftp = Net::FTP->new($host, Debug => 0) or croak "Failed to connect (ftp) to $host\n\t$EVAL_ERROR"; + $ftp->login or croak "Failed to login (ftp) to $host\n\t", $ftp->message; + } + my @files; + unless(@files = $ftp->ls($path)){ + unless($dir eq $gffDir){ # older ensembl releases don't have a gff directory, can be missing. + croak "Failed to list directory $path on $host (ftp)\n\t", $ftp->message; + } + } foreach my $file (@files){ foreach my $ext (@ENSMBL_REF_FILE_EXTENTIONS){ if($file =~ m/$ext$/){ push @out, $dir. '/' . (split '/', $file)[-1]; } } - push @out, $dir. '/' . (split '/', $file)[-1] if($file =~ m/[[:digit:]]\.gtf\.gz$/); + if ($dir eq $gffDir && $file =~ m/\.[[:digit:]]+?\.gff3\.gz$/ && $file !~ m/\.[[:digit:]]+?\.(?:chromosome)\..+?\.gff3\.gz$/){ + push @out, $dir. '/' . (split '/', $file)[-1]; + $have_feature_file = 1; + } elsif ($dir eq $gtfDir && $file =~ m/[[:digit:]]\.gtf\.gz$/){ + push @out, $dir. '/' . (split '/', $file)[-1] + } } + last if $have_feature_file; } + $ftp->quit; return \@out; } -sub getFastaFilterScript { - my $progPath = abs_path($0); - my ($vol,$dirs,$file) = File::Spec->splitpath($progPath); - return "$dirs".$FASTA_FILTER_SCRIPT; +sub makeTmpFilePath { + my ($tmpDir,$ext) = @_; + my $file; + (undef, $file) = tempfile('VagrentEnsemblRefFileGenXXXXX', DIR => $tmpDir, OPEN => 0, SUFFIX => $ext); + return $file; } -sub getCacheFileScript { - my $progPath = abs_path($0); - my ($vol,$dirs,$file) = File::Spec->splitpath($progPath); - my $mods = $dirs; - $mods =~ s|bin/$|lib|; - return "-I $mods $dirs".$GTF_CONVERSION_SCRIPT; +sub downloadFiles { + my ($tmpDir, $urls) = @_; + my @out; + foreach my $url(@$urls){ + my $file = $tmpDir.'/'.(split /\//, $url)[-1]; + push @out, $file; + my $rc = getstore($url, $file); + croak "An error occured when retrieving $url\n" if(is_error($rc)); + } + return @out; } sub option_builder { @@ -210,19 +219,39 @@ sub option_builder { my $result = &GetOptions ( 'h|help' => \$opts{'h'}, 'f|ftp=s' => \$opts{'f'}, + 'gf|features=s' => \$opts{'features'}, + 'cf|cdna_fa=s' => \$opts{'cdna_fa'}, + 'nf|ncrna_fa=s' => \$opts{'ncrna_fa'}, 'o|output=s' => \$opts{'o'}, - 'n|ncstatus=s' => \$opts{'n'}, + 'tl|trans_list=s' => \$opts{'trans_list'}, 'sp|species=s' => \$opts{'sp'}, 'as|assembly=s' => \$opts{'as'}, 'd|database=s' => \$opts{'d'}, 'c|ccds=s' => \$opts{'c'}, + 'fai|fai=s' => \$opts{'fai'}, ); pod2usage() if($opts{'h'}); pod2usage('Must specify the output directory to use') unless(defined $opts{'o'}); pod2usage("Output directory must exist and be writable: $opts{o}") unless(-e $opts{'o'} && -d $opts{'o'} && -w $opts{'o'}); - pod2usage('Must specify the cDNA path for the ensembl reference data') unless defined $opts{'f'} && $opts{'f'} =~ m|cdna/?$|; - if(defined($opts{'n'})){ - pod2usage('Unable to read the non-coding transcript status file') unless -e $opts{'n'} && -r $opts{'n'}; + + if(defined $opts{'f'}){ + if(defined $opts{'features'} || defined $opts{'cdna_fa'} || defined $opts{'ncrna_fa'}){ + pod2usage('Please only define the remote URL or the local files, not both'); + } else { + pod2usage('Must specify the cDNA URL for the ensembl reference data') unless($opts{'f'} =~ m|cdna/?$|); + } + } elsif (defined $opts{'cdna_fa'} && defined $opts{'ncrna_fa'} && defined $opts{'features'}){ + if(defined $opts{'f'}){ + pod2usage('Please only define the remote URL or the local files, not both'); + } else { + pod2usage('Please specify a valid genomic feature file (-gff)') unless -e $opts{'features'} && -r $opts{'features'}; + pod2usage('Please specify a valid coding cDNA sequence file (-cf)') unless -e $opts{'cdna_fa'} && -r $opts{'cdna_fa'}; + pod2usage('Please specify a valid non-coding cDNA sequence file (-nf)') unless -e $opts{'ncrna_fa'} && -r $opts{'ncrna_fa'}; + } + + } + if(defined($opts{'fai'})){ + pod2usage('Unable to read the fai file: '.$opts{'fai'}) unless -e $opts{'fai'} && -r $opts{'fai'}; } pod2usage('Must specify the species') unless defined $opts{'sp'}; pod2usage('Must specify the genome version') unless defined $opts{'as'}; @@ -230,13 +259,9 @@ sub option_builder { if(defined($opts{'c'})){ pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'}; } - $opts{'f'} =~ s|/$||; - if($opts{'f'} =~ m/$ENSEMBL_VERSION_PATTERN/){ - $opts{'e_version'} = $1; - } else { - pod2usage('Unable to parse Ensembl version from URL'); + if(defined($opts{'trans_list'})){ + pod2usage('Transcript file unreadable') unless -e $opts{'trans_list'} && -r $opts{'trans_list'}; } - return \%opts; } @@ -248,17 +273,11 @@ =head1 NAME =head1 SYNOPSIS -Admin_EnsemblReferenceFileGenerator.pl [-h] [-s human] [-v GRCh37] [-d homo_sapiens_core_74_37p] [-f ] [-c /path/to/CCDS2Sequence.version.txt] [-o /path/to/output/directory] - - General Options: - - --help (-h) Brief documentation - - --ftp (-f) Ensembl ftp directory containing the cDNA fasta sequence files +Admin_EnsemblReferenceFileGenerator.pl [-h] [-sp Human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-f ] [-o /path/to/output/directory] - --output (-o) Output directory + Required Options: - --ncstatus (-n) Optional, path to a lookup file for the status of non-coding transcripts + --output (-o) Output directory --species (-sp) Species (ie human, mouse) @@ -266,6 +285,27 @@ =head1 SYNOPSIS --database (-d) Ensembl core database version number (ie homo_sapiens_core_74_37p) - --ccds (-c) (Optional, but strongly advised) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS + Dynamic Download: + + --ftp (-f) Ensembl ftp directory containing the cDNA fasta sequence files + + Or Local Files: + + --features (-gf) gff3 or gtf file containing transcript and gene information + + --cdna_fa (-cf) Fasta file containing protein coding cdna sequences + + --ncrna_fa (-nf) Fasta file containing non-coding cdna sequences + Optional: + + --help (-h) Brief documentation + + --ccds (-c) (Recommended) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS + + --fai (-fai) (Recommended) The samtools fasts index file (.fai) for your reference genome + This is the reference genome that your bam and vcf files will be mapped to + + --trans_list (-tl) List of preprepared transcript accessions, only these accesions will be included in the reference output + =cut diff --git a/bin/Admin_EnsemblTranscriptFastaFilter.pl b/bin/Admin_EnsemblTranscriptFastaFilter.pl deleted file mode 100755 index a193db9..0000000 --- a/bin/Admin_EnsemblTranscriptFastaFilter.pl +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/perl - -##########LICENCE########## -# Copyright (c) 2014 Genome Research Ltd. -# -# Author: Cancer Genome Project cgpit@sanger.ac.uk -# -# This file is part of VAGrENT. -# -# VAGrENT is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -##########LICENCE########## - - -use strict; -use English qw(-no_match_vars); -use warnings FATAL => 'all'; -use Carp; - -use Getopt::Long; -use Pod::Usage; -use Try::Tiny; - -use FindBin qw($Bin); -use lib "$Bin/../lib"; - -use File::Type; -use Const::Fast qw(const); - -use Data::Dumper; -use Bio::SeqIO; - -const my @TEXT_TYPES => qw(application/octet-stream); - -const my @ZIP_TYPES => qw(application/x-gzip); - -try { - my $opts = option_builder(); - my $trans = getInterestingTranscriptsFromFile($opts); - generateFilteredFasta($opts,$trans); -} catch { - warn "An error occurred while building reference support files\:\n\t$_"; # not $@ -}; - -sub generateFilteredFasta { - my ($opts,$trans) = @_; - my $type = getInputFileType($opts->{'f'}); - my ($inFh,$outFh); - - if($type eq 'text') { - open $inFh, '<'.$opts->{'f'} || croak("unable to open sequence input file:".$opts->{'f'}); - } elsif($type eq 'zip'){ - open $inFh, 'zcat '.$opts->{'f'}.' |' || croak("unable to open sequence input file:".$opts->{'f'}); - } - - if(defined $opts->{'a'}){ - open $outFh, '>>'.$opts->{'o'} || croak("unable to open sequence ouput file:".$opts->{'f'}); - } else { - open $outFh, '>'.$opts->{'o'} || croak("unable to open sequence ouput file:".$opts->{'f'}); - } - - my $keep = 0; - - while (<$inFh>){ - if(m/^>/){ - my ($key) = split /\s/; - $key =~ s/^>//; - if(exists $trans->{$key}){ - $keep = 1; - print $outFh ">$key\n"; - } else { - $keep = 0; - } - } elsif($keep){ - my $line = $_; - $line =~ s/\s//g; - print $outFh "$line\n"; - } - } - close $outFh; - close $inFh; -} - -sub getInterestingTranscriptsFromFile { - my $opts = shift; - my $type = getInputFileType($opts->{'f'}); - my $out; - my $status = undef; - my $cmd; - if($type eq 'text'){ - $cmd = "grep"; - } elsif($type eq 'zip'){ - $cmd = "zgrep"; - } else { - croak "file is of an unrecognised format ($type): ".$opts->{'f'}; - } - - $cmd .= " '>' ". $opts->{'f'}; - $cmd .= ' | grep '; - foreach my $b(@{$opts->{'b'}}){ - $cmd .= " -e transcript_biotype:$b"; - } - - if(defined $opts->{'s'}){ - open my $statfh, '<'.$opts->{'s'} || croak("unable to open status lookup file:".$opts->{'s'}); - while(<$statfh>){ - my @st = split /\s+/; - if(uc($st[1]) eq 'KNOWN'){ - $status->{$st[0]}++; - } - } - } else { - $cmd .= ' | grep known '; - } - - open my $fh, "$cmd |" || croak("unable to grep through fasta file with command: $cmd"); - if(defined $opts->{'s'}) { - while (<$fh>){ - my ($key) = split /\s/; - $key =~ s/^>//; - $out->{$key}++ if(exists $status->{$key}); - } - } else { - while (<$fh>){ - my ($key) = split /\s/; - $key =~ s/^>//; - $out->{$key}++; - } - } - close $fh; - return $out; -} - -sub getInputFileType { - my $infile = shift; - my $fileType = File::Type->new->checktype_filename($infile); - foreach my $t (@TEXT_TYPES){ - return 'text' if($t eq $fileType); - } - foreach my $t (@ZIP_TYPES) { - return 'zip' if($t eq $fileType); - } - return $fileType; -} - -sub option_builder { - my ($factory) = @_; - - my %opts = (); - - my $result = &GetOptions ( - 'h|help' => \$opts{'h'}, - 'f|fasta=s' => \$opts{'f'}, - 'o|outfile=s' => \$opts{'o'}, - 'a|append' => \$opts{'a'}, - 'b|biotypes=s@' => \$opts{'b'}, - 's|status=s' => \$opts{'s'}, - ); - - pod2usage() if($opts{'h'}); - pod2usage('Must specify the input fasta ensembl reference file') unless defined $opts{'f'} && -e $opts{'f'}; - pod2usage('Must specify the output file to use') unless defined $opts{'o'}; - pod2usage('Must specify at least one biotype') unless scalar @{$opts{'b'}} > 0 && defined($opts{'b'}->[0]); - if(defined($opts{'s'})){ - pod2usage('Unable to read the transcript status file') unless -e $opts{'s'} && -r $opts{'s'}; - } - return \%opts; -} - -__END__ - -=head1 NAME - -Admin_EnsemblTranscriptFastaFilter.pl - Generates the Vagrent reference fasta file from an Ensembl one - -=head1 SYNOPSIS - -Admin_EnsemblTranscriptFastaFilter.pl [-h] [-f /path/to/ensembl.fa] [-o /path/to/output.fa] [-b biotype] [-s /path/to/status/lookup.list][-a] - - General Options: - - --help (-h) Brief documentation - - --fasta (-f) Ensembl fasta file - - --output (-o) Output file - - --biotypes (-b) Ensembl transcript biotypes to filter for, supports multiple instances - - --status (-s) Optional, Transcript Status look up file, simple transcript id - status list, space separated, one entry per line - - --append (-a) Append to existing output file - -=cut diff --git a/bin/Admin_EnsemblTranscriptFilter.pl b/bin/Admin_EnsemblTranscriptFilter.pl new file mode 100644 index 0000000..da15c27 --- /dev/null +++ b/bin/Admin_EnsemblTranscriptFilter.pl @@ -0,0 +1,166 @@ +#!/usr/bin/perl + +##########LICENCE########## +# Copyright (c) 2018 Genome Research Ltd. +# +# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk +# +# This file is part of VAGrENT. +# +# VAGrENT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +##########LICENCE########## + + +use strict; +use English qw(-no_match_vars); +use warnings FATAL => 'all'; +use Carp; + +use Getopt::Long; +use Pod::Usage; +use Try::Tiny; +use Capture::Tiny qw(capture); +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +use File::Type; +use Const::Fast qw(const); + +use Data::Dumper; + +const my $FASTA_HEADER_GREP => 'zgrep "^>" %s'; + +try { + my $opts = option_builder(); + my $transcripts = getTranscripts($opts); + writeOutput($opts,$transcripts); +} catch { + warn "An error occurred while building reference support files\:\n\t$_"; # not $@ +}; + +sub writeOutput { + my ($opts,$trans) = @_; + my $fh; + if(defined $opts->{'o'}){ + open($fh,'>'.$opts->{'o'}) or die("Unable to open output file for writing: ".$opts->{'o'}); + } else { + $fh = *STDOUT; + } + foreach my $t(@$trans){ + print $fh $t,"\n"; + } + if(defined $opts->{'o'}){ + close $fh; + } +} + +sub getTranscripts { + my $opts = shift; + my $good_transcripts; + foreach my $seq_file (@{$opts->{'f'}}){ + my $cmd = sprintf $FASTA_HEADER_GREP, $seq_file; + my ($stdout, $stderr, $exit) = capture { + system($cmd); + }; + if($exit > 0){ + croak($stderr); + } + foreach my $rec (split /\n/,$stdout){ + my ($transcript,$biotype) = parseLine($rec); + next unless hasGoodBiotype($opts,$biotype); + # Have to store the transcript accessions twice, with and without the version. + # Depending on the Ensembl version or species, the accession in the fasta file may + # or may not have the transcript version on the end. + # Some species gff/gtf files include the the version in the accession, some don't + # Ensembl versions with imported gene build will preserve the originators naming conventions + push(@$good_transcripts,$transcript); + $transcript =~ s/\.\d+?$//; + push(@$good_transcripts,$transcript); + } + } + return $good_transcripts; +} + +sub hasGoodBiotype { + my ($opts,$biotype) = @_; + foreach my $type (@{$opts->{'b'}}){ + return 1 if lc($biotype) eq lc($type); + } + return 0; +} + +sub parseLine { + my $line = shift; + my @cols = split /\s/, $line; + my $full = join('|',@cols); + my $transcript = shift @cols; + $transcript =~ s/^>//; + my $biotype = undef; + foreach my $c (@cols){ + if($c =~ m/^transcript_biotype/){ + (undef,$biotype) = split(':',$c); + } + } + return ($transcript,$biotype); +} + +sub option_builder { + my ($factory) = @_; + + my %opts = (); + + my $result = &GetOptions ( + 'h|help' => \$opts{'h'}, + 'f|fasta=s@' => \$opts{'f'}, + 'o|outfile=s' => \$opts{'o'}, + 'b|biotypes=s@' => \$opts{'b'}, + ); + pod2usage() if($opts{'h'}); + pod2usage('Output file must not already exist') if defined $opts{'o'} && -e $opts{'o'}; + if(scalar(@{$opts{'f'}}) > 0){ + foreach my $f(@{$opts{'f'}}){ + pod2usage("Unable to read the transcript sequence file: $f") unless -e $f && -r $f; + } + } else { + pod2usage('Must specify one or more fasta sequence file as input'); + } + pod2usage('Must specify at least one biotype') unless scalar @{$opts{'b'}} > 0 && defined($opts{'b'}->[0]); + return \%opts; +} + +__END__ + +=head1 NAME + +Admin_EnsemblTranscriptFilter.pl - Generates filtered list of Transcript accessions + +=head1 SYNOPSIS + +Admin_EnsemblTranscriptFilter.pl [-h] [-f /path/to/ensembl.fa] [-o /path/to/output.list] [-b biotype] + + Required Options: + + --fasta (-f) Ensembl fasta file from Ensembl's FTP site, can be specified multiple times. + + --biotypes (-b) Ensembl transcript biotypes to filter for, can be specified multiple times. + + General Options: + + --help (-h) Brief documentation. + + --output (-o) Output file, optional will default to stdout. + +=cut + + diff --git a/docs.tar.gz b/docs.tar.gz index b271ed6..070dec1 100644 Binary files a/docs.tar.gz and b/docs.tar.gz differ diff --git a/lib/Sanger/CGP/Vagrent.pm b/lib/Sanger/CGP/Vagrent.pm index 2bf1a5f..8d2bb22 100644 --- a/lib/Sanger/CGP/Vagrent.pm +++ b/lib/Sanger/CGP/Vagrent.pm @@ -1,9 +1,9 @@ package Sanger::CGP::Vagrent; ##########LICENCE########## -# Copyright (c) 2014-2017 Genome Research Ltd. +# Copyright (c) 2014-2018 Genome Research Ltd. # -# Author: Cancer Genome Project cgpit@sanger.ac.uk +# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk # # This file is part of VAGrENT. # @@ -26,7 +26,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '3.2.3'; +our $VERSION = '3.3.0'; our @EXPORT = qw($VERSION); 1;