diff --git a/CHANGES.md b/CHANGES.md
index a3af7bf..ea4a833 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,6 @@
+### 3.3.0
+* Complete re-write of the reference generation code
+
 ### 3.2.2
 * Add bedtools2 to `setup.sh`
 * Added bedtools2 to `README.md`
diff --git a/MANIFEST b/MANIFEST
index 954da43..c7d9ac4 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,7 +1,7 @@
 .travis.yml
-bin/Admin_EnsemblGtf2CacheConverter.pl
+bin/Admin_CacheFileBuilder.pl
 bin/Admin_EnsemblReferenceFileGenerator.pl
-bin/Admin_EnsemblTranscriptFastaFilter.pl
+bin/Admin_EnsemblTranscriptFilter.pl
 bin/Admin_GeneRegionBedDumper.pl
 bin/AnnotateVcf.pl
 bin/cpanm
diff --git a/Makefile.PL b/Makefile.PL
index a6042f9..bf98a2f 100755
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -31,9 +31,9 @@ WriteMakefile(
   NAME          => 'Vagrent',
   LICENSE       => 'agpl_3', # http://search.cpan.org/~dagolden/CPAN-Meta-2.142690/lib/CPAN/Meta/Spec.pm#license
   VERSION_FROM  => 'lib/Sanger/CGP/Vagrent.pm',
-  EXE_FILES     => [qw( bin/Admin_EnsemblGtf2CacheConverter.pl
+  EXE_FILES     => [qw( bin/Admin_CacheFileBuilder.pl
                         bin/Admin_EnsemblReferenceFileGenerator.pl
-                        bin/Admin_EnsemblTranscriptFastaFilter.pl
+                        bin/Admin_EnsemblTranscriptFilter.pl
                         bin/Admin_GeneRegionBedDumper.pl
                         bin/AnnotateVcf.pl)],
   PREREQ_PM     => {
diff --git a/README.md b/README.md
index 639ebc6..4d650f8 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,9 @@ Please be aware that this expects basic C compilation libraries and tools to be
 LICENCE
 =======
 
-Copyright (c) 2014-2017 Genome Research Ltd.
+Copyright (c) 2014-2018 Genome Research Ltd.
 
-Author: Cancer Genome Project <cgpit@sanger.ac.uk>
+Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
 
 This file is part of VAGrENT.
 
diff --git a/bin/Admin_CacheFileBuilder.pl b/bin/Admin_CacheFileBuilder.pl
new file mode 100644
index 0000000..ddbef18
--- /dev/null
+++ b/bin/Admin_CacheFileBuilder.pl
@@ -0,0 +1,852 @@
+#!/usr/bin/perl
+
+##########LICENCE##########
+# Copyright (c) 2018 Genome Research Ltd.
+#
+# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk
+#
+# This file is part of VAGrENT.
+#
+# VAGrENT is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+##########LICENCE##########
+
+use strict;
+use English qw(-no_match_vars);
+use warnings FATAL => 'all';
+use Carp;
+
+use FindBin qw($Bin);
+use lib "$Bin/../lib";
+
+use Getopt::Long;
+use Pod::Usage;
+use Try::Tiny;
+use Capture::Tiny qw(capture capture_stderr);
+use List::Util qw(first);
+use Scalar::Util qw(looks_like_number);
+
+use Data::Dumper;
+use File::Type;
+use File::Temp qw(tempdir tempfile);
+use File::Spec;
+use File::Copy qw(copy);
+use Const::Fast qw(const);
+
+use Bio::DB::HTS::Faidx;
+use Bio::SeqIO;
+use Bio::Seq;
+use Bio::Tools::GFF;
+
+use Sanger::CGP::Vagrent::Data::Transcript;
+use Sanger::CGP::Vagrent::Data::Exon;
+
+$Data::Dumper::Indent = 0;
+
+const my @TEXT_TYPES => qw(text/plain application/octet-stream);
+
+const my @ZIP_TYPES => qw(application/x-gzip);
+
+const my $TAG_GENE_ID => 'gene_id';
+const my $TAG_TRANSCRIPT_ID => 'transcript_id';
+const my $TAG_PROTEIN_ID => 'protein_id';
+const my $TAG_VERSION => 'version';
+const my $TAG_PARENT => 'Parent';
+const my $TAG_EXON_ORDER_GFF => 'rank';
+const my $TAG_EXON_ORDER_GTF => 'exon_number';
+const my @TAG_EXON_ORDER => ($TAG_EXON_ORDER_GTF,$TAG_EXON_ORDER_GFF);
+const my $TAG_GENE_BIOTYPE => 'gene_biotype';
+const my $TAG_BIOTYPE => 'biotype';
+const my @TAG_BIOTYPE => ($TAG_BIOTYPE,$TAG_GENE_BIOTYPE);
+
+const my $TAG_GENE_NAME_GFF => 'Name';
+const my $TAG_GENE_NAME_GTF => 'gene_name';
+const my @TAG_GENE_NAME => ($TAG_GENE_NAME_GFF,$TAG_GENE_NAME_GTF);
+
+const my @TAG_CCDS => qw(ccds_id ccdsid);
+
+const my $PRIMARY_UTR5P => 'five_prime_UTR';
+const my $PRIMARY_CDS => 'CDS';
+const my $PRIMARY_UTR3P => 'three_prime_UTR';
+const my $PRIMARY_STOP => 'stop_codon';
+
+const my @MITO_ALIASES => qw(M MT Mito MtDNA);
+
+const my @PRIMARY_EXON => qw(exon);
+const my @PRIMARY_REGIONS => ($PRIMARY_UTR5P,$PRIMARY_CDS,$PRIMARY_STOP,$PRIMARY_UTR3P);
+
+const my $PROTEIN_CODING => 'protein_coding';
+const my $EXON_TYPE => 'exon';
+const my $CDS_START_TYPE => 'start_codon';
+const my $CDS_STOP_TYPE => 'stop_codon';
+const my $CDS_TYPE => 'CDS';
+
+const my $UNZIP => 'gunzip -c %s > %s';
+const my $BGZIP => 'bgzip -c %s > %s';
+const my $SORT_N_BGZIP => 'sort -k 1,1 -k 2n,2 -k 3n,3 %s | bgzip > %s';
+const my $TABIX => 'tabix -p bed %s';
+
+const my $SEQ_LOCATION => '%s:%s-%s';
+
+const my $OUT_CACHE_FILE => 'vagrent.%s.%s.%s.cache.gz';
+const my $OUT_SEQ_FILE => 'vagrent.%s.%s.%s.fa';
+
+my $tmpDir = tempdir("VagrentBuildCacheXXXXX", TMPDIR => 1, CLEANUP => 1);
+
+try {
+	my $opts = option_builder();
+	my $transList = parseTranscriptList($opts->{'transcripts'});
+	my $seqFiles = openSeqFiles($tmpDir,@{$opts->{'f'}});
+	my $chrList = parseFaiFile($opts->{'fai'});
+	my $ccdsList = parseCCDSfile($opts->{'c'});
+	my ($tmp_cache_file,$tmp_seq_file) = processFeatureFile($opts,$tmpDir,$chrList,$seqFiles,$transList,$ccdsList);
+	finaliseOutputFiles($opts,$tmp_cache_file,$tmp_seq_file);
+	
+} catch {
+  die "An error occurred while building reference support files\:\n\t$_"; # not $@
+};
+
+sub finaliseOutputFiles {
+  my ($opts,$tmp_cache,$tmp_seq) = @_;
+  my $cache_file = makeCacheFilePath($opts);
+  my $seq_file = makeSeqFilePath($opts);
+  
+  copy($tmp_seq,$seq_file) or croak("unable to copy sequence file to desination: $!");
+  capture_stderr {my $fa_index = Bio::DB::HTS::Faidx->new($seq_file)};
+
+  
+  my $snz_cmd = sprintf($SORT_N_BGZIP, $tmp_cache, $cache_file);
+  my ($snz_stdout, $snz_stderr, $snz_exit) = capture {system($snz_cmd);};
+  croak('Error in sorting and bgzipping the cache file: '.$snz_stderr) if $snz_exit > 0;
+  my $tabix_cmd = sprintf $TABIX, $cache_file;
+  my ($tbx_stdout, $tbx_stderr, $tbx_exit) = capture {system($tabix_cmd);};
+  croak('Error in indexing the cache file: '.$tbx_stderr) if $tbx_exit > 0;
+}
+
+sub makeCacheFilePath {
+  my $opts = shift;
+  my $file = sprintf $OUT_CACHE_FILE, $opts->{'sp'},$opts->{'as'},$opts->{'d'};
+  return File::Spec->catfile($opts->{'o'},$file);
+}
+
+sub makeSeqFilePath {
+  my $opts = shift;
+  my $file = sprintf $OUT_SEQ_FILE, $opts->{'sp'},$opts->{'as'},$opts->{'d'};
+  return File::Spec->catfile($opts->{'o'},$file);
+}
+
+sub processFeatureFile {
+  my ($opts,$tmpDir,$chrList,$seqFiles,$transList,$ccdsList) = @_;
+  my ($cache_fh, $cache_tmp_file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, SUFFIX => '.cache');
+  my ($seq_tmp_fh, $fa_tmp_file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, SUFFIX => '.fa');
+  my $seq_fh = Bio::SeqIO->new(-fh => $seq_tmp_fh, -format => 'fasta');
+  my $features = openFeatureFile($opts->{'features'});
+  my $c = 0;
+  my $store = undef;
+  my $gene_names = undef;
+  my $current_chr = undef;
+  my $current_chr_translation = undef;
+  while(my $feat = $features->next_feature()) {
+    my $chr = $feat->seq_id();
+    if(!defined $current_chr){
+      $current_chr = $chr;
+      $current_chr_translation = checkChromosome($current_chr,$chrList);
+      if (!defined $current_chr_translation){
+        print "Sequence $current_chr not found in fai file: no alternative found, skipping\n" if $opts->{'debug'};
+      } elsif ($current_chr ne $current_chr_translation){
+        print "Sequence $current_chr not found in fai file: using $current_chr_translation as alternative\n" if $opts->{'debug'};
+      }
+    }
+    next if $current_chr eq $chr && !defined $current_chr_translation;
+    my $tid = undef;
+    if($feat->has_tag($TAG_GENE_ID)){
+      my ($gene_id) = $feat->get_tag_values($TAG_GENE_ID);
+      unless(exists $gene_names->{$gene_id}){
+        foreach my $tag (@TAG_GENE_NAME){
+          next unless $feat->has_tag($tag);
+          my ($gn) = $feat->get_tag_values($tag);
+          $gene_names->{$gene_id} = $gn;
+          last;
+        }
+      }
+    }
+    if($feat->has_tag($TAG_TRANSCRIPT_ID)){
+      ($tid) = $feat->get_tag_values($TAG_TRANSCRIPT_ID);
+    } else {
+      if($feat->has_tag($TAG_PARENT)){
+        my ($parent) = $feat->get_tag_values($TAG_PARENT);
+        if($parent =~ m/^transcript:(.+)$/){
+          $tid = $1;
+        } else {
+          next;
+        }
+      } else {
+        next;
+      }
+    } 
+    next unless defined $tid;
+    if($current_chr ne $chr) {
+      # chromosome has ended, have to process transcripts
+      if(scalar(keys %$store) > 0){
+        foreach my $obj (@{processCompleteTranscripts($opts,$store,$current_chr_translation,$gene_names,$chrList,$ccdsList,$seqFiles)}){
+          writeOutput($obj,$cache_fh,$seq_fh);
+        }
+        $store = undef;
+      }
+      $current_chr = $chr;
+      $current_chr_translation = checkChromosome($current_chr,$chrList);
+      if (!defined $current_chr_translation){
+        print "Sequence $current_chr not found in fai file: no alternative found, skipping\n" if $opts->{'debug'};
+      } elsif ($current_chr ne $current_chr_translation){
+        print "Sequence $current_chr not found in fai file: using $current_chr_translation as alternative\n" if $opts->{'debug'};
+      }
+      next if !defined $current_chr_translation;
+    }  
+    
+    if(exists $store->{$tid}){
+      push @{$store->{$tid}}, $feat;
+    } elsif(exists $transList->{$tid}) {
+      push @{$store->{$tid}}, $feat;
+      $c++;
+    } else {
+      next;
+    }
+  }
+  if(scalar(keys %$store) > 0){
+    foreach my $obj (@{processCompleteTranscripts($opts,$store,$current_chr_translation,$gene_names,$chrList,$ccdsList,$seqFiles)}){
+      writeOutput($obj,$cache_fh,$seq_fh);
+    }
+  }
+  close $cache_fh;
+  close $seq_tmp_fh;
+  return ($cache_tmp_file,$fa_tmp_file);
+}
+
+sub writeOutput {
+  my ($trans,$cache,$fa) = @_;
+  my $seq = Bio::Seq->new(-display_id => $trans->getAccession(),-seq => $trans->getcDNASeq);
+  my ($e) = $trans->getExons();
+  my $cache_row;
+  try{
+    $cache_row = join("\t",$e->getChr,$trans->getGenomicMinPos - 1,$trans->getGenomicMaxPos,$trans->getAccession,$trans->getGeneName,length $trans->getcDNASeq);
+  } catch {
+    croak('An error occured outputting transcript '.$trans->getAccession.', please check input data');
+  };
+  $trans->{_cdnaseq} = undef;
+  $cache_row .= "\t".Dumper($trans)."\n";
+  
+  $fa->write_seq($seq);
+  print $cache $cache_row;
+}
+
+sub processCompleteTranscripts {
+  my ($opts,$store,$chr,$gene_names,$chrList, $ccdsList,$seqFiles) = @_;
+  my @out = ();
+  foreach my $trans_id (keys %$store){
+    my $converted = convertTranscript($opts,$store->{$trans_id},$trans_id,$gene_names,$ccdsList,$seqFiles,$chr);
+    push(@out,$converted) if defined $converted;
+  }
+  return \@out;
+}
+
+sub convertTranscript {
+  my ($opts,$raw_transcript,$transcript_id,$gene_names,$ccdsList,$seqFiles,$chr) = @_;
+  my $type = getGeneTypeForTranscript($opts,$raw_transcript);
+  return undef unless defined $type;
+  my $rnaLengthSum = 0;
+  my $strand = undef;
+  my @exons;
+  
+  my @raw_exons;
+  my @raw_cds;
+  my $gene_id = undef;
+  my $transcript_version = 1;
+  foreach my $e(@$raw_transcript){
+    if($e->has_tag($TAG_TRANSCRIPT_ID) && $e->has_tag($TAG_VERSION)){
+      ($transcript_version) = $e->get_tag_values($TAG_VERSION);
+    }
+    unless(defined $gene_id){
+      if($e->has_tag($TAG_GENE_ID)){
+        ($gene_id) = $e->get_tag_values($TAG_GENE_ID);
+      } elsif ($e->has_tag($TAG_PARENT)){
+        my ($parent) = $e->get_tag_values($TAG_PARENT);
+        my ($pt,$pv) = split ':',$parent;
+        if($pt eq 'gene'){
+          $gene_id = $pv;
+        }
+      }
+    }
+    push @raw_exons, $e if first { $_ eq $e->primary_tag } @PRIMARY_EXON;
+    push @raw_cds, $e if first { $_ eq $e->primary_tag } @PRIMARY_REGIONS;
+  }
+  return undef if scalar(@raw_exons) == 0;
+  return undef if scalar(@raw_cds) == 0 && $type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType();
+  my $gene_name;
+  if(exists $gene_names->{$gene_id} && defined $gene_names->{$gene_id}){
+    $gene_name = $gene_names->{$gene_id};
+  } else {
+    $gene_name = $gene_id
+  }
+  @raw_exons = sort featureOrderSortFunction @raw_exons; # sorting exons into transcript order
+  @raw_cds = sort featureOrderSortFunction @raw_cds; # sorting CDS fragments into transcript order
+  foreach my $e (@raw_exons){
+    my $rmin = $rnaLengthSum + 1;
+    my $rmax = $rnaLengthSum + $e->length;
+    $rnaLengthSum += $e->length;
+    my $convE = Sanger::CGP::Vagrent::Data::Exon->new(
+							  species => $opts->{'sp'},
+							  genomeVersion => $opts->{'as'},
+							  chr => $chr,
+							  minpos => $e->start,
+							  maxpos => $e->end,
+							  rnaminpos => $rmin,
+							  rnamaxpos => $rmax,);
+		
+		if(defined $strand){
+		  croak('Inconsistant strand in transcript') unless $strand == $e->strand;
+		} else {
+		  $strand = $e->strand;
+		}
+		push @exons, $convE;
+  }
+  my ($protAcc,$protAccVer,$cdsMin,$cdsMax,$cdsPhase,$ccds);
+  if($type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType()){
+    ($cdsMin,$cdsMax,$cdsPhase) = calculateCDS($opts,\@raw_cds,\@raw_exons, $rnaLengthSum, $strand,$transcript_id);
+    return undef unless(defined $cdsMin && defined $cdsMax && defined $cdsPhase);
+    $ccds = getCCDS($raw_transcript,$transcript_id,$ccdsList);
+    foreach my $cds(@raw_cds){
+      if($cds->has_tag($TAG_PROTEIN_ID)){
+        ($protAcc) = $cds->get_tag_values($TAG_PROTEIN_ID);
+        last;
+      }
+    }
+    $protAccVer = 1;
+    
+  } else {
+		# non-coding
+		$protAcc = undef;
+		$protAccVer = undef;
+		$cdsMin = undef;
+		$cdsMax = undef;
+		$cdsPhase = -1;
+	}
+  
+  my @sortedExons = sort {$a->getMinPos <=> $b->getMinPos} @exons;
+  my $transcript = Sanger::CGP::Vagrent::Data::Transcript->new(
+																		db => 'Ensembl',
+																		dbversion => $opts->{'d'},
+																		acc => $transcript_id,
+																		accversion => $transcript_version,
+																		proteinacc => $protAcc,
+																		proteinaccversion => $protAccVer,
+																		ccds => $ccds,
+																		genename => $gene_name,
+																		genetype => $type,
+																		strand => $strand,
+																		cdnaseq => getSequence($transcript_id,$transcript_version,$seqFiles),
+																		cdsminpos => $cdsMin,
+																		cdsmaxpos => $cdsMax,
+																		cdsphase => $cdsPhase,
+																		genomicminpos => $sortedExons[0]->getMinPos,
+																		genomicmaxpos => $sortedExons[-1]->getMaxPos,
+																		exons => \@exons,);
+    
+  return $transcript; 
+}
+
+sub getSequence {
+  my ($id,$version,$seqFiles) = @_;
+  my $out = undef;
+  my $idv = join('.',$id,$version);
+  foreach my $fai(@$seqFiles){
+    my $loc;
+    if($fai->has_sequence($id)){
+      my $length = $fai->length($id);
+      $loc = sprintf($SEQ_LOCATION,$id,1,$length);
+    } elsif ($fai->has_sequence($idv)){
+      my $length = $fai->length($idv);
+      $loc = sprintf($SEQ_LOCATION,$idv,1,$length);
+    } else {
+      next;
+    }
+    $out = $fai->get_sequence_no_length($loc);
+  }
+  return $out;
+}
+
+sub getCCDS {
+  my ($raw_transcript,$transcript_id,$ccdsList) = @_;
+  my $ccds = undef;
+  foreach my $feat(@$raw_transcript){
+    foreach my $tag(@TAG_CCDS){
+      if($feat->has_tag($tag)){
+        ($ccds) = $feat->get_tag_values($tag);
+        last;
+      }
+    }
+    last if defined $ccds;
+  }
+  unless(defined $ccds){
+    if(defined $ccdsList){
+      if(exists $ccdsList->{$transcript_id}){
+        $ccds = $ccdsList->{$transcript_id};
+      }
+    }
+  }
+  return $ccds;
+}
+
+sub calculateCDS {
+  my ($opts,$feats,$exons,$rnaLength, $strand,$transcript_id) = @_;
+  my $utr5Length = 0;
+  my $cdsLength = 0;
+  my $utr3Length = 0;
+  my $stopCodonLength = 0;
+  my @cds;
+  foreach my $f (@$feats){
+    if($f->primary_tag eq $PRIMARY_UTR5P){
+      $utr5Length += $f->length;
+    } elsif($f->primary_tag eq $PRIMARY_CDS){
+      $cdsLength += $f->length;
+      push @cds, $f;
+    } elsif($f->primary_tag eq $PRIMARY_UTR3P){
+      $utr3Length += $f->length;
+    } elsif($f->primary_tag eq $PRIMARY_STOP){
+      $stopCodonLength += $f->length;
+    }
+  }  
+  my $utr5Length_e = 0;
+  my $cdsLength_e = 0;
+  my $utr3Length_e = 0;
+  
+  my $cds_first = $cds[0];
+  my $cds_last = $cds[-1];
+  
+  my $break = 0;
+  my $mess = '';
+  
+  foreach my $e(@$exons){
+    croak('Recieved undefined exon: '.Dumper($exons)) unless defined($e);
+    # remember start = min, end = max.  start and end DO NOT infer orientation in BIOPERL
+    if($strand > 0){
+      # forward strand start = min = start of feature, end = max = end of feature
+      if($e->end < $cds_first->start){
+        # exon ends before the CDS starts
+        # 5 prime UTR exon
+        $utr5Length_e += $e->length;
+      } elsif ($e->end < $cds_last->start && $e->start <= $cds_first->start && $e->end >= $cds_first->start){
+        # exon ends before last CDS fragment starts
+        # and first CDS fragment starts within this exon.
+        $utr5Length_e += $cds_first->start - $e->start;
+        $cdsLength_e += $e->length - ($cds_first->start - $e->start);
+      } elsif ($e->start >= $cds_first->start && $e->end <= $cds_last->end){
+        # exon starts after the first CDS fragment starts
+        # and exon ends before the last CDS fragment ends
+        # exon is entirely within the CDS
+        $cdsLength_e += $e->length; 
+      } elsif ($cds_first->start == $cds_last->start && $cds_first->end == $cds_last->end && $e->start <= $cds_first->start && $e->end >= $cds_last->end){
+        # single CDS fragment contained entirely within this exon  
+        $utr5Length_e += $cds_first->start - $e->start;
+        $cdsLength_e += $e->length - (($cds_first->start - $e->start) + ($e->end - $cds_last->end));
+        $utr3Length_e += $e->end - $cds_last->end;
+      } elsif ($e->start > $cds_first->end && $e->start <= $cds_last->end && $e->end >= $cds_last->end){  
+        # exon starts after the first CDS fragment ends
+        # and the last CDS fragment ends within this exon
+        $cdsLength_e += $e->length - ($e->end - $cds_last->end);
+        $utr3Length_e += $e->end - $cds_last->end;
+      } elsif ($e->start > $cds_last->end){
+        # exon starts after the CDS ends
+        $utr3Length_e += $e->length;
+      } else {
+        warn "\n",join(' - ',$utr5Length,$cdsLength,$stopCodonLength,$utr3Length),"\n";
+        warn join(' - ',$utr5Length_e,$cdsLength_e,0,$utr3Length_e),"\n";
+        warn "\n";
+        foreach (@cds){warn "CDS : ",Dumper($_),"\n"};
+        warn "\n";
+        foreach (@$exons){warn "EXON : ",Dumper($_),"\n"};
+        warn "\n";
+        foreach ($cds_first,$cds_last){warn "CDS BOUNDRIES : ",Dumper($_),"\n"};   
+        warn "\n";
+        warn "PROC : ",Dumper($e),"\n";
+        croak('unhandled exon');
+      }
+      
+            
+    } elsif($strand < 0){
+      # reverse strand start = min = end of feature, end = max = start of feature
+      if($e->start > $cds_first->end){
+        # exon ends (start) before the CDS starts (end)
+        # 5 prime UTR exon
+        $utr5Length_e += $e->length;
+      } elsif ($e->start > $cds_last->end && $e->end >= $cds_first->end && $e->start <= $cds_first->end){
+        # exon ends (start) before last CDS fragment starts (end)
+        # and first CDS fragment starts (end) within this exon
+        $utr5Length_e += $e->end - $cds_first->end;
+        $cdsLength_e += $e->length - ($e->end - $cds_first->end);
+      } elsif ($e->end <= $cds_first->end && $e->start >= $cds_last->start){
+        # exon starts (end) after the first CDS fragment starts (end)
+        # and exon ends (start) before the last CDS fragment ends (start)
+        # exon is entirely within the CDS
+        $cdsLength_e += $e->length;
+      } elsif ($cds_first->end == $cds_last->end && $cds_first->start == $cds_last->start && $e->end >= $cds_first->end && $e->start <= $cds_last->start){
+        # single CDS fragment contained entirely within this exon   
+        $utr5Length_e += $e->end - $cds_first->end;
+        $cdsLength_e += $e->length - (($e->end - $cds_first->end) + ($cds_last->start - $e->start));
+        $utr3Length_e += $cds_last->start - $e->start;
+      } elsif ($e->end < $cds_first->start && $e->end >= $cds_last->start && $e->start <= $cds_last->start){
+        # exon starts (end) after the first CDS fragment ends (start)
+        # and the last CDS fragment ends in this exon
+        $cdsLength_e += $e->length - ($cds_last->start - $e->start);
+        $utr3Length_e += $cds_last->start - $e->start;
+      } elsif ($e->end < $cds_last->start){
+        # exons starts (end) after CDS ends (start)
+        $utr3Length_e += $e->length;
+      } else {
+        warn "\n",join(' - ',$utr5Length,$cdsLength,$stopCodonLength,$utr3Length),"\n";
+        warn join(' - ',$utr5Length_e,$cdsLength_e,0,$utr3Length_e),"\n";
+        warn "\n";
+        foreach (@cds){warn "CDS : ",Dumper($_),"\n"};
+        warn "\n";
+        foreach (@$exons){warn "EXON : ",Dumper($_),"\n"};
+        warn "\n";
+        foreach ($cds_first,$cds_last){warn "CDS BOUNDRIES : ",Dumper($_),"\n"};   
+        warn "\n";
+        warn "PROC : ",Dumper($e),"\n";
+        croak('unhandled exon');
+      }
+    } else {
+      croak('Cannot process strand: ',$strand);
+    }
+  }
+  
+  # length calculation double check
+  
+  if($utr5Length + $utr3Length > 0){
+    # UTR features present in input file, check all 3 values, skip if data is inconsistant
+    return (undef,undef,undef) unless($utr5Length == $utr5Length_e && $cdsLength == $cdsLength_e && $utr3Length == $utr3Length_e);
+  } else {
+    # only compare the CDS length, skip if data is inconsistant
+    return (undef,undef,undef) unless($cdsLength == $cdsLength_e);
+  }
+  
+  my $cds_min = $utr5Length_e + 1;
+  my $cds_max = $utr5Length_e + $cdsLength_e + $stopCodonLength;
+  my $cds_phase = undef;
+  if(looks_like_number($cds_first->frame)){
+    if($cds_first->frame > 0){
+      $cds_phase = 3 - $cds_first->frame;
+    } else {
+      $cds_phase = 0;
+    }
+  } else {
+    return (undef,undef,undef);
+  }
+  return ($cds_min,$cds_max,$cds_phase);
+}
+
+sub getGeneTypeForTranscript {
+  my ($opts,$t) = @_;
+  
+  my $type = undef;
+  my $type_tag = undef;
+  
+  foreach my $f(@$t){ 
+    foreach my $tag(@TAG_BIOTYPE){
+      if($f->has_tag($tag)){
+        my ($raw_type) = $f->get_tag_values($tag);
+        if(!defined $type){
+          $type = $raw_type;
+          $type_tag = $tag;
+        } elsif($type ne $raw_type){
+          croak('Transcript has inconsistant biotype');
+        }            
+      }
+    }
+  }
+  if($type eq 'protein_coding'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType();
+	} elsif($type eq 'miRNA'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getMicroRnaType();
+	} elsif($type eq 'lincRNA'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getLincRnaType();
+	} elsif($type eq 'snoRNA'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getSnoRnaType();
+	} elsif($type eq 'snRNA'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getSnRnaType();
+	} elsif($type eq 'rRNA'){
+		return Sanger::CGP::Vagrent::Data::Transcript::getRRnaType();
+	} else {
+	  if($type_tag eq $TAG_GENE_BIOTYPE){
+	    # this can happen, there can be a discrepancy between gene and transcript biotype in ensembl.  
+	    # If they are only reporting the gene biotype and its not one we can handle we have to skip the transcript.
+	    return undef;
+	  } else {
+	    if($opts->{'debug'}){
+        foreach (@$t){
+          warn Dumper($_),"\n";
+        }
+      }
+      croak("Unhandled biotype: $type");
+	  }
+	}
+}
+
+sub featureOrderSortFunction {
+  my $tag = undef;
+  
+  if($a->strand != $b->strand){
+    croak('Features strands not consistent');
+  }
+  
+  if($a->strand == 1 && $a->strand == $b->strand){
+    return $a->start <=> $b->start;
+  } elsif($a->strand == -1 && $a->strand == $b->strand){
+    return $b->start <=> $a->start;
+  } else {
+    croak('unknown strand type: ',$a->strand);
+  }
+}
+
+sub checkChromosome {
+  my ($chr,$chrList) = @_;
+  my $use_chr = undef;
+  if(first { $_ eq $chr } @$chrList){
+    $use_chr = $chr;
+  } else {
+    # chromosome not in fai file
+    # lets try chr prefix discrepancies 
+    if($chr =~ m/^chr(.+)$/){
+      # chromosome starts with chr, try removing and search again.
+      $use_chr = $1 if first { $_ eq $1 } @$chrList;
+    } elsif($chr !~ m/^chr/){
+      # chromosome doesn't start with chr, try adding
+      $use_chr = 'chr'.$chr if first { $_ eq 'chr'.$chr } @$chrList;
+    }
+    if(!defined $use_chr){
+      # still not found a translation
+      # check for mitochondria, its a special case
+      my $mito_check;
+      if($chr =~ m/^chr(.+)$/){
+        $mito_check = $1;
+      } else {
+        $mito_check = $chr;
+      }
+      if(first { $_ eq $mito_check } @MITO_ALIASES){
+        # this is mitochondria
+        # check fai file for aliases, with and without chr prefixes.
+        foreach my $m(@MITO_ALIASES){
+          my $chrM = 'chr'.$m;
+          if(first { $_ eq $m } @$chrList){
+            $use_chr = $m;
+            last;
+          } elsif (first { $_ eq $chrM } @$chrList){
+            $use_chr = $chrM;
+            last;
+          }
+        }
+      }
+    }
+  }
+  return $use_chr;
+}
+
+sub openFeatureFile {
+  my $file = shift;
+  my $fileType = File::Type->new->checktype_filename($file);
+  my $fh;
+  if(first { $_ eq $fileType } @ZIP_TYPES){
+    # feature file is zipped
+    open $fh, "zcat $file |" or croak("unable to open feature input file: $file");
+  } elsif(first { $_ eq $fileType } @TEXT_TYPES){
+    # feature file is plain text
+    open $fh, "<$file" or croak("unable to open feature input file: $file");
+  } else {
+    croak('Unsupported file type: '.$fileType);
+  }
+  my $gff_version;
+  if($file =~ m/\.gtf(?:\.gz)?/){
+    $gff_version = 2.5;
+  } elsif($file =~ m/\.gff3(?:\.gz)?/){
+    $gff_version = 3;
+  } else {
+    croak('Unable to determine format of '.$file);
+  } 
+  
+  return Bio::Tools::GFF->new(-fh => $fh, -gff_version => $gff_version);
+}
+
+sub openSeqFiles {
+  my ($tmpDir,@files) = @_;
+  my $out;
+  foreach my $file (@files){
+    my $fileType = File::Type->new->checktype_filename($file);
+    if(first { $_ eq $fileType } @ZIP_TYPES){
+      # File zipped, going to have to re-zip it with bgzip
+      my $unzipped = makeTmpFilePath($tmpDir,'.fa');
+      my $zipped = makeTmpFilePath($tmpDir,'.fa.gz');
+      my $unzip_cmd = sprintf($UNZIP, $file, $unzipped);
+      my $bgzip_cmd = sprintf($BGZIP, $unzipped, $zipped);
+      my ($uz_stdout, $uz_stderr, $uz_exit) = capture {system($unzip_cmd);};
+      croak($uz_stderr) if $uz_exit > 0;
+      my ($bgz_stdout, $bgz_stderr, $bgz_exit) = capture {system($bgzip_cmd);};
+      croak($bgz_stderr) if $bgz_exit > 0;
+      
+      $file = $zipped;
+    } elsif(first { $_ eq $fileType } @TEXT_TYPES){
+      # plain text file, needs zipping
+      my $zipped = makeTmpFilePath($tmpDir,'.fa.gz');
+      my $bgzip_cmd = sprintf($BGZIP, $file, $zipped);
+      my ($bgz_stdout, $bgz_stderr, $bgz_exit) = capture {system($bgzip_cmd);};
+      croak($bgz_stderr) if $bgz_exit > 0;
+      $file = $zipped;
+    } else {
+      croak('Unsupported file type: '.$fileType)
+    }
+    # HTS call prints to stderr if it has to generate an fai file, which it will here
+    # feels dirty but this makes it quiet.
+    capture_stderr {push @$out, Bio::DB::HTS::Faidx->new($file)};
+  } 
+  return $out;
+}
+
+sub parseCCDSfile {
+  my $file = shift;
+  return undef unless defined $file;
+  open my $fh, "<$file" or croak("unable to open CCDS file: ".$file);
+  my @data = <$fh>;
+  close $fh;
+  my $out;
+  foreach my $d(@data){
+    next if $d !~ m/^CCDS\d/;
+    my @cols = split /\s+/,$d;
+    my $ccds = $cols[0];
+    my $transcript_id = $cols[4];
+    $out->{$transcript_id} = $ccds;
+  }
+  return $out;
+}
+
+sub parseFaiFile {
+  my $fai = shift;
+  return undef unless defined $fai;
+  open my $fh, "<$fai" or croak("unable to open fai file: ".$fai);
+  my @data = <$fh>;
+  close $fh;
+  my @out;
+  foreach my $d(@data){
+    my ($name) = split /\s+/,$d;
+    push @out, $name;
+  }
+  return \@out;
+}
+
+sub parseTranscriptList {
+  my $file = shift;
+  open my $fh, "<$file" or croak('Unable to open transcript list file: '.$file);
+  my $out;
+  while(<$fh>){
+    my $line = $_;
+    chomp $line;
+    $out->{$line} = 1;
+  }
+  close $fh;
+  return $out;
+}
+
+sub makeTmpFilePath {
+  my ($tmpDir,$ext) = @_;
+  my $file;
+  (undef, $file) = tempfile('VagrentBuildCacheXXXXX', DIR => $tmpDir, OPEN => 0, SUFFIX => $ext);
+  return $file;
+}
+
+sub option_builder {
+	my ($factory) = @_;
+	my %opts = ();
+	my $result = &GetOptions (
+		'h|help' => \$opts{'h'},
+		'f|fasta=s@' => \$opts{'f'},
+		'gf|features=s' => \$opts{'features'},
+		't|transcripts=s' => \$opts{'transcripts'},
+		'o|output=s' => \$opts{'o'},
+		'sp|species=s' => \$opts{'sp'},
+		'as|assembly=s' => \$opts{'as'},
+		'd|database=s' => \$opts{'d'},
+		'c|ccds=s' => \$opts{'c'},
+		'fai|fai=s' => \$opts{'fai'},
+		'debug|debug' => \$opts{'debug'},
+  );
+
+  pod2usage() if($opts{'h'});
+  if(scalar(@{$opts{'f'}}) > 0){
+    foreach my $f(@{$opts{'f'}}){
+      pod2usage("Unable to read the transcript sequence file: $f") unless -e $f && -r $f;
+    }
+  } else {
+    pod2usage('Must specify one or more fasta sequence file as input');
+  }
+
+  pod2usage('Must specify the input gtf/gff3 reference file') unless defined $opts{'features'} && -e $opts{'features'};
+  pod2usage('Must specify transcript list file') unless defined $opts{'transcripts'} && -e $opts{'transcripts'};
+  pod2usage('Must specify the output directory') unless defined $opts{'o'} && -e $opts{'o'} && -d $opts{'o'} ;
+  pod2usage('Must specify the species') unless defined $opts{'sp'};
+  pod2usage('Must specify the genome version') unless defined $opts{'as'};
+  pod2usage('Must specify the data version') unless defined $opts{'d'};
+  if(defined($opts{'c'})){
+  	pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'};
+  }
+  if(defined($opts{'fai'})){
+  	pod2usage('Unable to read the fai file: '.$opts{'fai'}) unless -e $opts{'fai'} && -r $opts{'fai'};
+  }
+  return \%opts;
+}
+
+
+
+__END__
+
+=head1 NAME
+
+Admin_CacheFileBuilder.pl - Generates the Vagrent reference data set from the supplied GFF£/GTF and Fasta files 
+
+=head1 SYNOPSIS
+
+Admin_CacheFileBuilder.pl [-h] [-f /path/to/sequence.fa] [-gff /path/to/annotation.gff] [-sp human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-c /path/to/CCDS2Sequence.version.txt]
+
+  Required Options:
+
+    --output       (-o)     Output directory
+
+    --fasta        (-f)     Fasta file containing Transcript sequences, can be specified multiple times.
+
+    --features     (-gf)   GFF3 or GTF file containing genomic annotation
+    
+    --transcripts  (-t)     Transcript accession file, (simple list, one accession per line)
+
+    --species      (-sp)    Species (ie human, mouse)
+
+    --assembly     (-as)    Assembly version (ie GRCh37, GRCm38)
+
+    --data_version (-d)     Reference version number (Ensemble example: homo_sapiens_core_74_37p)
+    
+    Other Options:
+    
+    --help         (-h)     Brief documentation   
+
+    --ccds         (-c)     (Recommended) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS
+    
+    --fai          (-fai)   (Recommended) The samtools fasts index file (.fai) for your reference genome
+                              This is the reference genome that your bam and vcf files will be mapped to
+                              
+    --debug        (-debug) This turns on the debug output, useful if you are having issues with a specific Ensembl build                  
+=cut
\ No newline at end of file
diff --git a/bin/Admin_EnsemblGtf2CacheConverter.pl b/bin/Admin_EnsemblGtf2CacheConverter.pl
deleted file mode 100755
index 83d9e3c..0000000
--- a/bin/Admin_EnsemblGtf2CacheConverter.pl
+++ /dev/null
@@ -1,400 +0,0 @@
-#!/usr/bin/perl
-
-##########LICENCE##########
-# Copyright (c) 2014 Genome Research Ltd.
-#
-# Author: Cancer Genome Project cgpit@sanger.ac.uk
-#
-# This file is part of VAGrENT.
-#
-# VAGrENT is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Affero General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option) any
-# later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-##########LICENCE##########
-
-
-use strict;
-use English qw(-no_match_vars);
-use warnings FATAL => 'all';
-use Carp;
-
-use FindBin qw($Bin);
-use lib "$Bin/../lib";
-
-use Getopt::Long;
-use Pod::Usage;
-use Try::Tiny;
-
-use Data::Dumper;
-use File::Type;
-use Const::Fast qw(const);
-
-use Bio::DB::HTS;
-
-use Sanger::CGP::Vagrent::Data::Transcript;
-use Sanger::CGP::Vagrent::Data::Exon;
-
-$Data::Dumper::Indent = 0;
-
-const my @TEXT_TYPES => qw(application/octet-stream);
-
-const my @ZIP_TYPES => qw(application/x-gzip);
-
-const my $PROTEIN_CODING => 'protein_coding';
-const my $EXON_TYPE => 'exon';
-const my $CDS_START_TYPE => 'start_codon';
-const my $CDS_STOP_TYPE => 'stop_codon';
-const my $CDS_TYPE => 'CDS';
-
-try {
-	my $opts = option_builder();
-	my $ccds = undef;
-	my $lookup = parseFaiFile($opts->{'fai'});
-	$ccds = parseCCDSFile($opts->{'c'}) if defined $opts->{'c'};
-	convertGtf($opts,$lookup,$ccds);
-} catch {
-  die "An error occurred while building reference support files\:\n\t$_"; # not $@
-};
-
-sub convertGtf {
-  my ($opts,$lookup,$ccds) = @_;
-  my $type = getInputFileType($opts->{'g'});
-	my $fasta = Bio::DB::HTS::Fai->load($opts->{'f'});
-	my ($inFh,$outFh);
-	if($type eq 'text') {
-		open $inFh, '<'.$opts->{'g'} || croak("unable to open gtf input file:".$opts->{'g'});
-	} elsif($type eq 'zip'){
-		open $inFh, 'zcat '.$opts->{'g'}.' |' || croak("unable to open gtf input file:".$opts->{'g'});
-	}
-  open $outFh, ">".$opts->{'o'} || croak("unable to open output file:".$opts->{'o'});
-  my $c = 0;
-	my $wip;
-	my $currentChr = undef;
-  my $lc = 0;
-  while(<$inFh>){
-    $lc++;
-    next if m/^#/;
-    my $line = $_;
-    chomp $line;
-
-    my $acc;
-
-    my ($chr,$bioType,$lineType, $start, $end, $strand, $frame, $tmpAttr, %attr);
-
-    ($chr,$bioType,$lineType, $start, $end, undef, $strand, $frame, $tmpAttr) = split /\t/, $line;
-
-    foreach my $t(split /\; /, $tmpAttr){
-      my ($k,$v) = split /\s/,$t,2;
-      $attr{$k} = $v;
-    }
-
-    if(exists $attr{'transcript_biotype'} && defined $attr{'transcript_biotype'} ){
-      $bioType = unquoteValue($attr{'transcript_biotype'});
-    }
-
-    if(exists $attr{'transcript_id'} && defined $attr{'transcript_id'}){
-			$acc = unquoteValue($attr{'transcript_id'});
-			unless(exists $lookup->{$acc}) {
-        next unless(exists $attr{'transcript_version'});
-			  $acc .= '.'.unquoteValue($attr{'transcript_version'});
-			  next unless exists $lookup->{$acc};
-			}
-		} else {
-			next;
-		}
-    $currentChr = $chr unless defined $currentChr;
-    if($currentChr ne $chr) {
-      # changed chr, any transcript on previous chr must be finished
-      foreach my $t (processCompleted($opts,$fasta,$lookup,$ccds,$wip,$currentChr)){
-				writeTranscript($outFh,$t,$wip->{$t->getAccession});
-				delete $wip->{$t->getAccession};
-			}
-			$currentChr = $chr;
-    }
-    my @store = ($chr,$start,$end,$strand,$frame,unquoteValue($attr{'exon_number'}));
-    push(@{$wip->{$acc}->{'lines'}->{$lineType}},\@store);
-    unless(exists $wip->{$acc}->{'type'}){
-      $c++;
-      $wip->{$acc}->{'type'} = $bioType;
-			$wip->{$acc}->{'acc'} = $acc;
-			if(exists $attr{'gene_name'}) {
-			  $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_name'});
-			}
-			elsif(exists $attr{'gene_id'}) {
-			  $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_id'});
-			}
-			else {
-			  croak "Cannot identify gene name or ID for structure: ".Dumper(\%attr);
-			}
-      $wip->{$acc}->{'CCDS'} = unquoteValue($attr{'ccds_id'}) if exists $attr{'ccds_id'};
-    }
-    if($lineType eq $CDS_TYPE && !defined $wip->{$acc}->{'protacc'}){
-			$wip->{$acc}->{'protacc'} = unquoteValue($attr{'protein_id'});
-		}
-  }
-  foreach my $t (processCompleted($opts,$fasta,$lookup,$ccds,$wip,$currentChr)){
-    writeTranscript($outFh,$t,$wip->{$t->getAccession});
-    delete $wip->{$t->getAccession};
-  }
-  close $outFh;
-	close $inFh;
-}
-
-sub processCompleted {
-	my ($opts,$fasta,$lookup,$ccds,$wip,$currentChr) = @_;
-	my @trans;
-	foreach my $acc (keys %$wip){
-		#warn Dumper($wip->{$acc});
-		if(exists $wip->{$acc}->{'lines'}->{$EXON_TYPE} && $wip->{$acc}->{'lines'}->{$EXON_TYPE}->[0]->[0] eq $currentChr){
-			my $ccdsId = undef;
-      if(exists $ccds->{$acc} && defined $ccds->{$acc}){
-        $ccdsId = $ccds->{$acc};
-      } elsif(exists $wip->{$acc}->{'CCDS'} && defined $wip->{$acc}->{'CCDS'}){
-        $ccdsId = $wip->{$acc}->{'CCDS'};
-      }
-      my $t = convertTranscript($opts,$fasta,$lookup->{$acc},$ccdsId,$wip->{$acc});
-			push @trans, $t;
-		}
-	}
-	return @trans;
-}
-
-sub convertTranscript {
-	my ($opts,$fasta,$tlength,$ccds,$data) = @_;
-	my $type = getGeneTypeForTranscript($data->{'type'});
-	my @exons;
-	my $strand = undef;
-	my $rnaLengthSum = 0;
-	my @sortedExonLines = sort {$a->[5] <=> $b->[5]} @{$data->{'lines'}->{$EXON_TYPE}};
-	foreach my $rawE (@sortedExonLines){
-		my $elength = ($rawE->[2] - $rawE->[1]) + 1;
-		my $rmin = $rnaLengthSum + 1;
-		my $rmax = $rnaLengthSum + $elength;
-		$rnaLengthSum += $elength;
-		my $convE = Sanger::CGP::Vagrent::Data::Exon->new(
-							species => $opts->{'sp'},
-							genomeVersion => $opts->{'as'},
-							chr => $rawE->[0],
-							minpos => $rawE->[1],
-							maxpos => $rawE->[2],
-							rnaminpos => $rmin,
-							rnamaxpos => $rmax,);
-
-		unless(defined $strand){
-			if($rawE->[3] eq '+'){
-				$strand = 1;
-			} elsif($rawE->[3] eq '-'){
-				$strand = -1;
-			} else {
-				croak "Expecting strand of + or -, recieved: ".$rawE->[3];
-			}
-		}
-		push(@exons,$convE);
-	}
-
-	my ($protAcc,$protAccVer,$cdsMin,$cdsMax,$cdsPhase);
-
-	if($type eq Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType()){
-		# protein coding
-		$protAcc = $data->{'protacc'};
-		$protAccVer = 1;
-		my $cdsLength = 0;
-		my @sortedCdsLines = sort {$a->[5] <=> $b->[5]} @{$data->{'lines'}->{$CDS_TYPE}};
-		foreach my $cds (@sortedCdsLines){
-			$cdsLength += ($cds->[2] - $cds->[1]) + 1;
-		}
-		# find CDS start
-		my $fivePrimeUtrExonLength = 0;
-		foreach my $e (@sortedExonLines){
-			if($sortedCdsLines[0]->[5] eq $e->[5]){
-				if($strand > 0){
-					$cdsMin = $fivePrimeUtrExonLength + (($sortedCdsLines[0]->[1] - $e->[1]) + 1);
-					$cdsMax = ($cdsMin + $cdsLength) - 1;
-				} else {
-					$cdsMin = $fivePrimeUtrExonLength + (($e->[2] - $sortedCdsLines[0]->[2]) + 1);
-					$cdsMax = ($cdsMin + $cdsLength) - 1;
-				}
-				last;
-			} else {
-				$fivePrimeUtrExonLength += ($e->[2] - $e->[1]) + 1;
-			}
-		}
-		$cdsMax += 3 if exists $data->{'lines'}->{$CDS_STOP_TYPE};
-		$cdsPhase = 0;
-    if($sortedCdsLines[0]->[4] > 0){
-      $cdsPhase = 3 - $sortedCdsLines[0]->[4];
-    }
-	} else {
-		# non-coding
-		$protAcc = undef;
-		$protAccVer = undef;
-		$cdsMin = undef;
-		$cdsMax = undef;
-		$cdsPhase = -1;
-	}
-
-	my @sortedExons = sort {$a->getMinPos <=> $b->getMinPos} @exons;
-	my $convT = Sanger::CGP::Vagrent::Data::Transcript->new(
-																		db => 'Ensembl',
-																		dbversion => $opts->{'d'},
-																		acc => $data->{'acc'},
-																		accversion => 1,
-																		proteinacc => $protAcc,
-																		proteinaccversion => $protAccVer,
-																		ccds => $ccds,
-																		genename => $data->{'gene'},
-																		genetype => $type,
-																		strand => $strand,
-																		cdnaseq => $fasta->seq($data->{'acc'},1,$tlength),
-																		cdsminpos => $cdsMin,
-																		cdsmaxpos => $cdsMax,
-																		cdsphase => $cdsPhase,
-																		genomicminpos => $sortedExons[0]->getMinPos,
-																		genomicmaxpos => $sortedExons[-1]->getMaxPos,
-																		exons => \@exons,);
-	return $convT;
-}
-
-sub writeTranscript {
-	my ($fh,$t,$rawT) = @_;
-	eval {
-    print $fh join("\t",$rawT->{'lines'}->{$EXON_TYPE}->[0]->[0],$t->getGenomicMinPos - 1,
-                        $t->getGenomicMaxPos,$t->getAccession,$t->getGeneName,length $t->getcDNASeq);
-    1;
-	};
-	if($@) {
-	  die "\nTranscript Object: ".Dumper($t)."\n\nExon_Type layer: ".Dumper($rawT->{'lines'}->{$EXON_TYPE})."\n\nERROR: Abandon hope, Ensemble structure has changed\n";
-	}
-	$t->{_cdnaseq} = undef;
-	print $fh "\t",Dumper($t),"\n";
-}
-
-sub getGeneTypeForTranscript {
-	my ($t) = @_;
-	if($t eq 'protein_coding'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getProteinCodingType();
-	} elsif($t eq 'miRNA'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getMicroRnaType();
-	} elsif($t eq 'lincRNA'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getLincRnaType();
-	} elsif($t eq 'snoRNA'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getSnoRnaType();
-	} elsif($t eq 'snRNA'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getSnRnaType();
-	} elsif($t eq 'rRNA'){
-		return Sanger::CGP::Vagrent::Data::Transcript::getRRnaType();
-	} else {
-		croak("Unhandled type: $t");
-	}
-}
-
-sub unquoteValue {
-	my $val = shift;
-	$val =~ s/[\";]//g if defined $val;
-	return $val;
-}
-
-sub parseCCDSFile {
-	my $file = shift;
-	my $out = undef;
-	open my $fh, "<$file" || croak ("unable to open $file for reading");
-	while(<$fh>){
-		my @data = split /\s+/, $_;
-		next unless $data[4] =~ m/^ENST/;
-		next unless $data[2] == 1;
-		$out->{$data[4]} = $data[0];
-	}
-	close $fh;
-	return $out;
-}
-
-sub parseFaiFile {
-	my $fai = shift;
-	open my $fh, "<$fai" || croak("unable to open fai file: $fai");
-	my $out;
-	while (<$fh>) {
-		my ($enst,$length) = split /\s+/;
-		$out->{$enst} = $length;
-	}
-	return $out;
-}
-
-sub getInputFileType {
-	my $infile = shift;
-	my $fileType = File::Type->new->checktype_filename($infile);
-	foreach my $t (@TEXT_TYPES){
-		return 'text' if($t eq $fileType);
-	}
-	foreach my $t (@ZIP_TYPES) {
-		return 'zip' if($t eq $fileType);
-	}
-	return $fileType;
-}
-
-sub option_builder {
-	my ($factory) = @_;
-	my %opts = ();
-	my $result = &GetOptions (
-		'h|help' => \$opts{'h'},
-		'f|fasta=s' => \$opts{'f'},
-		'g|gtf=s' => \$opts{'g'},
-		'o|output=s' => \$opts{'o'},
-		'sp|species=s' => \$opts{'sp'},
-		'as|assembly=s' => \$opts{'as'},
-		'd|database=s' => \$opts{'d'},
-		'c|ccds=s' => \$opts{'c'},
-  );
-
-  pod2usage() if($opts{'h'});
-  pod2usage('Must specify the input fasta reference file') unless defined $opts{'f'} && -e $opts{'f'};
-  my $fai = $opts{'f'} .'.fai';
-  pod2usage("fasta index reference file not found: $fai") unless -e $fai;
-  $opts{'fai'} = $fai;
-  pod2usage('Must specify the input gtf reference file') unless defined $opts{'g'} && -e $opts{'g'};
-  pod2usage('Must specify the output file to use') unless defined $opts{'o'};
-  pod2usage('Must specify the species') unless defined $opts{'sp'};
-  pod2usage('Must specify the genome version') unless defined $opts{'as'};
-  pod2usage('Must specify the ensembl core database version') unless defined $opts{'d'};
-  if(defined($opts{'c'})){
-  	pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'};
-  }
-  return \%opts;
-}
-
-__END__
-
-=head1 NAME
-
-Admin_EnsemblGtf2CacheConverter.pl - Generates the Vagrent cache file from the ensembl GTF file
-
-=head1 SYNOPSIS
-
-Admin_EnsemblGtf2CacheConverter.pl [-h] [-f /path/to/ensembl.fa] [-g /path/to/ensembl.gtf] [-o /path/to/output.file] [-sp human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-c /path/to/CCDS2Sequence.version.txt]
-
-  General Options:
-
-    --help         (-h)     Brief documentation
-
-    --fasta        (-f)     Fasta file containing the transcripts to be imported, fai file must also be present.
-
-    --gtf          (-g)     Ensembl GTF file for converting
-
-    --output       (-o)     Output file
-
-    --species      (-sp)    Species (ie human, mouse)
-
-    --assembly     (-as)  Assembly version (ie GRCh37, GRCm38)
-
-    --database     (-d)     Ensembl core database version number (ie homo_sapiens_core_74_37p)
-
-    --ccds         (-c)     (Optional, but strongly advised) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS
-=cut
diff --git a/bin/Admin_EnsemblReferenceFileGenerator.pl b/bin/Admin_EnsemblReferenceFileGenerator.pl
old mode 100755
new mode 100644
index 074cba0..aa6a4c0
--- a/bin/Admin_EnsemblReferenceFileGenerator.pl
+++ b/bin/Admin_EnsemblReferenceFileGenerator.pl
@@ -1,9 +1,9 @@
 #!/usr/bin/perl
 
 ##########LICENCE##########
-# Copyright (c) 2014 Genome Research Ltd.
+# Copyright (c) 2018 Genome Research Ltd.
 #
-# Author: Cancer Genome Project cgpit@sanger.ac.uk
+# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk
 #
 # This file is part of VAGrENT.
 #
@@ -21,7 +21,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 ##########LICENCE##########
 
-
 use strict;
 use English qw(-no_match_vars);
 use warnings FATAL => 'all';
@@ -34,6 +33,7 @@
 use Getopt::Long;
 use Pod::Usage;
 use Try::Tiny;
+use Capture::Tiny qw(capture);
 use LWP::Simple;
 use IPC::System::Simple qw(run);
 use Net::FTP;
@@ -46,8 +46,8 @@
 use Data::Dumper;
 
 const my @ENSMBL_REF_FILE_EXTENTIONS => qw(cdna.all.fa.gz ncrna.fa.gz);
-const my $FASTA_FILTER_SCRIPT => 'Admin_EnsemblTranscriptFastaFilter.pl';
-const my $GTF_CONVERSION_SCRIPT => 'Admin_EnsemblGtf2CacheConverter.pl';
+const my $TRANSCRIPT_FILTER_SCRIPT => 'Admin_EnsemblTranscriptFilter.pl';
+const my $BUILDER_SCRIPT => 'Admin_CacheFileBuilder.pl';
 const my $FILTERED_FASTA_SUFFIX => 'vagrent.fa';
 const my $CACHE_SUFFIX_GZ => 'vagrent.cache.gz';
 const my $CACHE_SUFFIX_RAW => 'vagrent.cache.raw';
@@ -55,153 +55,162 @@
 const my $ENSEMBL_SPECIES_ASSEMBLY => qr/([^\.]+?)\.(.+?)\./;
 const my $ENSEMBL_VERSION_PATTERN => qr/^ftp\:\/\/ftp\.ensembl(?:genomes)?\.org\/pub\/release\-(\d+?)\//;
 
+my $tmpDir = tempdir("VagrentEnsemblRefFileGenXXXXX", TMPDIR => 1, CLEANUP => 1);
+
 try {
   my $opts = option_builder();
-	my $urls = getFileUrlsForRetrival($opts);
-
-	print "Downloading Files -------- ";
-	my ($codFasta, $ncFasta, $gtf) = downloadFiles($urls);
+  my ($codFasta, $ncFasta, $features,$transList);
+  
+  print "Downloading Files -------- ";
+  if(defined $opts->{'f'}){
+    my $urls = getFileUrlsForRetrival($opts);
+    ($codFasta, $ncFasta, $features) = downloadFiles($tmpDir, $urls);
+    print "Done\n";
+  } else {
+    $codFasta = $opts->{'cdna_fa'};
+    $ncFasta = $opts->{'ncrna_fa'};
+    $features = $opts->{'features'};
+    print "Skipped, files locally supplied\n";
+  }
+  
+  print "Obtaining Filtered Transcript List ----- ";
+  unless(defined $opts->{'trans_list'}){
+    $transList = generateTranscriptListFile($tmpDir,$features, $codFasta, $ncFasta);
+    print "Done\n";
+  } else {
+    $transList = $opts->{'trans_list'};
+    print "Skipped, files locally supplied\n";
+  }
+  
+  print "Building Cache Files ----- ";
+  buildCacheFiles($tmpDir, $opts, $features, $transList, $codFasta, $ncFasta);
   print "Done\n";
-	# make fasta file of selected cDNA sequences
-	print "Building Fasta Files ----- ";
-	my $fasta = generateFilteredFasta($opts, $codFasta);
-	if(defined $opts->{'n'}){
-		generateFilteredFasta($opts, $ncFasta, $fasta, $opts->{'n'});
-	} else {
-		generateFilteredFasta($opts, $ncFasta, $fasta);
-	}
-	print "Done\n";
-	# Create fai index file
-	print "Building Fasta Index ----- ";
-	my $fai = createFaiIndex($fasta);
-	print "Done\n";
-	# Create transcript object cache file
-	print "Building Vagrent Cache --- ";
-	my $cache = generateCacheFile($opts,$fasta,$gtf,$codFasta);
-	print "Done\n";
+    
 } catch {
   croak "An error occurred while building reference support files\:\n\t$_"; # not $@
 };
 
-sub generateCacheFile{
-	my ($opts,$fasta,$gtf,$codFasta) = @_;
-	my $rawCache = createFilePathFromFasta($opts,$codFasta,$CACHE_SUFFIX_RAW);
-	my $cache = createFilePathFromFasta($opts,$codFasta,$CACHE_SUFFIX_GZ);
-	my $cmd = $^X.' '.getCacheFileScript();
-	$cmd .= " -sp ".$opts->{'sp'};
-	$cmd .= " -as ".$opts->{'as'};
-	$cmd .= " -d ".$opts->{'d'};
-	$cmd .= " -c ".$opts->{'c'} if defined $opts->{'c'};
-	$cmd .= " -f $fasta";
-	$cmd .= " -g $gtf";
-	$cmd .= " -o $rawCache";
-
-	system($cmd) == 0 || croak "unable to create cache file: $rawCache";
-	system("sort -k 1,1 -k 2n,2 -k 3n,3 $rawCache | bgzip > $cache") == 0 || croak "unable to sort and zip cache file: $cache";
-	unlink $rawCache;
-	system("tabix -p bed $cache") == 0 || croak "unable to tabix index cache file: $cache";
-	return $cache;
-}
-
-sub createFaiIndex {
-	my $fa = shift;
-	my $cmd = "samtools faidx $fa";
-	system($cmd) == 0 || croak "unable to index $fa: $?";
-	return $fa.'.fai';
+sub buildCacheFiles {
+  my ($tmpDir, $opts, $features, $transList, @seq_files) = @_;
+  my $cmd = $^X.' '.getBuilderScript();
+  foreach my $s(@seq_files){
+    $cmd .= " -f $s";
+  }
+  $cmd .= ' -o '.$opts->{'o'};
+  $cmd .= " -gf $features";
+  $cmd .= " -t $transList";
+  $cmd .= ' -sp '.$opts->{'sp'};
+  $cmd .= ' -as '.$opts->{'as'};
+  $cmd .= ' -d '.$opts->{'d'};
+  $cmd .= ' -fai '.$opts->{'fai'} if defined $opts->{'fai'};
+  my ($stdout,$stderr,$exit) = capture {system($cmd)};
+  croak('Unable to generate cache files list:- '.$stderr) if $exit;
+  return;
 }
 
-sub generateFilteredFasta {
-	my ($opts, $inFa, $outFa, $statusLookupFile) = @_;
-	my $app = 0;
-	if(defined $outFa){
-		$app = 1;
-	} else {
-		$outFa = createFilePathFromFasta($opts,$inFa,$FILTERED_FASTA_SUFFIX);
-	}
-	my $cmd = $^X.' '.getFastaFilterScript();
-	$cmd .= " -f $inFa";
-	$cmd .= " -o $outFa";
-	$cmd .= ' -b '.join(' -b ',@TRANSCRIPT_BIOTYPES);
-	$cmd .= ' -a' if $app;
-	$cmd .= " -s $statusLookupFile" if defined $statusLookupFile;
-	system($cmd) == 0 || croak "unable to filter $inFa: $?";
-	return $outFa;
+sub getBuilderScript {
+  my $progPath = abs_path($0);
+	my ($vol,$dirs,$file) = File::Spec->splitpath($progPath);
+  return "$dirs".$BUILDER_SCRIPT;
 }
 
-sub createFilePathFromFasta {
-	my ($opts, $inFa, $suffix) = @_;
-	my ($vol,$dirs,$file) = File::Spec->splitpath($inFa);
-	my $outFa;
-  if($file =~ m/$ENSEMBL_SPECIES_ASSEMBLY/){
-    $outFa = File::Spec->catfile($opts->{'o'},"$1.$2.".$opts->{'e_version'}.".$suffix");
-  } else {
-    croak("unable to match species and assembly from file name: $inFa");
+sub generateTranscriptListFile {
+  my ($tmpDir,$features, @seq_files) = @_;
+  my $cmd = $^X.' '.getFilterScript();
+  my $transListFile = makeTmpFilePath($tmpDir,'.translist');
+  foreach my $s(@seq_files){
+    $cmd .= " -f $s";
   }
-	return $outFa;
+  $cmd .= " -o $transListFile";
+  $cmd .= ' -b '.join(' -b ',@TRANSCRIPT_BIOTYPES);
+  my ($stdout,$stderr,$exit) = capture {system($cmd)};
+  croak('Unable to generate transcript list:- '.$stderr) if $exit;
+  return $transListFile;
 }
 
-sub downloadFiles {
-	my $urls = shift;
-	my @out;
-	my $tmpDir = tempdir("VagrentEnsemblRefFileGenXXXXX", TMPDIR => 1, CLEANUP => 1);
-	foreach my $url(@$urls){
-		my $file = $tmpDir.'/'.(split /\//, $url)[-1];
-		push @out, $file;
-		my $rc = getstore($url, $file);
-    croak "An error occured when retrieving $url\n" if(is_error($rc));
-	}
-	return @out;
+sub getFilterScript {
+  my $progPath = abs_path($0);
+	my ($vol,$dirs,$file) = File::Spec->splitpath($progPath);
+  return "$dirs".$TRANSCRIPT_FILTER_SCRIPT;
 }
 
 sub getFileUrlsForRetrival {
 	my $opts = shift;
 	my @out;
-	# ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/cdna
-	# ftp://ftp.ensembl.org/pub/release-74/fasta/homo_sapiens/ncrna
-	# ftp://ftp.ensembl.org/pub/release-74/gtf/homo_sapiens
+	# ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/cdna
+	# ftp://ftp.ensembl.org/pub/release-90/fasta/homo_sapiens/ncrna
+	# ftp://ftp.ensembl.org/pub/release-90/gff3/homo_sapiens
+	
+	# ftp://ftp.ensembl.org/pub/release-70/fasta/homo_sapiens/cdna
+	# ftp://ftp.ensembl.org/pub/release-70/fasta/homo_sapiens/ncrna
+	# ftp://ftp.ensembl.org/pub/release-70/gtf/homo_sapiens
 
 	# ftp://ftp.ensembl.org/pub/release-74/fasta/danio_rerio/cdna
 	# ftp://ftp.ensembl.org/pub/release-74/fasta/danio_rerio/ncrna
 	# ftp://ftp.ensembl.org/pub/release-74/gtf/danio_rerio
 
+  $opts->{'f'} =~ s|/$||; # clean training '/' off url
 	my $ncFaDir = $opts->{'f'};
 	$ncFaDir =~ s|/cdna$|/ncrna|;
+	my $gffDir = $opts->{'f'};
+	$gffDir =~ s|/cdna$||;
+	$gffDir =~ s|/fasta/|/gff3/|;
 	my $gtfDir = $opts->{'f'};
 	$gtfDir =~ s|/cdna$||;
 	$gtfDir =~ s|/fasta/|/gtf/|;
-	foreach my $dir($opts->{'f'}, $ncFaDir, $gtfDir) {
+	my $ftp = undef; 
+	foreach my $dir($opts->{'f'}, $ncFaDir, $gffDir, $gtfDir) {
 		my @comps = split /\/+/, $dir;
   	my $root = shift @comps; # remove ftp:
   	my $host = shift @comps;
   	my $path = join '/', @comps;
-		my $ftp = Net::FTP->new($host, Debug => 0) or croak "Failed to connect (ftp) to $host\n\t$EVAL_ERROR";
-	  $ftp->login or croak "Failed to login (ftp) to $host\n\t", $ftp->message;
-  	my @files = $ftp->ls($path) or croak "Failed to list directory $path on $host (ftp)\n\t", $ftp->message;
-  	$ftp->quit;
+  	my $have_feature_file = 0;
+  	unless(defined $ftp){
+  	  $ftp = Net::FTP->new($host, Debug => 0) or croak "Failed to connect (ftp) to $host\n\t$EVAL_ERROR";
+	    $ftp->login or croak "Failed to login (ftp) to $host\n\t", $ftp->message;
+  	}
+  	my @files;
+  	unless(@files = $ftp->ls($path)){
+  	  unless($dir eq $gffDir){ # older ensembl releases don't have a gff directory, can be missing.
+  	    croak "Failed to list directory $path on $host (ftp)\n\t", $ftp->message;
+  	  }
+  	}
 		foreach my $file (@files){
 			foreach my $ext (@ENSMBL_REF_FILE_EXTENTIONS){
 				if($file =~ m/$ext$/){
 					push @out, $dir. '/' . (split '/', $file)[-1];
 				}
 			}
-			push @out, $dir. '/' . (split '/', $file)[-1] if($file =~ m/[[:digit:]]\.gtf\.gz$/);
+			if ($dir eq $gffDir && $file =~ m/\.[[:digit:]]+?\.gff3\.gz$/ && $file !~ m/\.[[:digit:]]+?\.(?:chromosome)\..+?\.gff3\.gz$/){
+			  push @out, $dir. '/' . (split '/', $file)[-1];
+			  $have_feature_file = 1;
+			} elsif ($dir eq $gtfDir && $file =~ m/[[:digit:]]\.gtf\.gz$/){
+			  push @out, $dir. '/' . (split '/', $file)[-1]
+			}
 		}
+		last if $have_feature_file;
 	}
+	$ftp->quit;
 	return \@out;
 }
 
-sub getFastaFilterScript {
-	my $progPath = abs_path($0);
-	my ($vol,$dirs,$file) = File::Spec->splitpath($progPath);
-  return "$dirs".$FASTA_FILTER_SCRIPT;
+sub makeTmpFilePath {
+  my ($tmpDir,$ext) = @_;
+  my $file;
+  (undef, $file) = tempfile('VagrentEnsemblRefFileGenXXXXX', DIR => $tmpDir, OPEN => 0, SUFFIX => $ext);
+  return $file;
 }
 
-sub getCacheFileScript {
-	my $progPath = abs_path($0);
-	my ($vol,$dirs,$file) = File::Spec->splitpath($progPath);
-	my $mods = $dirs;
-	$mods =~ s|bin/$|lib|;
-  return "-I $mods $dirs".$GTF_CONVERSION_SCRIPT;
+sub downloadFiles {
+	my ($tmpDir, $urls) = @_;
+	my @out;
+	foreach my $url(@$urls){
+		my $file = $tmpDir.'/'.(split /\//, $url)[-1];
+		push @out, $file;
+		my $rc = getstore($url, $file);
+    croak "An error occured when retrieving $url\n" if(is_error($rc));
+	}
+	return @out;
 }
 
 sub option_builder {
@@ -210,19 +219,39 @@ sub option_builder {
 	my $result = &GetOptions (
 		'h|help' => \$opts{'h'},
 		'f|ftp=s' => \$opts{'f'},
+		'gf|features=s' => \$opts{'features'},
+		'cf|cdna_fa=s' => \$opts{'cdna_fa'},
+		'nf|ncrna_fa=s' => \$opts{'ncrna_fa'},
 		'o|output=s' => \$opts{'o'},
-		'n|ncstatus=s' => \$opts{'n'},
+		'tl|trans_list=s' => \$opts{'trans_list'},
 		'sp|species=s' => \$opts{'sp'},
 		'as|assembly=s' => \$opts{'as'},
 		'd|database=s' => \$opts{'d'},
 		'c|ccds=s' => \$opts{'c'},
+		'fai|fai=s' => \$opts{'fai'},
   );
   pod2usage() if($opts{'h'});
   pod2usage('Must specify the output directory to use') unless(defined $opts{'o'});
   pod2usage("Output directory must exist and be writable: $opts{o}") unless(-e $opts{'o'} && -d $opts{'o'} && -w $opts{'o'});
-  pod2usage('Must specify the cDNA path for the ensembl reference data') unless defined $opts{'f'} && $opts{'f'} =~ m|cdna/?$|;
-  if(defined($opts{'n'})){
-  	pod2usage('Unable to read the non-coding transcript status file') unless -e $opts{'n'} && -r $opts{'n'};
+  
+  if(defined $opts{'f'}){
+    if(defined $opts{'features'} || defined $opts{'cdna_fa'} || defined $opts{'ncrna_fa'}){
+      pod2usage('Please only define the remote URL or the local files, not both');
+    } else {
+      pod2usage('Must specify the cDNA URL for the ensembl reference data') unless($opts{'f'} =~ m|cdna/?$|);
+    }
+  } elsif (defined $opts{'cdna_fa'} && defined $opts{'ncrna_fa'} && defined $opts{'features'}){
+    if(defined $opts{'f'}){
+      pod2usage('Please only define the remote URL or the local files, not both');
+    } else {
+      pod2usage('Please specify a valid genomic feature file (-gff)') unless -e $opts{'features'} && -r $opts{'features'};
+      pod2usage('Please specify a valid coding cDNA sequence file (-cf)') unless -e $opts{'cdna_fa'} && -r $opts{'cdna_fa'};
+      pod2usage('Please specify a valid non-coding cDNA sequence file (-nf)') unless -e $opts{'ncrna_fa'} && -r $opts{'ncrna_fa'};
+    }
+  
+  }
+  if(defined($opts{'fai'})){
+  	pod2usage('Unable to read the fai file: '.$opts{'fai'}) unless -e $opts{'fai'} && -r $opts{'fai'};
   }
   pod2usage('Must specify the species') unless defined $opts{'sp'};
   pod2usage('Must specify the genome version') unless defined $opts{'as'};
@@ -230,13 +259,9 @@ sub option_builder {
   if(defined($opts{'c'})){
   	pod2usage('CCDS file unreadable') unless -e $opts{'c'} && -r $opts{'c'};
   }
-  $opts{'f'} =~ s|/$||;
-  if($opts{'f'} =~ m/$ENSEMBL_VERSION_PATTERN/){
-    $opts{'e_version'} = $1;
-  } else {
-    pod2usage('Unable to parse Ensembl version from URL');
+  if(defined($opts{'trans_list'})){
+  	pod2usage('Transcript file unreadable') unless -e $opts{'trans_list'} && -r $opts{'trans_list'};
   }
-
   return \%opts;
 }
 
@@ -248,17 +273,11 @@ =head1 NAME
 
 =head1 SYNOPSIS
 
-Admin_EnsemblReferenceFileGenerator.pl [-h] [-s human] [-v GRCh37] [-d homo_sapiens_core_74_37p] [-f <ftp://ftp.ensembl.org/pub/release-XX/fasta/XXX_XXX/cdna/>] [-c /path/to/CCDS2Sequence.version.txt] [-o /path/to/output/directory]
-
-  General Options:
-
-    --help         (-h)     Brief documentation
-
-    --ftp          (-f)     Ensembl ftp directory containing the cDNA fasta sequence files
+Admin_EnsemblReferenceFileGenerator.pl [-h] [-sp Human] [-as GRCh37] [-d homo_sapiens_core_74_37p] [-f <ftp://ftp.ensembl.org/pub/release-XX/fasta/XXX_XXX/cdna/>] [-o /path/to/output/directory]
 
-    --output       (-o)     Output directory
+  Required Options:
 
-    --ncstatus     (-n)     Optional, path to a lookup file for the status of non-coding transcripts
+    --output       (-o)     Output directory    
 
     --species      (-sp)    Species (ie human, mouse)
 
@@ -266,6 +285,27 @@ =head1 SYNOPSIS
 
     --database     (-d)     Ensembl core database version number (ie homo_sapiens_core_74_37p)
 
-    --ccds         (-c)     (Optional, but strongly advised) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS
+  Dynamic Download:
+  
+    --ftp          (-f)     Ensembl ftp directory containing the cDNA fasta sequence files
+
+  Or Local Files:
+  
+    --features     (-gf)    gff3 or gtf file containing transcript and gene information
+  
+    --cdna_fa      (-cf)    Fasta file containing protein coding cdna sequences
+  
+    --ncrna_fa     (-nf)    Fasta file containing non-coding cdna sequences 
 
+  Optional:
+  
+    --help         (-h)     Brief documentation
+        
+    --ccds         (-c)     (Recommended) The CCDS2Sequence file from the relevant CCDS release, see http://www.ncbi.nlm.nih.gov/CCDS
+    
+    --fai          (-fai)   (Recommended) The samtools fasts index file (.fai) for your reference genome
+                              This is the reference genome that your bam and vcf files will be mapped to
+    
+    --trans_list   (-tl)    List of preprepared transcript accessions, only these accesions will be included in the reference output
+    
 =cut
diff --git a/bin/Admin_EnsemblTranscriptFastaFilter.pl b/bin/Admin_EnsemblTranscriptFastaFilter.pl
deleted file mode 100755
index a193db9..0000000
--- a/bin/Admin_EnsemblTranscriptFastaFilter.pl
+++ /dev/null
@@ -1,204 +0,0 @@
-#!/usr/bin/perl
-
-##########LICENCE##########
-# Copyright (c) 2014 Genome Research Ltd.
-#
-# Author: Cancer Genome Project cgpit@sanger.ac.uk
-#
-# This file is part of VAGrENT.
-#
-# VAGrENT is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Affero General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option) any
-# later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-##########LICENCE##########
-
-
-use strict;
-use English qw(-no_match_vars);
-use warnings FATAL => 'all';
-use Carp;
-
-use Getopt::Long;
-use Pod::Usage;
-use Try::Tiny;
-
-use FindBin qw($Bin);
-use lib "$Bin/../lib";
-
-use File::Type;
-use Const::Fast qw(const);
-
-use Data::Dumper;
-use Bio::SeqIO;
-
-const my @TEXT_TYPES => qw(application/octet-stream);
-
-const my @ZIP_TYPES => qw(application/x-gzip);
-
-try {
-	my $opts = option_builder();
-	my $trans = getInterestingTranscriptsFromFile($opts);
-	generateFilteredFasta($opts,$trans);
-} catch {
-  warn "An error occurred while building reference support files\:\n\t$_"; # not $@
-};
-
-sub generateFilteredFasta {
-	my ($opts,$trans) = @_;
-	my $type = getInputFileType($opts->{'f'});
-	my ($inFh,$outFh);
-
-	if($type eq 'text') {
-		open $inFh, '<'.$opts->{'f'} || croak("unable to open sequence input file:".$opts->{'f'});
-	} elsif($type eq 'zip'){
-		open $inFh, 'zcat '.$opts->{'f'}.' |' || croak("unable to open sequence input file:".$opts->{'f'});
-	}
-
-	if(defined $opts->{'a'}){
-		open $outFh, '>>'.$opts->{'o'} || croak("unable to open sequence ouput file:".$opts->{'f'});
-	} else {
-		open $outFh, '>'.$opts->{'o'} || croak("unable to open sequence ouput file:".$opts->{'f'});
-	}
-
-	my $keep = 0;
-
-	while (<$inFh>){
-		if(m/^>/){
-			my ($key) = split /\s/;
-			$key =~ s/^>//;
-			if(exists $trans->{$key}){
-				$keep = 1;
-				print $outFh ">$key\n";
-			} else {
-				$keep = 0;
-			}
-		} elsif($keep){
-			my $line = $_;
-			$line =~ s/\s//g;
-			print $outFh "$line\n";
-		}
-	}
-	close $outFh;
-	close $inFh;
-}
-
-sub getInterestingTranscriptsFromFile {
-	my $opts = shift;
-	my $type = getInputFileType($opts->{'f'});
-	my $out;
-	my $status = undef;
-	my $cmd;
-	if($type eq 'text'){
-		$cmd = "grep";
-	} elsif($type eq 'zip'){
-		$cmd = "zgrep";
-	} else {
-		croak "file is of an unrecognised format ($type): ".$opts->{'f'};
-	}
-
-	$cmd .= " '>' ". $opts->{'f'};
-	$cmd .= ' | grep ';
-	foreach my $b(@{$opts->{'b'}}){
-		$cmd .= " -e transcript_biotype:$b";
-	}
-
-	if(defined $opts->{'s'}){
-		open my $statfh, '<'.$opts->{'s'} || croak("unable to open status lookup file:".$opts->{'s'});
-		while(<$statfh>){
-			my @st = split /\s+/;
-			if(uc($st[1]) eq 'KNOWN'){
-				$status->{$st[0]}++;
-			}
-		}
-	} else {
-		$cmd .= ' | grep known ';
-	}
-
-	open my $fh, "$cmd |" || croak("unable to grep through fasta file with command: $cmd");
-	if(defined $opts->{'s'}) {
-		while (<$fh>){
-			my ($key) = split /\s/;
-			$key =~ s/^>//;
-			$out->{$key}++ if(exists $status->{$key});
-		}
-	} else {
-		while (<$fh>){
-			my ($key) = split /\s/;
-			$key =~ s/^>//;
-			$out->{$key}++;
-		}
-	}
-	close $fh;
-	return $out;
-}
-
-sub getInputFileType {
-	my $infile = shift;
-	my $fileType = File::Type->new->checktype_filename($infile);
-	foreach my $t (@TEXT_TYPES){
-		return 'text' if($t eq $fileType);
-	}
-	foreach my $t (@ZIP_TYPES) {
-		return 'zip' if($t eq $fileType);
-	}
-	return $fileType;
-}
-
-sub option_builder {
-	my ($factory) = @_;
-
-	my %opts = ();
-
-	my $result = &GetOptions (
-		'h|help' => \$opts{'h'},
-		'f|fasta=s' => \$opts{'f'},
-		'o|outfile=s' => \$opts{'o'},
-		'a|append' => \$opts{'a'},
-		'b|biotypes=s@' => \$opts{'b'},
-		's|status=s' => \$opts{'s'},
-  );
-
-  pod2usage() if($opts{'h'});
-  pod2usage('Must specify the input fasta ensembl reference file') unless defined $opts{'f'} && -e $opts{'f'};
-  pod2usage('Must specify the output file to use') unless defined $opts{'o'};
-  pod2usage('Must specify at least one biotype') unless scalar @{$opts{'b'}} > 0 && defined($opts{'b'}->[0]);
-  if(defined($opts{'s'})){
-  	pod2usage('Unable to read the transcript status file') unless -e $opts{'s'} && -r $opts{'s'};
-  }
-  return \%opts;
-}
-
-__END__
-
-=head1 NAME
-
-Admin_EnsemblTranscriptFastaFilter.pl - Generates the Vagrent reference fasta file from an Ensembl one
-
-=head1 SYNOPSIS
-
-Admin_EnsemblTranscriptFastaFilter.pl [-h] [-f /path/to/ensembl.fa] [-o /path/to/output.fa] [-b biotype] [-s /path/to/status/lookup.list][-a]
-
-  General Options:
-
-    --help         (-h)     Brief documentation
-
-    --fasta        (-f)     Ensembl fasta file
-
-    --output       (-o)     Output file
-
-    --biotypes     (-b)     Ensembl transcript biotypes to filter for, supports multiple instances
-
-    --status       (-s)     Optional, Transcript Status look up file, simple transcript id - status list, space separated, one entry per line
-
-    --append       (-a)     Append to existing output file
-
-=cut
diff --git a/bin/Admin_EnsemblTranscriptFilter.pl b/bin/Admin_EnsemblTranscriptFilter.pl
new file mode 100644
index 0000000..da15c27
--- /dev/null
+++ b/bin/Admin_EnsemblTranscriptFilter.pl
@@ -0,0 +1,166 @@
+#!/usr/bin/perl
+
+##########LICENCE##########
+# Copyright (c) 2018 Genome Research Ltd.
+#
+# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk
+#
+# This file is part of VAGrENT.
+#
+# VAGrENT is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Affero General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+##########LICENCE##########
+
+
+use strict;
+use English qw(-no_match_vars);
+use warnings FATAL => 'all';
+use Carp;
+
+use Getopt::Long;
+use Pod::Usage;
+use Try::Tiny;
+use Capture::Tiny qw(capture);
+use FindBin qw($Bin);
+use lib "$Bin/../lib";
+
+use File::Type;
+use Const::Fast qw(const);
+
+use Data::Dumper;
+
+const my $FASTA_HEADER_GREP => 'zgrep "^>" %s';
+
+try {
+	my $opts = option_builder();
+	my $transcripts = getTranscripts($opts);
+	writeOutput($opts,$transcripts);
+} catch {
+  warn "An error occurred while building reference support files\:\n\t$_"; # not $@
+};
+
+sub writeOutput {
+  my ($opts,$trans) = @_;
+  my $fh;
+  if(defined $opts->{'o'}){
+    open($fh,'>'.$opts->{'o'}) or die("Unable to open output file for writing: ".$opts->{'o'});
+  } else {
+    $fh = *STDOUT;
+  }
+  foreach my $t(@$trans){
+    print $fh $t,"\n";
+  }
+  if(defined $opts->{'o'}){
+    close $fh;
+  }
+}
+
+sub getTranscripts {
+  my $opts = shift;
+  my $good_transcripts;
+  foreach my $seq_file (@{$opts->{'f'}}){
+    my $cmd = sprintf $FASTA_HEADER_GREP, $seq_file;
+    my ($stdout, $stderr, $exit) = capture {
+      system($cmd);
+    };
+    if($exit > 0){
+      croak($stderr);
+    }
+    foreach my $rec (split /\n/,$stdout){
+      my ($transcript,$biotype) = parseLine($rec);
+      next unless hasGoodBiotype($opts,$biotype);
+      # Have to store the transcript accessions twice, with and without the version.
+      # Depending on the Ensembl version or species, the accession in the fasta file may 
+      # or may not have the transcript version on the end.
+      # Some species gff/gtf files include the the version in the accession, some don't
+      # Ensembl versions with imported gene build will preserve the originators naming conventions
+      push(@$good_transcripts,$transcript);
+      $transcript =~ s/\.\d+?$//;
+      push(@$good_transcripts,$transcript);      
+    }
+  }
+  return $good_transcripts;
+}
+
+sub hasGoodBiotype {
+  my ($opts,$biotype) = @_;
+  foreach my $type (@{$opts->{'b'}}){
+    return 1 if lc($biotype) eq lc($type);
+  }
+  return 0;
+}
+
+sub parseLine {
+  my $line = shift;
+  my @cols = split /\s/, $line;
+  my $full = join('|',@cols);
+  my $transcript = shift @cols; 
+  $transcript =~ s/^>//;
+  my $biotype = undef;
+  foreach my $c (@cols){
+    if($c =~ m/^transcript_biotype/){
+      (undef,$biotype) = split(':',$c);
+    }
+  }
+  return ($transcript,$biotype);
+}
+
+sub option_builder {
+	my ($factory) = @_;
+
+	my %opts = ();
+
+	my $result = &GetOptions (
+		'h|help' => \$opts{'h'},
+		'f|fasta=s@' => \$opts{'f'},
+		'o|outfile=s' => \$opts{'o'},
+		'b|biotypes=s@' => \$opts{'b'},
+  );
+  pod2usage() if($opts{'h'});
+  pod2usage('Output file must not already exist') if defined $opts{'o'} && -e $opts{'o'};
+  if(scalar(@{$opts{'f'}}) > 0){
+    foreach my $f(@{$opts{'f'}}){
+      pod2usage("Unable to read the transcript sequence file: $f") unless -e $f && -r $f;
+    }
+  } else {
+    pod2usage('Must specify one or more fasta sequence file as input');
+  }
+  pod2usage('Must specify at least one biotype') unless scalar @{$opts{'b'}} > 0 && defined($opts{'b'}->[0]);
+  return \%opts;
+}
+
+__END__
+
+=head1 NAME
+
+Admin_EnsemblTranscriptFilter.pl - Generates filtered list of Transcript accessions
+
+=head1 SYNOPSIS
+
+Admin_EnsemblTranscriptFilter.pl [-h] [-f /path/to/ensembl.fa] [-o /path/to/output.list] [-b biotype]
+
+  Required Options: 
+    
+    --fasta        (-f)     Ensembl fasta file from Ensembl's FTP site, can be specified multiple times.
+
+    --biotypes     (-b)     Ensembl transcript biotypes to filter for, can be specified multiple times.
+
+  General Options:
+
+    --help         (-h)     Brief documentation.
+
+    --output       (-o)     Output file, optional will default to stdout.
+
+=cut
+
+
diff --git a/docs.tar.gz b/docs.tar.gz
index b271ed6..070dec1 100644
Binary files a/docs.tar.gz and b/docs.tar.gz differ
diff --git a/lib/Sanger/CGP/Vagrent.pm b/lib/Sanger/CGP/Vagrent.pm
index 2bf1a5f..8d2bb22 100644
--- a/lib/Sanger/CGP/Vagrent.pm
+++ b/lib/Sanger/CGP/Vagrent.pm
@@ -1,9 +1,9 @@
 package Sanger::CGP::Vagrent;
 
 ##########LICENCE##########
-# Copyright (c) 2014-2017 Genome Research Ltd.
+# Copyright (c) 2014-2018 Genome Research Ltd.
 #
-# Author: Cancer Genome Project cgpit@sanger.ac.uk
+# Author: CASM/Cancer IT - cgphelp@sanger.ac.uk
 #
 # This file is part of VAGrENT.
 #
@@ -26,7 +26,7 @@ use strict;
 use Const::Fast qw(const);
 
 use base 'Exporter';
-our $VERSION = '3.2.3';
+our $VERSION = '3.3.0';
 our @EXPORT = qw($VERSION);
 
 1;