From 17cfbb920f818e239dcb5abebccdbae55df481c4 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 31 Oct 2016 16:13:56 +0000 Subject: [PATCH 1/3] Allows use of ensemblgenomes.org as a datasource --- bin/Admin_EnsemblReferenceFileGenerator.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Admin_EnsemblReferenceFileGenerator.pl b/bin/Admin_EnsemblReferenceFileGenerator.pl index d704931..074cba0 100755 --- a/bin/Admin_EnsemblReferenceFileGenerator.pl +++ b/bin/Admin_EnsemblReferenceFileGenerator.pl @@ -53,7 +53,7 @@ const my $CACHE_SUFFIX_RAW => 'vagrent.cache.raw'; const my @TRANSCRIPT_BIOTYPES => qw(protein_coding lincRNA miRNA snoRNA rRNA snRNA); const my $ENSEMBL_SPECIES_ASSEMBLY => qr/([^\.]+?)\.(.+?)\./; -const my $ENSEMBL_VERSION_PATTERN => qr/^ftp\:\/\/ftp\.ensembl\.org\/pub\/release\-(\d+?)\//; +const my $ENSEMBL_VERSION_PATTERN => qr/^ftp\:\/\/ftp\.ensembl(?:genomes)?\.org\/pub\/release\-(\d+?)\//; try { my $opts = option_builder(); From 5556199713e290e68a61e5845eaaaeada8cfe96c Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 31 Oct 2016 16:15:28 +0000 Subject: [PATCH 2/3] Handle genes without names, and give more useful error message --- bin/Admin_EnsemblGtf2CacheConverter.pl | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/bin/Admin_EnsemblGtf2CacheConverter.pl b/bin/Admin_EnsemblGtf2CacheConverter.pl index bc11c66..6f78393 100755 --- a/bin/Admin_EnsemblGtf2CacheConverter.pl +++ b/bin/Admin_EnsemblGtf2CacheConverter.pl @@ -104,6 +104,7 @@ sub convertGtf { if(exists $attr{'transcript_id'} && defined $attr{'transcript_id'}){ $acc = unquoteValue($attr{'transcript_id'}); unless(exists $lookup->{$acc}) { + next unless(exists $attr{'transcript_version'}); $acc .= '.'.unquoteValue($attr{'transcript_version'}); next unless exists $lookup->{$acc}; } @@ -125,7 +126,15 @@ sub convertGtf { $c++; $wip->{$acc}->{'type'} = $bioType; $wip->{$acc}->{'acc'} = $acc; - $wip->{$acc}->{'gene'} = 
unquoteValue($attr{'gene_name'}); + if(exists $attr{'gene_name'}) { + $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_name'}); + } + elsif(exists $attr{'gene_id'}) { + $wip->{$acc}->{'gene'} = unquoteValue($attr{'gene_id'}); + } + else { + croak "Cannot identify gene name or ID for structure: ".Dumper(\%attr); + } $wip->{$acc}->{'CCDS'} = unquoteValue($attr{'ccds_id'}) if exists $attr{'ccds_id'}; } if($lineType eq $CDS_TYPE && !defined $wip->{$acc}->{'protacc'}){ @@ -257,8 +266,14 @@ sub convertTranscript { sub writeTranscript { my ($fh,$t,$rawT) = @_; - print $fh join("\t",$rawT->{'lines'}->{$EXON_TYPE}->[0]->[0],$t->getGenomicMinPos - 1, - $t->getGenomicMaxPos,$t->getAccession,$t->getGeneName,length $t->getcDNASeq); + eval { + print $fh join("\t",$rawT->{'lines'}->{$EXON_TYPE}->[0]->[0],$t->getGenomicMinPos - 1, + $t->getGenomicMaxPos,$t->getAccession,$t->getGeneName,length $t->getcDNASeq); + 1; + }; + if($@) { + die "\nTranscript Object: ".Dumper($t)."\n\nExon_Type layer: ".Dumper($rawT->{'lines'}->{$EXON_TYPE})."\n\nERROR: Abandon hope, Ensemble structure has changed\n"; + } $t->{_cdnaseq} = undef; print $fh "\t",Dumper($t),"\n"; } From 7f92d5248ac03b1f75fb173fc50363dfdf3d929a Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Mon, 31 Oct 2016 16:18:53 +0000 Subject: [PATCH 3/3] Ensure an error code is emitted on failure --- bin/Admin_EnsemblGtf2CacheConverter.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Admin_EnsemblGtf2CacheConverter.pl b/bin/Admin_EnsemblGtf2CacheConverter.pl index 6f78393..83d9e3c 100755 --- a/bin/Admin_EnsemblGtf2CacheConverter.pl +++ b/bin/Admin_EnsemblGtf2CacheConverter.pl @@ -62,7 +62,7 @@ $ccds = parseCCDSFile($opts->{'c'}) if defined $opts->{'c'}; convertGtf($opts,$lookup,$ccds); } catch { - warn "An error occurred while building reference support files\:\n\t$_"; # not $@ + die "An error occurred while building reference support files\:\n\t$_"; # not $@ }; sub convertGtf {