From b38dd211960c32d1a6b00777ca98f508a44357e4 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 4 Mar 2020 12:42:52 -0500 Subject: [PATCH 1/2] Allow utf8 filenames --- src/perl5/Bio/JBrowse/Cmd/IndexNames.pm | 16 ++-- .../Bio/JBrowse/FeatureStream/Genbank.pm | 85 +++++++++---------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm b/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm index eec797b131..d0f7be3067 100644 --- a/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm +++ b/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm @@ -18,6 +18,8 @@ use Storable (); use File::Path (); use File::Temp (); use List::Util (); +use Data::Dumper; +use Encode qw(decode encode); use GenomeDB (); use Bio::GFF3::LowLevel qw/gff3_parse_feature/; @@ -78,6 +80,7 @@ sub run { unless( @$refSeqs ) { die "No reference sequences defined in configuration, nothing to do.\n"; } + print Dumper($gdb->trackList); my @tracks = grep $self->track_is_included( $_->{label} ), @{ $gdb->trackList || [] }; unless( @tracks ) { @@ -195,8 +198,8 @@ sub _mergeIndexEntries { sub make_file_record { my ( $self, $track, $file ) = @_; - -f $file or die "$file not found\n"; - -r $file or die "$file not readable\n"; + -f decode('UTF-8',$file) or die "$file not found\n"; + -r decode('UTF-8',$file) or die "$file not readable\n"; my $gzipped = $file =~ /\.(txt|json|g)z(\.\d+)?$/; my $type = $file =~ /\.txtz?$/ ? 'txt' : $file =~ /\.jsonz?$/ ? 'json' : @@ -364,7 +367,8 @@ sub find_names_files { # read either names.txt or names.json files my $name_records_iterator; my $names_txt = File::Spec->catfile( $dir, 'names.txt' ); - if( -f $names_txt ) { + print "$names_txt\n"; + if( -e decode('UTF-8',$names_txt) ) { push @files, $self->make_file_record( $track, $names_txt ); } else { @@ -436,8 +440,8 @@ sub make_operation_stream { #print "sizes: $self->{stats}{total_namerec_bytes}, buffered: $namerecs_buffered, b/rec: ".$total_namerec_sizes/$namerecs_buffered."\n"; $self->{stats}{avg_record_text_bytes} = $self->{stats}{total_namerec_bytes}/($self->{stats}{namerecs_buffered}||1); $self->{stats}{total_input_bytes} = List::Util::sum( - map { my $s = -s $_->{fullpath}; - $s *= 8 if $_->{fullpath} =~ /\.(g|txt|json)z$/; + map { my $s = -s decode('UTF-8',$_->{fullpath}); + $s *= 8 if decode('UTF-8',$_->{fullpath}) =~ /\.(g|txt|json)z$/; $s; } @$names_files ) || 0; $self->{stats}{record_stream_estimated_count} = int( $self->{stats}{total_input_bytes} / ($self->{stats}{avg_record_text_bytes}||1));; @@ -710,7 +714,7 @@ sub open_names_file { } } else { - open my $fh, '<', $infile or die "$! reading $infile"; + open my $fh, '<', decode('UTF-8',$infile) or die "$! reading $infile"; return $fh; } } diff --git a/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm b/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm index 0fbe3552af..1e0b77d078 100644 --- a/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm +++ b/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm @@ -10,6 +10,7 @@ use strict; use warnings; use base 'Bio::JBrowse::FeatureStream'; +use Data::Dumper; use Bio::JBrowse::FeatureStream::Genbank::LocationParser; @@ -31,11 +32,12 @@ sub _aggregate_features_from_gbk_record { # get index of top level feature ('mRNA' at current writing) my $indexTopLevel; my $count = 0; + print Dumper $record; foreach my $feat ( @{$record->{FEATURES}} ){ - if ( _isTopLevel( $feat ) ){ - $indexTopLevel = $count; - } - $count++; + if ( _isTopLevel( $feat ) ){ + $indexTopLevel = $count; + } + $count++; } return unless defined $indexTopLevel; @@ -54,55 +56,52 @@ sub _aggregate_features_from_gbk_record { delete $f->{SEQUENCE}; $f->{end} = $locations[-1]{end}; - #for my $f ( @features ) { - $f->{start} += $offset + 1; - $f->{end} += $offset; - $f->{strand} = 1 unless defined $f->{strand}; - $f->{type} = $record->{FEATURES}[$indexTopLevel]{name}; - $f->{seq_id} ||= $seq_id; - - %$f = ( %{$record->{FEATURES}[$indexTopLevel]{feature} || {}}, %$f ); # get other attrs - if( $f->{type} eq 'mRNA' ) { - $f->{name} = $record->{FEATURES}[$indexTopLevel]{feature}{gene}; - $f->{description} = $record->{FEATURES}[$indexTopLevel]{feature}{product} || $f->{FEATURES}[$indexTopLevel]{feature}{note}; - } + $f->{type} = $record->{FEATURES}[$indexTopLevel]{name}; + $f->{seq_id} ||= $seq_id; + + %$f = ( %{$record->{FEATURES}[$indexTopLevel]{feature} || {}}, %$f ); # get other attrs + if( $f->{type} eq 'gene' ) { + print "here2\n"; + $f->{name} = $record->{FEATURES}[$indexTopLevel]{feature}{gene}; + $f->{description} = $record->{FEATURES}[$indexTopLevel]{feature}{product} || $f->{FEATURES}[$indexTopLevel]{feature}{note}; + } - # convert FEATURES to subfeatures - $f->{subfeatures} = []; - if ( scalar( @{$record->{FEATURES} || [] }) > $indexTopLevel ) { - for my $i ( $indexTopLevel + 1 .. $#{$record->{FEATURES}} ) { - my $feature = $record->{FEATURES}[$i]; - my @sublocations = _parseLocation( $feature->{location} ); - for my $subloc ( @sublocations ) { - $subloc->{start} += $offset + 1; - $subloc->{end} += $offset; - - my $newFeature = { - %{ $feature->{feature}||{} }, - %$subloc, - type => $feature->{name} - }; - - $newFeature->{seq_id} ||= $seq_id; - - push @{$f->{subfeatures}}, $newFeature; - } + # convert FEATURES to subfeatures + $f->{subfeatures} = []; + if ( scalar( @{$record->{FEATURES} || [] }) > $indexTopLevel ) { + for my $i ( $indexTopLevel + 1 .. $#{$record->{FEATURES}} ) { + my $feature = $record->{FEATURES}[$i]; + my @sublocations = _parseLocation( $feature->{location} ); + for my $subloc ( @sublocations ) { + $subloc->{start} += $offset + 1; + $subloc->{end} += $offset; + + my $newFeature = { + %{ $feature->{feature}||{} }, + %$subloc, + type => $feature->{name} + }; + + $newFeature->{seq_id} ||= $seq_id; + + push @{$f->{subfeatures}}, $newFeature; } } -# } + } return $f; } sub _isTopLevel { my $feat = shift; - my @topLevelFeatures = qw( mRNA ); # add more as needed? + my @topLevelFeatures = qw( gene ); # add more as needed? my $isTopLevel = 0; foreach my $thisTopFeat ( @topLevelFeatures ){ - if ( $feat->{'name'} =~ m/$thisTopFeat/ ){ - $isTopLevel = 1; - last; - } + if ( $feat->{'name'} =~ m/$thisTopFeat/ ){ + print "here\n"; + $isTopLevel = 1; + last; + } } return $isTopLevel; } @@ -115,7 +114,7 @@ sub _getRegionOffset { my $f = shift; my $offset = 0; - if ( grep {$_ =~ /REGION\:/} @{$f->{'VERSION'}} ){ # this is a region file + if ( grep {$_ =~ /REGION\:/} @{$f->{'VERSION'}} ){ # this is a region file # get array item after REGION token my $count = 0; my $regionIndexInArray; From 7778cca1e7632a01497a3073c346acb650f48d8d Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 4 Mar 2020 12:50:07 -0500 Subject: [PATCH 2/2] Remove some print commands --- src/perl5/Bio/JBrowse/Cmd/IndexNames.pm | 3 - .../Bio/JBrowse/FeatureStream/Genbank.pm | 85 ++++++++++--------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm b/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm index d0f7be3067..0296ec882f 100644 --- a/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm +++ b/src/perl5/Bio/JBrowse/Cmd/IndexNames.pm @@ -18,7 +18,6 @@ use Storable (); use File::Path (); use File::Temp (); use List::Util (); -use Data::Dumper; use Encode qw(decode encode); use GenomeDB (); @@ -80,7 +79,6 @@ sub run { unless( @$refSeqs ) { die "No reference sequences defined in configuration, nothing to do.\n"; } - print Dumper($gdb->trackList); my @tracks = grep $self->track_is_included( $_->{label} ), @{ $gdb->trackList || [] }; unless( @tracks ) { @@ -367,7 +365,6 @@ sub find_names_files { # read either names.txt or names.json files my $name_records_iterator; my $names_txt = File::Spec->catfile( $dir, 'names.txt' ); - print "$names_txt\n"; if( -e decode('UTF-8',$names_txt) ) { push @files, $self->make_file_record( $track, $names_txt ); } diff --git a/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm b/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm index 1e0b77d078..0fbe3552af 100644 --- a/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm +++ b/src/perl5/Bio/JBrowse/FeatureStream/Genbank.pm @@ -10,7 +10,6 @@ use strict; use warnings; use base 'Bio::JBrowse::FeatureStream'; -use Data::Dumper; use Bio::JBrowse::FeatureStream::Genbank::LocationParser; @@ -32,12 +31,11 @@ sub _aggregate_features_from_gbk_record { # get index of top level feature ('mRNA' at current writing) my $indexTopLevel; my $count = 0; - print Dumper $record; foreach my $feat ( @{$record->{FEATURES}} ){ - if ( _isTopLevel( $feat ) ){ - $indexTopLevel = $count; - } - $count++; + if ( _isTopLevel( $feat ) ){ + $indexTopLevel = $count; + } + $count++; } return unless defined $indexTopLevel; @@ -56,52 +54,55 @@ sub _aggregate_features_from_gbk_record { delete $f->{SEQUENCE}; $f->{end} = $locations[-1]{end}; - $f->{type} = $record->{FEATURES}[$indexTopLevel]{name}; - $f->{seq_id} ||= $seq_id; - - %$f = ( %{$record->{FEATURES}[$indexTopLevel]{feature} || {}}, %$f ); # get other attrs - if( $f->{type} eq 'gene' ) { - print "here2\n"; - $f->{name} = $record->{FEATURES}[$indexTopLevel]{feature}{gene}; - $f->{description} = $record->{FEATURES}[$indexTopLevel]{feature}{product} || $f->{FEATURES}[$indexTopLevel]{feature}{note}; - } + #for my $f ( @features ) { + $f->{start} += $offset + 1; + $f->{end} += $offset; + $f->{strand} = 1 unless defined $f->{strand}; + $f->{type} = $record->{FEATURES}[$indexTopLevel]{name}; + $f->{seq_id} ||= $seq_id; + + %$f = ( %{$record->{FEATURES}[$indexTopLevel]{feature} || {}}, %$f ); # get other attrs + if( $f->{type} eq 'mRNA' ) { + $f->{name} = $record->{FEATURES}[$indexTopLevel]{feature}{gene}; + $f->{description} = $record->{FEATURES}[$indexTopLevel]{feature}{product} || $f->{FEATURES}[$indexTopLevel]{feature}{note}; + } - # convert FEATURES to subfeatures - $f->{subfeatures} = []; - if ( scalar( @{$record->{FEATURES} || [] }) > $indexTopLevel ) { - for my $i ( $indexTopLevel + 1 .. $#{$record->{FEATURES}} ) { - my $feature = $record->{FEATURES}[$i]; - my @sublocations = _parseLocation( $feature->{location} ); - for my $subloc ( @sublocations ) { - $subloc->{start} += $offset + 1; - $subloc->{end} += $offset; - - my $newFeature = { - %{ $feature->{feature}||{} }, - %$subloc, - type => $feature->{name} - }; - - $newFeature->{seq_id} ||= $seq_id; - - push @{$f->{subfeatures}}, $newFeature; + # convert FEATURES to subfeatures + $f->{subfeatures} = []; + if ( scalar( @{$record->{FEATURES} || [] }) > $indexTopLevel ) { + for my $i ( $indexTopLevel + 1 .. $#{$record->{FEATURES}} ) { + my $feature = $record->{FEATURES}[$i]; + my @sublocations = _parseLocation( $feature->{location} ); + for my $subloc ( @sublocations ) { + $subloc->{start} += $offset + 1; + $subloc->{end} += $offset; + + my $newFeature = { + %{ $feature->{feature}||{} }, + %$subloc, + type => $feature->{name} + }; + + $newFeature->{seq_id} ||= $seq_id; + + push @{$f->{subfeatures}}, $newFeature; + } } } - } +# } return $f; } sub _isTopLevel { my $feat = shift; - my @topLevelFeatures = qw( gene ); # add more as needed? + my @topLevelFeatures = qw( mRNA ); # add more as needed? my $isTopLevel = 0; foreach my $thisTopFeat ( @topLevelFeatures ){ - if ( $feat->{'name'} =~ m/$thisTopFeat/ ){ - print "here\n"; - $isTopLevel = 1; - last; - } + if ( $feat->{'name'} =~ m/$thisTopFeat/ ){ + $isTopLevel = 1; + last; + } } return $isTopLevel; } @@ -114,7 +115,7 @@ sub _getRegionOffset { my $f = shift; my $offset = 0; - if ( grep {$_ =~ /REGION\:/} @{$f->{'VERSION'}} ){ # this is a region file + if ( grep {$_ =~ /REGION\:/} @{$f->{'VERSION'}} ){ # this is a region file # get array item after REGION token my $count = 0; my $regionIndexInArray;