Skip to content

Commit

Permalink
generate file with unique genes per sample
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewjpage committed Aug 9, 2017
1 parent f8ed5c4 commit 805b93e
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 1 deletion.
19 changes: 19 additions & 0 deletions bin/roary-unique_genes_per_sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env perl

package Bio::Roary::Main::UniqueGenesPerSample;

# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
# PODNAME: roary-unique_genes_per_sample

=head1 SYNOPSIS
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
=cut

use Cwd qw(abs_path);
BEGIN { unshift( @INC, abs_path('./lib') ) }
BEGIN { unshift( @INC, abs_path('./t/lib') ) }
use Bio::Roary::CommandLine::UniqueGenesPerSample;

Bio::Roary::CommandLine::UniqueGenesPerSample->new(args => \@ARGV, script_name => $0)->run;
2 changes: 1 addition & 1 deletion dist.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = Bio-Roary
version = 3.8.2
version = 3.9.0
author = Andrew J. Page <[email protected]>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
Expand Down
91 changes: 91 additions & 0 deletions lib/Bio/Roary/CommandLine/UniqueGenesPerSample.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
undef $VERSION;

package Bio::Roary::CommandLine::UniqueGenesPerSample;

# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes

=head1 SYNOPSIS
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
=cut

use Moose;
use Getopt::Long qw(GetOptionsFromArray);
extends 'Bio::Roary::CommandLine::Common';

has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 );
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );

sub BUILD {
my ($self) = @_;

my ( $clustered_proteins, $output_filename, $verbose, $help );

GetOptionsFromArray(
$self->args,
'o|output=s' => \$output_filename,
'c|clustered_proteins=s' => \$clustered_proteins,
'v|verbose' => \$verbose,
'h|help' => \$help,
);

if ( defined($verbose) ) {
$self->verbose($verbose);
$self->logger->level(10000);
}

$self->help($help) if ( defined($help) );
( !$self->help ) or die $self->usage_text;

$self->output_filename($output_filename) if ( defined($output_filename) );
if ( defined($clustered_proteins) && ( -e $clustered_proteins ) ) {
$self->clustered_proteins($clustered_proteins);
}
else {
$self->_error_message("Error: Cant access the clustered proteins file");
}
}

sub run {
my ($self) = @_;

if ( defined( $self->_error_message ) ) {
print $self->_error_message . "\n";
die $self->usage_text;
}

my $obj = Bio::Roary::UniqueGenesPerSample->new(
gff_files => $self->gff_files,
output_filename => $self->output_filename,
groups_filename => $self->groups_filename,
);
$obj->reannotate;

}

sub usage_text {
my ($self) = @_;

return <<USAGE;
Usage: roary-unique_genes_per_sample [options] -c clustered_proteins
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
Options: -o STR output filename [unique_genes_per_sample.tsv]
-c STR clusters filename [clustered_proteins]
-v verbose output to STDOUT
-h this help message
For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}

__PACKAGE__->meta->make_immutable;
no Moose;
1;
80 changes: 80 additions & 0 deletions lib/Bio/Roary/UniqueGenesPerSample.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package Bio::Roary::UniqueGenesPerSample;

# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes

=head1 SYNOPSIS
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
use Bio::Roary::UniqueGenesPerSample;
my $obj = Bio::Roary::SequenceLengths->new(
clustered_proteins => 'clustered_proteins',
output_filename => 'output_filename',
);
$obj->write_unique_frequency;
=cut

use Moose;
use Bio::Roary::Exceptions;

has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' );

has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );

sub _build__output_fh {
my ($self) = @_;
open( my $fh, '>', $self->output_filename )
or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename );
return $fh;
}

#group_17585: 14520_6#21_00645
sub _sample_to_gene_freq {
my ($self) = @_;

open( my $input_fh, $self->clustered_proteins )
or Bio::Roary::Exceptions::FileNotFound->throw( error => "Couldnt read input file:" . $self->clustered_proteins );

my %sample_to_gene_freq;
while (<$input_fh>) {
chomp;
my $line = $_;
next if ( length( $line ) < 6 );
if ( $line =~ /^.+: ([^\s]+)$/ ) {
my $gene_id = $1;
if ( $gene_id =~ /^(.+)_[\d]+$/ ) {
my $sample_name = $1;
$sample_to_gene_freq{$sample_name}++;
}
else {
# gene id may not be valid so ignore
next;
}
}
else {
# its either an invalid line or theres more than 1 gene in the cluster
next;
}
}

return \%sample_to_gene_freq;
}

sub write_unique_frequency {
my ($self) = @_;

my %sample_to_gene_freq = %{$self->_sample_to_gene_freq};

for my $sample ( sort { $sample_to_gene_freq{$b} <=> $sample_to_gene_freq{$a} || $a cmp $b } keys %sample_to_gene_freq ) {
print { $self->_output_fh } $sample . "\t" . $sample_to_gene_freq{$sample} . "\n";
}
close($self->_output_fh);
return 1;
}

no Moose;
__PACKAGE__->meta->make_immutable;

1;
37 changes: 37 additions & 0 deletions t/Bio/Roary/UniqueGenesPerSample.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env perl
use strict;
use warnings;
use Test::Files;
use Data::Dumper;

BEGIN { unshift( @INC, './lib' ) }
$ENV{PATH} .= ":./bin";

BEGIN {
use Test::Most;
use_ok('Bio::Roary::UniqueGenesPerSample');
}

ok(
my $obj = Bio::Roary::UniqueGenesPerSample->new(
clustered_proteins => 't/data/unique_genes_per_sample/clustered_proteins_valid',
),
'Initialise object'
);

is_deeply($obj->_sample_to_gene_freq, {
'11111_4#44' => 1,
'123_4#5' => 2,
'999_4#5' => 1,
'22222_6#21' => 1
}, 'sample frequencies');


ok($obj->write_unique_frequency, 'create output file');
ok(-e $obj->output_filename, 'output file exists');

compare_ok($obj->output_filename, 't/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv', 'got expected unique gene frequency');

unlink($obj->output_filename);

done_testing();
6 changes: 6 additions & 0 deletions t/data/unique_genes_per_sample/clustered_proteins_valid
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
group_2: 123_4#5_02659 999_4#5_02659
group_2: 123_4#5_02654
group_8: 999_4#5_02651
group_7: 123_4#5_02674
nagK: 11111_4#44_01973
dnaA: 22222_6#21_00645
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
123_4#5 2
11111_4#44 1
22222_6#21 1
999_4#5 1

0 comments on commit 805b93e

Please sign in to comment.