forked from sanger-pathogens/Roary
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
generate file with unique genes per sample
- Loading branch information
1 parent
f8ed5c4
commit 805b93e
Showing
7 changed files
with
238 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/usr/bin/env perl | ||
|
||
package Bio::Roary::Main::UniqueGenesPerSample; | ||
|
||
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
# PODNAME: roary-unique_genes_per_sample | ||
|
||
=head1 SYNOPSIS | ||
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
=cut | ||
|
||
use Cwd qw(abs_path); | ||
BEGIN { unshift( @INC, abs_path('./lib') ) } | ||
BEGIN { unshift( @INC, abs_path('./t/lib') ) } | ||
use Bio::Roary::CommandLine::UniqueGenesPerSample; | ||
|
||
Bio::Roary::CommandLine::UniqueGenesPerSample->new(args => \@ARGV, script_name => $0)->run; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
name = Bio-Roary | ||
version = 3.8.2 | ||
version = 3.9.0 | ||
author = Andrew J. Page <[email protected]> | ||
license = GPL_3 | ||
copyright_holder = Wellcome Trust Sanger Institute | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
undef $VERSION; | ||
|
||
package Bio::Roary::CommandLine::UniqueGenesPerSample; | ||
|
||
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
|
||
=head1 SYNOPSIS | ||
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
=cut | ||
|
||
use Moose; | ||
use Getopt::Long qw(GetOptionsFromArray); | ||
extends 'Bio::Roary::CommandLine::Common'; | ||
|
||
has 'args' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); | ||
has 'script_name' => ( is => 'ro', isa => 'Str', required => 1 ); | ||
has 'help' => ( is => 'rw', isa => 'Bool', default => 0 ); | ||
|
||
has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); | ||
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' ); | ||
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); | ||
has '_error_message' => ( is => 'rw', isa => 'Str' ); | ||
|
||
sub BUILD { | ||
my ($self) = @_; | ||
|
||
my ( $clustered_proteins, $output_filename, $verbose, $help ); | ||
|
||
GetOptionsFromArray( | ||
$self->args, | ||
'o|output=s' => \$output_filename, | ||
'c|clustered_proteins=s' => \$clustered_proteins, | ||
'v|verbose' => \$verbose, | ||
'h|help' => \$help, | ||
); | ||
|
||
if ( defined($verbose) ) { | ||
$self->verbose($verbose); | ||
$self->logger->level(10000); | ||
} | ||
|
||
$self->help($help) if ( defined($help) ); | ||
( !$self->help ) or die $self->usage_text; | ||
|
||
$self->output_filename($output_filename) if ( defined($output_filename) ); | ||
if ( defined($clustered_proteins) && ( -e $clustered_proteins ) ) { | ||
$self->clustered_proteins($clustered_proteins); | ||
} | ||
else { | ||
$self->_error_message("Error: Cant access the clustered proteins file"); | ||
} | ||
} | ||
|
||
sub run { | ||
my ($self) = @_; | ||
|
||
if ( defined( $self->_error_message ) ) { | ||
print $self->_error_message . "\n"; | ||
die $self->usage_text; | ||
} | ||
|
||
my $obj = Bio::Roary::UniqueGenesPerSample->new( | ||
gff_files => $self->gff_files, | ||
output_filename => $self->output_filename, | ||
groups_filename => $self->groups_filename, | ||
); | ||
$obj->reannotate; | ||
|
||
} | ||
|
||
sub usage_text { | ||
my ($self) = @_; | ||
|
||
return <<USAGE; | ||
Usage: roary-unique_genes_per_sample [options] -c clustered_proteins | ||
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
Options: -o STR output filename [unique_genes_per_sample.tsv] | ||
-c STR clusters filename [clustered_proteins] | ||
-v verbose output to STDOUT | ||
-h this help message | ||
For further info see: http://sanger-pathogens.github.io/Roary/ | ||
USAGE | ||
} | ||
|
||
__PACKAGE__->meta->make_immutable; | ||
no Moose; | ||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package Bio::Roary::UniqueGenesPerSample; | ||
|
||
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
|
||
=head1 SYNOPSIS | ||
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes | ||
use Bio::Roary::UniqueGenesPerSample; | ||
my $obj = Bio::Roary::SequenceLengths->new( | ||
clustered_proteins => 'clustered_proteins', | ||
output_filename => 'output_filename', | ||
); | ||
$obj->write_unique_frequency; | ||
=cut | ||
|
||
use Moose; | ||
use Bio::Roary::Exceptions; | ||
|
||
has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' ); | ||
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' ); | ||
|
||
has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' ); | ||
|
||
sub _build__output_fh { | ||
my ($self) = @_; | ||
open( my $fh, '>', $self->output_filename ) | ||
or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename ); | ||
return $fh; | ||
} | ||
|
||
#group_17585: 14520_6#21_00645 | ||
sub _sample_to_gene_freq { | ||
my ($self) = @_; | ||
|
||
open( my $input_fh, $self->clustered_proteins ) | ||
or Bio::Roary::Exceptions::FileNotFound->throw( error => "Couldnt read input file:" . $self->clustered_proteins ); | ||
|
||
my %sample_to_gene_freq; | ||
while (<$input_fh>) { | ||
chomp; | ||
my $line = $_; | ||
next if ( length( $line ) < 6 ); | ||
if ( $line =~ /^.+: ([^\s]+)$/ ) { | ||
my $gene_id = $1; | ||
if ( $gene_id =~ /^(.+)_[\d]+$/ ) { | ||
my $sample_name = $1; | ||
$sample_to_gene_freq{$sample_name}++; | ||
} | ||
else { | ||
# gene id may not be valid so ignore | ||
next; | ||
} | ||
} | ||
else { | ||
# its either an invalid line or theres more than 1 gene in the cluster | ||
next; | ||
} | ||
} | ||
|
||
return \%sample_to_gene_freq; | ||
} | ||
|
||
sub write_unique_frequency { | ||
my ($self) = @_; | ||
|
||
my %sample_to_gene_freq = %{$self->_sample_to_gene_freq}; | ||
|
||
for my $sample ( sort { $sample_to_gene_freq{$b} <=> $sample_to_gene_freq{$a} || $a cmp $b } keys %sample_to_gene_freq ) { | ||
print { $self->_output_fh } $sample . "\t" . $sample_to_gene_freq{$sample} . "\n"; | ||
} | ||
close($self->_output_fh); | ||
return 1; | ||
} | ||
|
||
no Moose; | ||
__PACKAGE__->meta->make_immutable; | ||
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env perl | ||
use strict; | ||
use warnings; | ||
use Test::Files; | ||
use Data::Dumper; | ||
|
||
BEGIN { unshift( @INC, './lib' ) } | ||
$ENV{PATH} .= ":./bin"; | ||
|
||
BEGIN { | ||
use Test::Most; | ||
use_ok('Bio::Roary::UniqueGenesPerSample'); | ||
} | ||
|
||
ok( | ||
my $obj = Bio::Roary::UniqueGenesPerSample->new( | ||
clustered_proteins => 't/data/unique_genes_per_sample/clustered_proteins_valid', | ||
), | ||
'Initialise object' | ||
); | ||
|
||
is_deeply($obj->_sample_to_gene_freq, { | ||
'11111_4#44' => 1, | ||
'123_4#5' => 2, | ||
'999_4#5' => 1, | ||
'22222_6#21' => 1 | ||
}, 'sample frequencies'); | ||
|
||
|
||
ok($obj->write_unique_frequency, 'create output file'); | ||
ok(-e $obj->output_filename, 'output file exists'); | ||
|
||
compare_ok($obj->output_filename, 't/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv', 'got expected unique gene frequency'); | ||
|
||
unlink($obj->output_filename); | ||
|
||
done_testing(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
group_2: 123_4#5_02659 999_4#5_02659 | ||
group_2: 123_4#5_02654 | ||
group_8: 999_4#5_02651 | ||
group_7: 123_4#5_02674 | ||
nagK: 11111_4#44_01973 | ||
dnaA: 22222_6#21_00645 |
4 changes: 4 additions & 0 deletions
4
t/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
123_4#5 2 | ||
11111_4#44 1 | ||
22222_6#21 1 | ||
999_4#5 1 |