generate file with unique genes per sample

andrewjpage · Aug 9, 2017 · 805b93e · 805b93e
1 parent f8ed5c4
commit 805b93e
Show file tree

Hide file tree

Showing 7 changed files with 238 additions and 1 deletion.
diff --git a/bin/roary-unique_genes_per_sample b/bin/roary-unique_genes_per_sample
@@ -0,0 +1,19 @@
+#!/usr/bin/env perl
+
+package Bio::Roary::Main::UniqueGenesPerSample;
+
+# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+# PODNAME: roary-unique_genes_per_sample
+
+=head1 SYNOPSIS
+
+Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+
+=cut
+
+use Cwd qw(abs_path); 
+BEGIN { unshift( @INC, abs_path('./lib') ) }
+BEGIN { unshift( @INC, abs_path('./t/lib') ) }
+use Bio::Roary::CommandLine::UniqueGenesPerSample;
+
+Bio::Roary::CommandLine::UniqueGenesPerSample->new(args => \@ARGV, script_name => $0)->run;
diff --git a/dist.ini b/dist.ini
@@ -1,5 +1,5 @@
 name    = Bio-Roary
-version = 3.8.2
+version = 3.9.0
 author  = Andrew J. Page <[email protected]>
 license = GPL_3
 copyright_holder = Wellcome Trust Sanger Institute

diff --git a/lib/Bio/Roary/CommandLine/UniqueGenesPerSample.pm b/lib/Bio/Roary/CommandLine/UniqueGenesPerSample.pm
@@ -0,0 +1,91 @@
+undef $VERSION;
+
+package Bio::Roary::CommandLine::UniqueGenesPerSample;
+
+# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+
+=head1 SYNOPSIS
+
+Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+
+=cut
+
+use Moose;
+use Getopt::Long qw(GetOptionsFromArray);
+extends 'Bio::Roary::CommandLine::Common';
+
+has 'args'        => ( is => 'ro', isa => 'ArrayRef', required => 1 );
+has 'script_name' => ( is => 'ro', isa => 'Str',      required => 1 );
+has 'help'        => ( is => 'rw', isa => 'Bool',     default  => 0 );
+
+has 'clustered_proteins' => ( is => 'rw', isa => 'Str',  default => 'clustered_proteins' );
+has 'output_filename'    => ( is => 'rw', isa => 'Str',  default => 'unique_genes_per_sample.tsv' );
+has 'verbose'            => ( is => 'rw', isa => 'Bool', default => 0 );
+has '_error_message'     => ( is => 'rw', isa => 'Str' );
+
+sub BUILD {
+    my ($self) = @_;
+
+    my ( $clustered_proteins, $output_filename, $verbose, $help );
+
+    GetOptionsFromArray(
+        $self->args,
+        'o|output=s'             => \$output_filename,
+        'c|clustered_proteins=s' => \$clustered_proteins,
+        'v|verbose'              => \$verbose,
+        'h|help'                 => \$help,
+    );
+
+    if ( defined($verbose) ) {
+        $self->verbose($verbose);
+        $self->logger->level(10000);
+    }
+
+    $self->help($help) if ( defined($help) );
+    ( !$self->help ) or die $self->usage_text;
+
+    $self->output_filename($output_filename) if ( defined($output_filename) );
+    if ( defined($clustered_proteins) && ( -e $clustered_proteins ) ) {
+        $self->clustered_proteins($clustered_proteins);
+    }
+    else {
+        $self->_error_message("Error: Cant access the clustered proteins file");
+    }
+}
+
+sub run {
+    my ($self) = @_;
+
+    if ( defined( $self->_error_message ) ) {
+        print $self->_error_message . "\n";
+        die $self->usage_text;
+    }
+
+    my $obj = Bio::Roary::UniqueGenesPerSample->new(
+        gff_files       => $self->gff_files,
+        output_filename => $self->output_filename,
+        groups_filename => $self->groups_filename,
+    );
+    $obj->reannotate;
+
+}
+
+sub usage_text {
+    my ($self) = @_;
+
+    return <<USAGE;
+Usage: roary-unique_genes_per_sample [options] -c clustered_proteins
+Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+
+Options: -o STR output filename [unique_genes_per_sample.tsv]
+         -c STR clusters filename [clustered_proteins]
+         -v     verbose output to STDOUT
+         -h     this help message
+
+For further info see: http://sanger-pathogens.github.io/Roary/
+USAGE
+}
+
+__PACKAGE__->meta->make_immutable;
+no Moose;
+1;
diff --git a/lib/Bio/Roary/UniqueGenesPerSample.pm b/lib/Bio/Roary/UniqueGenesPerSample.pm
@@ -0,0 +1,80 @@
+package Bio::Roary::UniqueGenesPerSample;
+
+# ABSTRACT:  Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+
+=head1 SYNOPSIS
+
+Take in the clustered file and produce a sorted file with the frequency of each samples unique genes
+   use Bio::Roary::UniqueGenesPerSample;
+   
+   my $obj = Bio::Roary::SequenceLengths->new(
+     clustered_proteins   => 'clustered_proteins',
+     output_filename   => 'output_filename',
+   );
+   $obj->write_unique_frequency;
+
+=cut
+
+use Moose;
+use Bio::Roary::Exceptions;
+
+has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
+has 'output_filename'    => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' );
+
+has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
+
+sub _build__output_fh {
+    my ($self) = @_;
+    open( my $fh, '>', $self->output_filename )
+      or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename );
+    return $fh;
+}
+
+#group_17585: 14520_6#21_00645
+sub _sample_to_gene_freq {
+    my ($self) = @_;
+
+    open( my $input_fh, $self->clustered_proteins )
+      or Bio::Roary::Exceptions::FileNotFound->throw( error => "Couldnt read input file:" . $self->clustered_proteins );
+
+    my %sample_to_gene_freq;
+    while (<$input_fh>) {
+        chomp;
+        my $line = $_;
+        next if ( length( $line ) < 6 );
+        if ( $line =~ /^.+: ([^\s]+)$/ ) {
+            my $gene_id = $1;
+            if ( $gene_id =~ /^(.+)_[\d]+$/ ) {
+                my $sample_name = $1;
+                $sample_to_gene_freq{$sample_name}++;
+            }
+            else {
+                # gene id may not be valid so ignore
+                next;
+            }
+        }
+        else {
+            # its either an invalid line or theres more than 1 gene in the cluster
+            next;
+        }
+    }
+
+    return \%sample_to_gene_freq;
+}
+
+sub write_unique_frequency {
+    my ($self) = @_;
+
+    my %sample_to_gene_freq = %{$self->_sample_to_gene_freq};
+
+    for my $sample ( sort { $sample_to_gene_freq{$b} <=> $sample_to_gene_freq{$a}  || $a cmp $b } keys %sample_to_gene_freq ) {
+        print { $self->_output_fh } $sample . "\t" . $sample_to_gene_freq{$sample} . "\n";
+    }
+	close($self->_output_fh);
+	return 1;
+}
+
+no Moose;
+__PACKAGE__->meta->make_immutable;
+
+1;
diff --git a/t/Bio/Roary/UniqueGenesPerSample.t b/t/Bio/Roary/UniqueGenesPerSample.t
@@ -0,0 +1,37 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::Files;
+use Data::Dumper;
+
+BEGIN { unshift( @INC, './lib' ) }
+$ENV{PATH} .= ":./bin";
+
+BEGIN {
+    use Test::Most;
+    use_ok('Bio::Roary::UniqueGenesPerSample');
+}
+
+ok(
+    my $obj = Bio::Roary::UniqueGenesPerSample->new(
+        clustered_proteins => 't/data/unique_genes_per_sample/clustered_proteins_valid',
+    ),
+    'Initialise object'
+);
+
+is_deeply($obj->_sample_to_gene_freq, {
+          '11111_4#44' => 1,
+          '123_4#5' => 2,
+          '999_4#5' => 1,
+          '22222_6#21' => 1
+        }, 'sample frequencies');
+
+
+ok($obj->write_unique_frequency, 'create output file');
+ok(-e $obj->output_filename, 'output file exists');
+
+compare_ok($obj->output_filename, 't/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv', 'got expected unique gene frequency');
+
+unlink($obj->output_filename);
+
+done_testing();
diff --git a/t/data/unique_genes_per_sample/clustered_proteins_valid b/t/data/unique_genes_per_sample/clustered_proteins_valid
@@ -0,0 +1,6 @@
+group_2: 123_4#5_02659 999_4#5_02659
+group_2: 123_4#5_02654
+group_8: 999_4#5_02651
+group_7: 123_4#5_02674
+nagK: 11111_4#44_01973
+dnaA: 22222_6#21_00645
diff --git a/t/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv b/t/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv
@@ -0,0 +1,4 @@
+123_4#5	2
+11111_4#44	1
+22222_6#21	1
+999_4#5	1
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
+_4#5	2
+_4#44	1
+_6#21	1
+_4#5	1