Skip to content

Commit

Permalink
cluster_stats.pl: rewrite to complete replace MyLib with HistoneSeque…
Browse files Browse the repository at this point in the history
…ncesDB.

Stop using our old MyLib.  Refactor code for simplicity with tests.  Sorting
cluster numbers makes output always the same (sometimes it forced a rebuild
when the order changed).

Change warning to error when fails to find locus of transcript.
  • Loading branch information
carandraug committed Dec 7, 2015
1 parent fd87520 commit d47b57b
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 81 deletions.
2 changes: 1 addition & 1 deletion SConstruct
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ analysis = [
env.PerlOutput(
target = path4result("variables-cluster_stats.tex"),
source = path4script("cluster_stats.pl"),
args = ["--sequences", seq_dir],
args = [db_store],
),
env.PerlScript(
target = prot_targets,
Expand Down
201 changes: 121 additions & 80 deletions scripts/cluster_stats.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,90 +14,131 @@
## You should have received a copy of the GNU General Public License
## along with this program; if not, see <http://www.gnu.org/licenses/>.

use 5.010; # Use Perl 5.10
use strict; # Enforce some good programming rules
use warnings; # Replacement for the -w flag, but lexically scoped
use List::Util; # Includes min and max
## SYNOPSIS
##
## cluster_stats.pl path/for/dbfile
##
## DESCRIPTION
##
## It will read the store file of an HistoneSequencesDB object, and
## print to stdout length of in base pairs, and the chromosome locus,
## of each cluster.

use 5.010;
use strict;
use warnings;
use List::Util; # for min and max

use HistoneSequencesDB;
use HistoneCatalogue;
use MyLib;

## This script will print stats for each of the histone clusters. It will
## include:
## (LaTeX variables with the number of histones
## in each cluster, how many are pseudo and coding, the length of each
## each cluster, and the location in the genome of each cluster)
##
## Usage is:
##
## cluster_stats.pl --sequences path/for/sequences

## Check input options
my %path = MyLib::parse_argv("sequences");

## Read the data and create a data structure for the analysis
my %canon; # organized by cluster, with counts of histones and other info
my @data = MyLib::load_canonical ($path{sequences});
foreach my $gene (@data) {
my $symbol = $$gene{'symbol'};
my $cluster = "HIST" . $$gene{'cluster'};
my $histone = $$gene{'histone'};

## to find the start and end of each cluster, we just list all the start and
## end coordinates. At the end, we get the min and max of them.
push (@{$canon{$cluster}{"coordinates"}}, $$gene{'start'}, $$gene{'end'});

## Get the locus.
## It is not possible to calculate it from the genomic coordinates (but
## should be possible to make bp_genbank_ref_extractor do it). However, we
## can find the value in the features of the transcripts files. Because of
## that, this only works on coding genes.
my @transcripts = keys %{$$gene{"transcripts"}};
if (@transcripts) {
my $seq = MyLib::load_seq("transcript", $transcripts[0], $path{sequences});
my ($feature) = $seq->get_SeqFeatures("source");
my ($locus) = $feature->get_tag_values("map");
if ($locus =~ m/[\d]+[qp][\d\.]+/) {
push (@{$canon{$cluster}{'locus'}}, $locus);
} else {
warn ("Could not find locus for $symbol in $transcripts[0]");
=func get_locus
Return locus of a Gene object (reads it from the metadata on its transcript).
Implementation detail:
It is not possible to calculate it from the genomic coordinates (but
should be possible to make bp_genbank_ref_extractor do it). However, we
can find the value in the features of the transcripts files. Because of
that, this only works on coding genes.
Args:
db (HistoneSequencesDB)
gene (HistoneGene)
Returns:
string with locus (arm letter and region and band)
=cut
sub get_locus
{
my $db = shift;
my $gene = shift;

## XXX this is a safe assumption in histone genes but may not be so for
## other cases. We can use the locus of a single transcript, and
## assume it applies to the whole gene.
for my $acc ($gene->transcripts)
{
my $seq = $db->get_transcript($acc);
my ($feature) = $seq->get_SeqFeatures("source");
my ($locus) = $feature->get_tag_values("map");
if ($locus =~ m/[\d]+[qp][\d\.]+/)
{ return $locus; }
## else, try the next transcript
}
}
die ("Could not find locus for ${gene->symbol}");
}

## Get the counts and stats for each of the clusters
foreach my $cluster_k (keys %canon) {
my $cluster = $canon{$cluster_k};
## Calculate the length (in bp) of each cluster
my $coord_start = List::Util::min (@{$$cluster{'coordinates'}});
my $coord_end = List::Util::max (@{$$cluster{'coordinates'}});
my $coord_length = abs ($coord_start - $coord_end);
say HistoneCatalogue::latex_newcommand(
$cluster_k."Span",
$coord_length,
"Span, in bp, of the histone cluster $cluster_k"
);

## Get a nice LaTeX string showing the range of locus for each cluster,
## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision
## than others. For example, 1q21 does not mean 1q21.0, it only means
## somewhere in 1q21. While it is tempting to just ignore it, it is not
## correct, we should use the one with lowest precision. Luckily, sorting
## seems to already do that for us. We are left with the problems of
## having some genes in the short and long arm of a chromosome but a cluster
## should not be spanning the two arms.
my @locus = @{$$cluster{'locus'}};
if (@locus == 0) {
warn ("No locus information for cluster $cluster_k.");
} else {
my $locus_start = List::Util::minstr (@locus);
my $locus_end = List::Util::maxstr (@locus);
my $locus = $locus_start eq $locus_end ?
$locus_start : "$locus_start--$locus_end";
say HistoneCatalogue::latex_newcommand(
$cluster_k."Locus",
$locus,
"Locus of the histone cluster $cluster_k"
);
}
sub main
{
if (@_ != 1)
{
print "Usage error -- no input arguments.\n";
print "Correct usage is:\n";
print "\n";
print " \$ cluster_stats.pl path/for/dbfile\n";
exit (1);
}

my $db = HistoneSequencesDB::read_db ($_[0]);
my @canonical_core = $db->canonical_core;

my %clusters = map {$_->cluster => 1} @canonical_core;
my @clusters = sort keys %clusters;
for my $cluster_k (@clusters)
{
my @this_cluster = grep {$_->cluster == $cluster_k} @canonical_core;


##
## Compute length in bp of each cluster
##

## to find the start and end of each cluster, we just list all the
## start and end coordinates. Then we only need the min and max.
my @coordinates;
push (@coordinates, $_->chr_start, $_->chr_end) for (@this_cluster);
my $coord_start = List::Util::min (@coordinates);
my $coord_end = List::Util::max (@coordinates);
my $coord_length = abs ($coord_start - $coord_end);
say HistoneCatalogue::latex_newcommand(
"HIST${cluster_k}Span",
$coord_length,
"Span, in bp, of the histone cluster HIST$cluster_k"
);


##
## Compute cluster genomic locus
##

## FIXME get_locus only works with coding genes...
my @coding = grep {$_->type eq "coding"} @this_cluster;
my @this_locus = map {get_locus($db, $_)} @coding;

## Get a nice LaTeX string showing the range of locus for each cluster,
## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision
## than others. For example, 1q21 does not mean 1q21.0, it only means
## somewhere in 1q21. While it is tempting to just ignore it, it is not
## correct, we should use the one with lowest precision. Luckily, sorting
## seems to already do that for us. We are left with the problems of
## having some genes in the short and long arm of a chromosome but a cluster
## should not be spanning the two arms.
if (@this_locus == 0) {
warn ("No locus information for cluster HIST$cluster_k.");
} else {
my $locus_start = List::Util::minstr (@this_locus);
my $locus_end = List::Util::maxstr (@this_locus);
my $locus = $locus_start eq $locus_end ?
$locus_start : "$locus_start--$locus_end";
say HistoneCatalogue::latex_newcommand(
"HIST${cluster_k}Locus",
$locus,
"Locus of the histone cluster HIST$cluster_k"
);
}
}
return 0;
}

main (@ARGV);

0 comments on commit d47b57b

Please sign in to comment.