From d47b57b20196f1d0b68bdfc010195bfa79fb98aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carn=C3=AB=20Draug?= Date: Mon, 7 Dec 2015 02:32:58 +0000 Subject: [PATCH] cluster_stats.pl: rewrite to complete replace MyLib with HistoneSequencesDB. Stop using our old MyLib. Refactor code for simplicity with tests. Sorting cluster numbers makes output always the same (sometimes it forced a rebuild when the order changed). Change warning to error when fails to find locus of transcript. --- SConstruct | 2 +- scripts/cluster_stats.pl | 201 +++++++++++++++++++++++---------------- 2 files changed, 122 insertions(+), 81 deletions(-) diff --git a/SConstruct b/SConstruct index eb6ba95..f3ff61e 100644 --- a/SConstruct +++ b/SConstruct @@ -343,7 +343,7 @@ analysis = [ env.PerlOutput( target = path4result("variables-cluster_stats.tex"), source = path4script("cluster_stats.pl"), - args = ["--sequences", seq_dir], + args = [db_store], ), env.PerlScript( target = prot_targets, diff --git a/scripts/cluster_stats.pl b/scripts/cluster_stats.pl index cd486ca..abc31c4 100755 --- a/scripts/cluster_stats.pl +++ b/scripts/cluster_stats.pl @@ -14,90 +14,131 @@ ## You should have received a copy of the GNU General Public License ## along with this program; if not, see . -use 5.010; # Use Perl 5.10 -use strict; # Enforce some good programming rules -use warnings; # Replacement for the -w flag, but lexically scoped -use List::Util; # Includes min and max +## SYNOPSIS +## +## cluster_stats.pl path/for/dbfile +## +## DESCRIPTION +## +## It will read the store file of an HistoneSequencesDB object, and +## print to stdout length of in base pairs, and the chromosome locus, +## of each cluster. +use 5.010; +use strict; +use warnings; +use List::Util; # for min and max + +use HistoneSequencesDB; use HistoneCatalogue; -use MyLib; -## This script will print stats for each of the histone clusters. It will -## include: -## (LaTeX variables with the number of histones -## in each cluster, how many are pseudo and coding, the length of each -## each cluster, and the location in the genome of each cluster) -## -## Usage is: -## -## cluster_stats.pl --sequences path/for/sequences - -## Check input options -my %path = MyLib::parse_argv("sequences"); - -## Read the data and create a data structure for the analysis -my %canon; # organized by cluster, with counts of histones and other info -my @data = MyLib::load_canonical ($path{sequences}); -foreach my $gene (@data) { - my $symbol = $$gene{'symbol'}; - my $cluster = "HIST" . $$gene{'cluster'}; - my $histone = $$gene{'histone'}; - - ## to find the start and end of each cluster, we just list all the start and - ## end coordinates. At the end, we get the min and max of them. - push (@{$canon{$cluster}{"coordinates"}}, $$gene{'start'}, $$gene{'end'}); - - ## Get the locus. - ## It is not possible to calculate it from the genomic coordinates (but - ## should be possible to make bp_genbank_ref_extractor do it). However, we - ## can find the value in the features of the transcripts files. Because of - ## that, this only works on coding genes. - my @transcripts = keys %{$$gene{"transcripts"}}; - if (@transcripts) { - my $seq = MyLib::load_seq("transcript", $transcripts[0], $path{sequences}); - my ($feature) = $seq->get_SeqFeatures("source"); - my ($locus) = $feature->get_tag_values("map"); - if ($locus =~ m/[\d]+[qp][\d\.]+/) { - push (@{$canon{$cluster}{'locus'}}, $locus); - } else { - warn ("Could not find locus for $symbol in $transcripts[0]"); +=func get_locus +Return locus of a Gene object (reads it from the metadata on its transcript). + +Implementation detail: + +It is not possible to calculate it from the genomic coordinates (but +should be possible to make bp_genbank_ref_extractor do it). However, we +can find the value in the features of the transcripts files. Because of +that, this only works on coding genes. + +Args: + db (HistoneSequencesDB) + gene (HistoneGene) + +Returns: + string with locus (arm letter and region and band) +=cut +sub get_locus +{ + my $db = shift; + my $gene = shift; + + ## XXX this is a safe assumption in histone genes but may not be so for + ## other cases. We can use the locus of a single transcript, and + ## assume it applies to the whole gene. + for my $acc ($gene->transcripts) + { + my $seq = $db->get_transcript($acc); + my ($feature) = $seq->get_SeqFeatures("source"); + my ($locus) = $feature->get_tag_values("map"); + if ($locus =~ m/[\d]+[qp][\d\.]+/) + { return $locus; } + ## else, try the next transcript } - } + die ("Could not find locus for ${gene->symbol}"); } -## Get the counts and stats for each of the clusters -foreach my $cluster_k (keys %canon) { - my $cluster = $canon{$cluster_k}; - ## Calculate the length (in bp) of each cluster - my $coord_start = List::Util::min (@{$$cluster{'coordinates'}}); - my $coord_end = List::Util::max (@{$$cluster{'coordinates'}}); - my $coord_length = abs ($coord_start - $coord_end); - say HistoneCatalogue::latex_newcommand( - $cluster_k."Span", - $coord_length, - "Span, in bp, of the histone cluster $cluster_k" - ); - - ## Get a nice LaTeX string showing the range of locus for each cluster, - ## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision - ## than others. For example, 1q21 does not mean 1q21.0, it only means - ## somewhere in 1q21. While it is tempting to just ignore it, it is not - ## correct, we should use the one with lowest precision. Luckily, sorting - ## seems to already do that for us. We are left with the problems of - ## having some genes in the short and long arm of a chromosome but a cluster - ## should not be spanning the two arms. - my @locus = @{$$cluster{'locus'}}; - if (@locus == 0) { - warn ("No locus information for cluster $cluster_k."); - } else { - my $locus_start = List::Util::minstr (@locus); - my $locus_end = List::Util::maxstr (@locus); - my $locus = $locus_start eq $locus_end ? - $locus_start : "$locus_start--$locus_end"; - say HistoneCatalogue::latex_newcommand( - $cluster_k."Locus", - $locus, - "Locus of the histone cluster $cluster_k" - ); - } +sub main +{ + if (@_ != 1) + { + print "Usage error -- no input arguments.\n"; + print "Correct usage is:\n"; + print "\n"; + print " \$ cluster_stats.pl path/for/dbfile\n"; + exit (1); + } + + my $db = HistoneSequencesDB::read_db ($_[0]); + my @canonical_core = $db->canonical_core; + + my %clusters = map {$_->cluster => 1} @canonical_core; + my @clusters = sort keys %clusters; + for my $cluster_k (@clusters) + { + my @this_cluster = grep {$_->cluster == $cluster_k} @canonical_core; + + + ## + ## Compute length in bp of each cluster + ## + + ## to find the start and end of each cluster, we just list all the + ## start and end coordinates. Then we only need the min and max. + my @coordinates; + push (@coordinates, $_->chr_start, $_->chr_end) for (@this_cluster); + my $coord_start = List::Util::min (@coordinates); + my $coord_end = List::Util::max (@coordinates); + my $coord_length = abs ($coord_start - $coord_end); + say HistoneCatalogue::latex_newcommand( + "HIST${cluster_k}Span", + $coord_length, + "Span, in bp, of the histone cluster HIST$cluster_k" + ); + + + ## + ## Compute cluster genomic locus + ## + + ## FIXME get_locus only works with coding genes... + my @coding = grep {$_->type eq "coding"} @this_cluster; + my @this_locus = map {get_locus($db, $_)} @coding; + + ## Get a nice LaTeX string showing the range of locus for each cluster, + ## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision + ## than others. For example, 1q21 does not mean 1q21.0, it only means + ## somewhere in 1q21. While it is tempting to just ignore it, it is not + ## correct, we should use the one with lowest precision. Luckily, sorting + ## seems to already do that for us. We are left with the problems of + ## having some genes in the short and long arm of a chromosome but a cluster + ## should not be spanning the two arms. + if (@this_locus == 0) { + warn ("No locus information for cluster HIST$cluster_k."); + } else { + my $locus_start = List::Util::minstr (@this_locus); + my $locus_end = List::Util::maxstr (@this_locus); + my $locus = $locus_start eq $locus_end ? + $locus_start : "$locus_start--$locus_end"; + say HistoneCatalogue::latex_newcommand( + "HIST${cluster_k}Locus", + $locus, + "Locus of the histone cluster HIST$cluster_k" + ); + } + } + return 0; } + +main (@ARGV);