cluster_stats.pl: rewrite to complete replace MyLib with HistoneSeque…

…ncesDB. Stop using our old MyLib. Refactor code for simplicity with tests. Sorting cluster numbers makes output always the same (sometimes it forced a rebuild when the order changed). Change warning to error when fails to find locus of transcript.
af-lab · Dec 7, 2015 · d47b57b · d47b57b
1 parent fd87520
commit d47b57b
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 81 deletions.
diff --git a/SConstruct b/SConstruct
@@ -343,7 +343,7 @@ analysis = [
   env.PerlOutput(
     target = path4result("variables-cluster_stats.tex"),
     source = path4script("cluster_stats.pl"),
-    args   = ["--sequences", seq_dir],
+    args   = [db_store],
   ),
   env.PerlScript(
     target = prot_targets,

diff --git a/scripts/cluster_stats.pl b/scripts/cluster_stats.pl
@@ -14,90 +14,131 @@
 ## You should have received a copy of the GNU General Public License
 ## along with this program; if not, see <http://www.gnu.org/licenses/>.
 
-use 5.010;                      # Use Perl 5.10
-use strict;                     # Enforce some good programming rules
-use warnings;                   # Replacement for the -w flag, but lexically scoped
-use List::Util;                 # Includes min and max
+## SYNOPSIS
+##
+##   cluster_stats.pl path/for/dbfile
+##
+## DESCRIPTION
+##
+## It will read the store file of an HistoneSequencesDB object, and
+## print to stdout length of in base pairs, and the chromosome locus,
+## of each cluster.
 
+use 5.010;
+use strict;
+use warnings;
+use List::Util; # for min and max
+
+use HistoneSequencesDB;
 use HistoneCatalogue;
-use MyLib;
 
-## This script will print stats for each of the histone clusters. It will
-## include:
-##    (LaTeX variables with the number of histones
-##    in each cluster, how many are pseudo and coding, the length of each
-##    each cluster, and the location in the genome of each cluster)
-##
-## Usage is:
-##
-## cluster_stats.pl --sequences path/for/sequences
-
-## Check input options
-my %path = MyLib::parse_argv("sequences");
-
-## Read the data and create a data structure for the analysis
-my %canon;   # organized by cluster, with counts of histones and other info
-my @data = MyLib::load_canonical ($path{sequences});
-foreach my $gene (@data) {
-  my $symbol  = $$gene{'symbol'};
-  my $cluster = "HIST" . $$gene{'cluster'};
-  my $histone = $$gene{'histone'};
-
-  ## to find the start and end of each cluster, we just list all the start and
-  ## end coordinates. At the end, we get the min and max of them.
-  push (@{$canon{$cluster}{"coordinates"}}, $$gene{'start'}, $$gene{'end'});
-
-  ## Get the locus.
-  ## It is not possible to calculate it from the genomic coordinates (but
-  ## should be possible to make bp_genbank_ref_extractor do it). However, we
-  ## can find the value in the features of the transcripts files. Because of
-  ## that, this only works on coding genes.
-  my @transcripts = keys %{$$gene{"transcripts"}};
-  if (@transcripts) {
-    my  $seq      = MyLib::load_seq("transcript", $transcripts[0], $path{sequences});
-    my ($feature) = $seq->get_SeqFeatures("source");
-    my ($locus)   = $feature->get_tag_values("map");
-    if ($locus =~ m/[\d]+[qp][\d\.]+/) {
-      push (@{$canon{$cluster}{'locus'}}, $locus);
-    } else {
-      warn ("Could not find locus for $symbol in $transcripts[0]");
+=func get_locus
+Return locus of a Gene object (reads it from the metadata on its transcript).
+
+Implementation detail:
+
+It is not possible to calculate it from the genomic coordinates (but
+should be possible to make bp_genbank_ref_extractor do it). However, we
+can find the value in the features of the transcripts files. Because of
+that, this only works on coding genes.
+
+Args:
+  db (HistoneSequencesDB)
+  gene (HistoneGene)
+
+Returns:
+  string with locus (arm letter and region and band)
+=cut
+sub get_locus
+{
+  my $db = shift;
+  my $gene = shift;
+
+  ## XXX this is a safe assumption in histone genes but may not be so for
+  ##      other cases.  We can use the locus of a single transcript, and
+  ##      assume it applies to the whole gene.
+  for my $acc ($gene->transcripts)
+    {
+      my $seq = $db->get_transcript($acc);
+      my ($feature) = $seq->get_SeqFeatures("source");
+      my ($locus)   = $feature->get_tag_values("map");
+      if ($locus =~ m/[\d]+[qp][\d\.]+/)
+        { return $locus; }
+      ## else, try the next transcript
     }
-  }
+  die ("Could not find locus for ${gene->symbol}");
 }
 
-## Get the counts and stats for each of the clusters
-foreach my $cluster_k (keys %canon) {
-  my $cluster = $canon{$cluster_k};
-  ## Calculate the length (in bp) of each cluster
-  my $coord_start  = List::Util::min (@{$$cluster{'coordinates'}});
-  my $coord_end    = List::Util::max (@{$$cluster{'coordinates'}});
-  my $coord_length = abs ($coord_start - $coord_end);
-  say HistoneCatalogue::latex_newcommand(
-    $cluster_k."Span",
-    $coord_length,
-    "Span, in bp, of the histone cluster $cluster_k"
-  );
-
-  ## Get a nice LaTeX string showing the range of locus for each cluster,
-  ## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision
-  ## than others. For example, 1q21 does not mean 1q21.0, it only means
-  ## somewhere in 1q21. While it is tempting to just ignore it, it is not
-  ## correct, we should use the one with lowest precision. Luckily, sorting
-  ## seems to already do that for us. We are left with the problems of
-  ## having some genes in the short and long arm of a chromosome but a cluster
-  ## should not be spanning the two arms.
-  my @locus = @{$$cluster{'locus'}};
-  if (@locus == 0) {
-    warn ("No locus information for cluster $cluster_k.");
-  } else {
-    my $locus_start = List::Util::minstr (@locus);
-    my $locus_end   = List::Util::maxstr (@locus);
-    my $locus = $locus_start eq $locus_end ?
-                $locus_start : "$locus_start--$locus_end";
-    say HistoneCatalogue::latex_newcommand(
-      $cluster_k."Locus",
-      $locus,
-      "Locus of the histone cluster $cluster_k"
-    );
-  }
+sub main
+{
+  if (@_ != 1)
+    {
+      print "Usage error -- no input arguments.\n";
+      print "Correct usage is:\n";
+      print "\n";
+      print "  \$ cluster_stats.pl path/for/dbfile\n";
+      exit (1);
+    }
+
+  my $db = HistoneSequencesDB::read_db ($_[0]);
+  my @canonical_core = $db->canonical_core;
+
+  my %clusters = map {$_->cluster => 1} @canonical_core;
+  my @clusters = sort keys %clusters;
+  for my $cluster_k (@clusters)
+    {
+      my @this_cluster = grep {$_->cluster == $cluster_k} @canonical_core;
+
+
+      ##
+      ## Compute length in bp of each cluster
+      ##
+
+      ## to find the start and end of each cluster, we just list all the
+      ## start and end coordinates. Then we only need the min and max.
+      my @coordinates;
+      push (@coordinates, $_->chr_start, $_->chr_end) for (@this_cluster);
+      my $coord_start  = List::Util::min (@coordinates);
+      my $coord_end    = List::Util::max (@coordinates);
+      my $coord_length = abs ($coord_start - $coord_end);
+      say HistoneCatalogue::latex_newcommand(
+        "HIST${cluster_k}Span",
+        $coord_length,
+        "Span, in bp, of the histone cluster HIST$cluster_k"
+      );
+
+
+      ##
+      ## Compute cluster genomic locus
+      ##
+
+      ## FIXME get_locus only works with coding genes...
+      my @coding = grep {$_->type eq "coding"} @this_cluster;
+      my @this_locus = map {get_locus($db, $_)} @coding;
+
+      ## Get a nice LaTeX string showing the range of locus for each cluster,
+      ## e.g, 6p21.3--6p22.2. The problem is that some locus have less precision
+      ## than others. For example, 1q21 does not mean 1q21.0, it only means
+      ## somewhere in 1q21. While it is tempting to just ignore it, it is not
+      ## correct, we should use the one with lowest precision. Luckily, sorting
+      ## seems to already do that for us. We are left with the problems of
+      ## having some genes in the short and long arm of a chromosome but a cluster
+      ## should not be spanning the two arms.
+      if (@this_locus == 0) {
+        warn ("No locus information for cluster HIST$cluster_k.");
+      } else {
+        my $locus_start = List::Util::minstr (@this_locus);
+        my $locus_end   = List::Util::maxstr (@this_locus);
+        my $locus = $locus_start eq $locus_end ?
+                    $locus_start : "$locus_start--$locus_end";
+        say HistoneCatalogue::latex_newcommand(
+          "HIST${cluster_k}Locus",
+          $locus,
+          "Locus of the histone cluster HIST$cluster_k"
+        );
+      }
+    }
+  return 0;
 }
+
+main (@ARGV);