bb.fastaconsensus

#!/usr/bin/perl
#----------------------------------------------------------#
#        Author: Douglas Senalik dsenalik@wisc.edu         #
#----------------------------------------------------------#
# "Black Box" program series
=bb
Generate a consensus of a multi-fasta alignment
=cut bb
use strict;
use warnings;
use Getopt::Long;      # for getting command line parameters

                # 1.1 add --dropinvariant and some error checking
                # 1.2 May 29, 2016, add --table, fix lower case problem
my $ver="1.3";  # 1.3 July 12, 2016, add --gapfraction


############################################################
# configuration variables
############################################################
my $defaultunknown = '?*-';     # default for --unknown
my $defaultwrap = 0;            # default for --wrap, 0 = no wrapping
my $defaultmajority = 20;       # default for --majority, in percent
my $defaultgapfraction = 1;     # default for --gapfraction
# Lookup for degenerate bases. The key is the set of characters selected
# at one alignment column, joined in alphabetical order; the value is the
# single character written to the consensus. An 'N' mixed with other bases
# does not change the result (e.g. 'ACN' gives the same code as 'AC'), and
# this holds for every combination of one to four bases.
my %degen = (  # lookup for degenerate bases
  'A'    => 'A',
  'AC'   => 'M',
  'ACG'  => 'V',
  'ACGN' => 'V',
  'ACGNT'=> 'N',
  'ACGT' => 'N',
  'ACN'  => 'M',
  'ACNT' => 'H',
  'ACT'  => 'H',
  'AG'   => 'R',
  'AGN'  => 'R',
  'AGNT' => 'D',
  'AGT'  => 'D',
  'AN'   => 'A',
  'AT'   => 'W',
  'ANT'  => 'W',
  'C'    => 'C',
  'CG'   => 'S',
  'CGN'  => 'S',
  'CGNT' => 'B',
  'CGT'  => 'B',
  'CN'   => 'C',
  'CT'   => 'Y',
  'CNT'  => 'Y',
  'G'    => 'G',
  'GN'   => 'G',
  'GNT'  => 'K',
  'GT'   => 'K',
  'N'    => 'N',
  'NT'   => 'T',
  'T'    => 'T' );


############################################################
# global variables
############################################################
my $ansiup    = "\033[1A";  # terminal control
(my $prognopath = $0) =~ s/^.*[\/\\]//;  # program name with any leading path removed
my @seq; # sequences loaded from $infilename
my %unk; # unknown base index from $unknown
my @counts;   # per-column character tallies, $counts[column]->{character}
my %allbases; # tally of every character seen anywhere in the alignment


############################################################
# command line parameters
############################################################
my $infilename   = "";   # input file name
my $outfilename  = "";   # output file name
my $consensusonly;
my $unknown      = $defaultunknown;
my $majority     = $defaultmajority;
my $gapfraction  = $defaultgapfraction;
my $nodegen      = 0;
my $wrap         = $defaultwrap;
my $report;
my $dropinvariant;
my $table;
my $help         = 0;    # print help and exit
my $quiet        = 0;    # only show errors
my $debug        = 0;    # print extra debugging information
# GetOptions returns false when it could not parse the command line (it has
# then already reported the problem on STDERR). Continuing with a partly
# parsed command line would silently fall back to defaults, so show the help
# screen instead.
GetOptions (
            "infile=s"       => \$infilename,        # string
            "outfile=s"      => \$outfilename,       # string
            "majority=s"     => \$majority,          # string(real)
            "gapfraction=s"  => \$gapfraction,       # string(real)
            "nodegenerate"   => \$nodegen,           # flag
            "consensusonly"  => \$consensusonly,     # flag
            "report:s"       => \$report,            # flag|filename
            "table=s"        => \$table,             # filename
            "unknown=s"      => \$unknown,           # string(char list)
            "wrap=i"         => \$wrap,              # integer
            "dropinvariant"  => \$dropinvariant,     # flag
            "help"           => \$help,              # flag
            "quiet"          => \$quiet,             # flag
            "debug"          => \$debug)             # flag
  or $help = 1;
# debug implies not quiet
if ( $debug ) { $quiet = 0; }
# both file names are required
unless ( $infilename ) { $help = 1 }
unless ( $outfilename ) { $help = 1 }
# when the FASTA output goes to STDOUT, keep status messages out of it
if ( $outfilename eq "-" ) { $quiet = 1 }

 
############################################################
# print help screen
############################################################
# Print the usage text and stop. $help is set by --help and also when a
# required parameter is missing, hence the non-zero exit status.
if ( $help )
  {
    print "$prognopath version $ver

This program takes an aligned multi-FASTA file as input, and generates
a consensus sequence based on the most abundant nucleotide

Required parameters:
  --infile=xxx      input multi-FASTA file name
  --outfile=xxx     output file name, consensus will be added as the
                    first sequence
                    filenames can be .gz compressed, or use - for STDIN/STDOUT
Optional parameters:
  --majority=xxx    if second most abundant base is within
                    this percentage of most abundant, then
                    generate a degenerate base. Default = $defaultmajority,
                    use --majority=0 to only select most abundant base
  --gapfraction=xxx value between 0 and 1, if >= this fraction of sequences have
                    gap character, make consensus gap also. Default = $defaultgapfraction
  --nodegen         do not use degenerate base codes in the consensus
                    sequence. If equal frequencies, choose one at random
  --consensusonly   only output the consensus sequence
  --unknown=xxx     characters representing unknown bases which are ignored
                    for creating the consensus
                    default = \"$defaultunknown\"
  --wrap=xxx        wrap output sequences to this length,
                    default is no wrap
  --report[=xxx]    print a summary of base frequencies
                    optional value is output file name, default is STDOUT
  --table=xxx       save a table with all variant locations relative to
                    consensus: 'D' for deletion, 'I' for insertion,
                    'P' for polymorphism, '.' for same as consensus
  --dropinvariant   all locations where all sequences are identical are
                    eliminated, and sequences will therefore be shortened
  --help            print this screen
  --quiet           only print error messages
  --debug           print extra debugging information
";
    exit 1;
  } # if ( $help )


############################################################
# generate unknown base hash
############################################################
{
# every single character of $unknown becomes a key of %unk
$unk{$_} = 1 foreach split( //, $unknown );
# snapshot of the lookup keys as they are before anything is added below
my @basekeys = keys( %degen );
foreach my $uchar ( keys %unk )
  {
    # an unknown character on its own maps to itself
    $degen{$uchar} = $uchar;
    # every pre-existing combination gets a twin that also contains the
    # unknown character and maps to the same result
    foreach my $key ( @basekeys )
      {
        # characters in key must be in alphabetical order
        my @chars = sort( split( //, $uchar.$key ) );
        $degen{ join( '', @chars ) } = $degen{$key};
      } # for $key
  } # for $uchar
}


############################################################
# load input file
############################################################
debugmsg ( "Loading input file \"$infilename\"" );
# second argument: strip line ends so each sequence is one continuous string
@seq = loadfasta ( $infilename, 1 );
# do analysis in all upper case
$_->{seq} = uc( $_->{seq} ) foreach @seq;
print "Loaded ".commify(scalar @seq)." sequences\n" unless $quiet;


############################################################
# error checking
############################################################
# A consensus needs at least two sequences, and an alignment requires all
# of them to have the same length.
unless ( @seq ) { die "Error, no sequences loaded from \"$infilename\"\n"; }
unless ( ( scalar @seq ) > 1 ) { die "Error, only one sequence loaded from \"$infilename\"\n"; }
my $lzero = length( $seq[0]->{seq} );
my $okay = 1;
# compare every sequence to the first, reporting all mismatches before exiting
for my $i ( 1..$#seq )
  {
    my $li = length( $seq[$i]->{seq} );
    if ( $li != $lzero )
      {
        $okay = 0;
        # errors go to STDERR, like the die messages above
        print STDERR "Error, length of sequence #".($i+1).":$li is"
            . " not equal to length of sequence #1:$lzero\n";
      }
  }
unless ( $okay ) { exit 1; }


############################################################
# optional --dropinvariant
############################################################
if ( $dropinvariant )
  {
    debugmsg ( "Drop invariant" );
    # each rebuilt record starts with its header only; sequence is appended below
    my @kept = map { +{ hdr => $_->{hdr} } } @seq;
    my $nkept = 0;
    my $alnlen = length( $seq[0]->{seq} );
    foreach my $col ( 0 .. $alnlen - 1 )
      {
        # the character of every sequence at this alignment column
        my @column = map { substr( $_->{seq}, $col, 1 ) } @seq;
        # the column is invariant when no character differs from the first one
        next unless grep { $_ ne $column[0] } @column;
        $kept[$_]->{seq} .= $column[$_] foreach 0 .. $#column;
        $nkept++;
      } # for $col
    die "Error, all sequences were identical and --dropinvariant was specified\n"
      unless $nkept;
    print "Sequence length after removal of invariant sites is ".commify($nkept)."\n"
      unless $quiet;
    @seq = @kept;
  } # if ( $dropinvariant )


############################################################
# main processing loop
############################################################
{
debugmsg ( "Counting bases" );
# Tally every character of every sequence per column. Gap and unknown
# characters are tallied as well, the gap test and --report both use them.
foreach my $aseq ( @seq )
  {
    for my $i ( 0 .. length($aseq->{seq})-1 )
      {
        my $base = substr($aseq->{seq}, $i, 1);
        $counts[$i]->{$base}++;
        $allbases{$base}++;
      }
  } # for $aseq

debugmsg ( "Generating consensus" );
my %consseq = ( 'seq' => '', 'hdr' => 'Consensus' );
my $ngapsincons = 0;
for my $i ( 0 .. $#counts )
  {
    # rank bases by frequency, most abundant first
    my @byfreq = sort { $counts[$i]->{$b} <=> $counts[$i]->{$a} } keys %{$counts[$i]};

    my $totalbases = 0;
    foreach my $base ( keys %{$counts[$i]} )
      { $totalbases += $counts[$i]->{$base}; }

    ### Gap to consensus
    if ( ( ( $counts[$i]->{'-'} // 0 ) / $totalbases ) >= $gapfraction )
      {
        $consseq{seq} .= '-';
        $ngapsincons++;
      }
    else
      {

        ### code for degenerate
        my @pct;  # percentage of each entry of @byfreq
        my @deg;  # selected characters, still in descending frequency order
        for my $j ( 0 .. $#byfreq )
          {
            $pct[$j] = $counts[$i]->{$byfreq[$j]} * 100 / $totalbases;
            # skip gaps in consensus, --gapfraction does that
            unless ( $unk{$byfreq[$j]} )
              {
                unless ( @deg )  # first entry in list
                  { push ( @deg, $byfreq[$j] ); }
                else
                  {
                    # keep adding while within $majority percentage points
                    # of the most abundant character of the column
                    if ( ( $pct[$j] + $majority ) >= $pct[0] )
                      { push ( @deg, $byfreq[$j] ); }
                    else
                      { last; }
                  } # else after first entry
              } # unless gap character
          } # for $j
        my $degenstr = join ( '', sort ( @deg ) );  # alphabetical to simplify degenerate hash
        if ( $nodegen and @deg )
          {
            # @deg is ordered by frequency, so take the most abundant
            # character that is a real base; fall back to the most abundant
            # character when every candidate is a placeholder
            my ( $best ) = grep { $_ !~ m/[\?N\-]/ } @deg;
            $degenstr = ( defined $best ) ? $best : $deg[0];
          }
        my $db = $degen{$degenstr};
        if ( ( defined $db ) and $unk{$db} ) { $ngapsincons++; }
        unless ( $db )
          {
            $db = "N";
            # STDERR, because STDOUT may be carrying the FASTA output
            print STDERR "Warning, cannot lookup degenerate base for \"$degenstr\" position $i totalbases=$totalbases, converting to \"N\"\n";
          }
        $consseq{seq} .= $db;
      } # else not >= $gapfraction
  } # for $i
# the consensus becomes the first sequence
unshift ( @seq, \%consseq );
unless ( $quiet ) { print commify($ngapsincons)." gaps in consensus\n"; }
}


############################################################
# save output file
############################################################
debugmsg ( "Saving output file \"$outfilename\"" );
# --consensusonly only limits what is written to the FASTA file. @seq itself
# is left complete, because --table further down compares every sequence to
# the consensus in $seq[0].
my @tosave = $consensusonly ? ( $seq[0] ) : @seq;
savefasta ( $outfilename, \@tosave, $wrap );


############################################################
# --report
############################################################
if ( defined $report )
  {
    # --report without a value means write to STDOUT
    $report = '-' unless $report;
    my $OUTF = stdopen ( ">", $report );
    # one column per character seen anywhere in the alignment
    my @columns = sort keys %allbases;
    print {$OUTF} join( "\t", 'Pos', @columns ), "\n";
    # one row per alignment position, numbered from 1
    foreach my $pos ( 0 .. $#counts )
      {
        my @row = map { $counts[$pos]->{$_} // 0 } @columns;
        print {$OUTF} join( "\t", $pos+1, @row ), "\n";
      }
    stdclose ( $OUTF );
  } # if $report


############################################################
# --table
############################################################
if ( $table )
  {
    my $OUTF = stdopen ( ">", $table );
    # header line, sequence names are the headers up to the first white space
    my @ids;
    foreach my $aseq ( @seq )
      {
        ( my $id = $aseq->{hdr} ) =~ s/\s.*$//;
        push ( @ids, $id );
      }
    print $OUTF join( "\t", 'Start', 'End', @ids ), "\n";

    # table body - record SNPs relative to consensus, and deletions
    # $seq[0] is the consensus. Each sequence is scanned left to right and
    # every run of consecutive columns with the same state becomes one event.
    my %events;  # 3-D hash, first index is start, second is end, third is genotype index, value is 'D', 'I' or 'P'
    for my $i ( 0 .. $#seq )
      {
        my $status = '.';  # state of the run currently being extended
        my $coord;         # zero-based start of that run, undefined for '.'
        my $len = length($seq[$i]->{seq});
        for my $j ( 0 .. $len-1 )
          {
            my $nt = substr($seq[$i]->{seq},$j,1);
            my $connt = substr($seq[0]->{seq},$j,1);
            # state of this column, tested in this order of precedence
            my $state = ( $nt eq $connt ) ? '.'    # same as consensus
                      : ( $nt eq '-' )    ? 'D'    # gap in the sequence only
                      : ( $connt eq '-' ) ? 'I'    # gap in the consensus only
                      :                     'P';   # different character
            next if ( $state eq $status );  # run continues
            # state changed, close the previous run at the previous column
            if ( $status ne '.' )
              { $events{$coord}->{$j-1}->[$i] = $status; }
            $status = $state;
            $coord = ( $state eq '.' ) ? undef : $j;
          } # for $j
        # a run still open at the end of the sequence ends at the last column
        if ( $status ne '.' )
          { $events{$coord}->{$len-1}->[$i] = $status; }
      } # for $i

    # one row per distinct start/end pair, coordinates printed one-based
    for my $i ( sort{$a<=>$b} keys %events )  # start coordinate
      {
        for my $j ( sort{$a<=>$b} keys %{$events{$i}} )  # end coordinate
          {
            print $OUTF join( "\t", $i+1, $j+1 );
            for my $k ( 0 .. $#seq )
              { print $OUTF "\t", ( $events{$i}->{$j}->[$k] // '.' ); }
            print $OUTF "\n";
          } # for $j
      } # for $i
    # close the handle so that write errors are detected, as --report does
    stdclose ( $OUTF );
  } # if $table


############################################################
# cleanup and end program
############################################################
# final status message, then a successful exit status
print "$0 Done\n" unless $quiet;
exit 0;


############################################################
sub debugmsg { my ( $text, $noreturn, $nolinenum ) = @_;
############################################################
# Print $text, but only when the --debug flag ($debug) is set.
#   $text      the message
#   $noreturn  if true, no line end is added after the message
#   $nolinenum if true, the message is not prefixed with the line number
#              of the calling statement
  if ( $debug )
    {
      # third element of caller() is the line number of the call
      my $line = ( caller(0) )[2];
      $text = "Line $line: " . $text unless $nolinenum;
      $text .= "\n" unless $noreturn;
      print $text;
    } # if ( $debug )
} # sub debugmsg


###############################################################
sub timestr {
###############################################################
# Return a local time formatted as "YYYY/MM/DD HH:MM". The optional
# argument is an epoch time; the current time is used when it is
# missing or false.
  my $when = shift || time;
  my ( $min, $hour, $mday, $mon, $year ) = ( localtime($when) )[1..5];
  # localtime returns the year minus 1900 and a zero-based month
  return sprintf( "%04d/%02d/%02d %02d:%02d", $year+1900, $mon+1, $mday, $hour, $min );
} # sub timestr


###############################################################
sub commify {
###############################################################
# Return the argument with a comma inserted between every group of
# three digits of its leading integer part, e.g. 1234567 -> 1,234,567
# http://perldoc.perl.org/perlfaq5.html#How-can-I-output-my-numbers-with-commas
  my $number = shift;
  # every pass splits the last three digits off the leading run of digits;
  # stop when fewer than four digits are left in that run
  while ( $number =~ s/^([-+]?\d+)(\d{3})/$1,$2/ ) { }
  return $number;
} # commify


###############################################################
sub stdopen { my ( $mode, $filename, $extratext ) = @_;
###############################################################
# a replacement for the three-parameter open which also allows
# the use of "-" as the file name to mean STDIN or STDOUT
#   $mode       open mode: "<", ">", ">>", their "+" variants, "-|" or "|-"
#   $filename   file name, or "-" for STDIN/STDOUT; a name ending in .gz
#               or .bz2 is read or written through the matching
#               compression program
#   $extratext  optional description of the file, used in error messages
# Returns the opened file handle, dies if the file can not be opened or
# the mode is not supported.
  my $fh;  # the file handle
  if ( $filename eq "-" )  # only exact match to "-" has special meaning
    {
      if ( $mode =~ m/>/ )
        { $fh = *STDOUT }
      else
        { $fh = *STDIN }
    }
  else
    {
      # supplemental passed text for error messages, need one more space
      if ( defined $extratext )
        { $extratext .= " " }
      else
        { $extratext = "" }

      my $text;  # this is only used for error message
      if ( $mode =~ m/^\+?>>/ )  # ">>" or "+>>"
        { $text = "append" }
      elsif ( $mode =~ m/^\+?>/ )  # ">" or "+>"
        { $text = "output" }
      elsif ( $mode =~ m/^\+?</ )  # "<" or "+<"
        { $text = "input" }
      elsif ( $mode eq "-|" )
        { $text = "piped input" }
      elsif ( $mode eq "|-" )
        { $text = "piped output" }
      else
        { die "Error, unsupported file mode \"$mode\" specified to stdopen( $mode, $filename, $extratext )\n"; }

      # if file name ends in ".gz", gzip compression is assumed, and handle it transparently
      # NOTE: the command below is a single string handed to a piped open,
      # so the file name is interpreted by the shell; names that contain
      # double quotes or other shell metacharacters are not protected
      if ( $filename =~ m/\.gz$/ )
        {
          if ( $mode =~ m/^>$/ ) # output mode
            { $mode = "|-"; $filename = "gzip -c > \"$filename\""; }
          elsif ( $mode =~ m/^<$/ ) # input mode
            { $mode = "-|"; $filename = "gunzip -c \"$filename\""; }
          elsif ( $mode =~ m/^>>$/ ) # append mode
            { $mode = "|-"; $filename = "gzip -c >> \"$filename\""; }
          else
            { die "Error, can't handle gzip compression with mode \"$mode\" for file \"$filename\"\n"; }
        } # if gzip compressed file
      elsif ( $filename =~ m/\.bz2$/ )
        {
          if ( $mode =~ m/^>$/ ) # output mode
            { $mode = "|-"; $filename = "bzip2 -c > \"$filename\""; }
          elsif ( $mode =~ m/^<$/ ) # input mode
            { $mode = "-|"; $filename = "bunzip2 -c \"$filename\""; }
          else
            { die "Error, can't handle bzip2 compression with mode \"$mode\" for file \"$filename\"\n"; }
        }
      open ( $fh, $mode, $filename ) or die ( "Error opening ${extratext}file \"$filename\" for $text: $!\n" );
    }
  # return the opened file handle to the caller
  return $fh;
} # sub stdopen


###############################################################
sub stdclose { my ( $fh ) = @_;
###############################################################
# Counterpart of the built-in close for handles that may be one of the
# standard streams: those are left open, anything else is closed and
# a failure to close is fatal.

  # file numbers 0, 1 and 2 are stdin, stdout and stderr
  if ( fileno($fh) > 2 )
    { close ( $fh ) or die ( "Error closing file handle: $!\n" ); }

} # sub stdclose


###############################################################
sub loadfasta { my ( $infilename, $stripreturns ) = @_;
###############################################################
# load a single FASTA file into an array of hashes to separate header and sequence portion
# e.g. $seq[3]->{hdr} contains the header of the fourth sequence, $seq[3]->{seq} contains the sequence
# and the ">" is stripped off of the header
#   $infilename    file to read, passed on to stdopen
#   $stripreturns  if true, line ends are removed from the sequence lines
# Returns the list of records. Lines starting with "#" are ignored, and
# so are blank lines before the first header; any other text before the
# first header is a fatal error. A header that is not followed by any
# sequence line yields an empty string as its sequence.
  my @seq;
  my $shownname = $infilename;  # unmodified name, for error messages
  my $mode = "<";
  if ( $infilename =~ m/\.gz$/ )
    { $mode = "-|"; $infilename = "gunzip -c \"$infilename\""; }
  my $INF = stdopen ( $mode, $infilename, 'input FASTA' );
  while ( my $aline = <$INF> )
    {
      if ( $aline =~ m/^#/ ) # ignore comment lines
        {  }
      elsif ( $aline =~ m/^>/ )
        {
          $aline =~ s/[\r\n]//g;
          $aline =~ s/^>//;
          # start a new record, the sequence is appended line by line
          push ( @seq, { 'hdr' => $aline, 'seq' => '' } );
        }
      elsif ( @seq )
        {
          if ( $stripreturns ) { $aline =~ s/[\r\n]//g; }
          $seq[-1]->{seq} .= $aline;
        }
      elsif ( $aline =~ m/\S/ )
        {
          # there is no record yet that this text could belong to
          die "Error, text found before the first FASTA header in \"$shownname\"\n";
        }
    } # while ( my $aline = <$INF> )
  stdclose ( $INF );
  return @seq;
} # sub loadfasta


###############################################################
sub savefasta { my ( $outfilename, $arrayref, $addreturns ) = @_;
###############################################################
  # Write the records referenced by $arrayref, in the structure produced
  # by loadfasta, to $outfilename. Undefined array elements are skipped.
  # $addreturns is the wrap length for the sequence lines; undefined or
  # zero means the sequence is written without wrapping.
  my $OUTF = stdopen ( ">", $outfilename, 'output FASTA' );
  for my $record ( grep { defined $_ } @{$arrayref} )
    {
      my $body = $record->{"seq"};
      # insert a line end after every complete chunk of $addreturns characters
      $body =~ s/(.{$addreturns})/$1\n/g if ( $addreturns );
      # the sequence always ends with a line end, without doubling one
      # that wrapping or the data itself already supplied
      $body .= "\n" unless ( $body =~ m/\n$/ );
      print {$OUTF} ">", $record->{"hdr"}, "\n", $body;
    } # for my $record
  stdclose ( $OUTF );
} # sub savefasta


###############################################################
sub indexfasta { my ( $seqref, $stripheaders ) = @_;
###############################################################
# Build a lookup from header to array position for the records
# referenced by $seqref. When $stripheaders is true, only the part of
# each header before the first white space is used as the key.
# Returns a hash reference; dies if two records produce the same key.
  my %position_of;
  my $pos = 0;
  foreach my $entry ( @{$seqref} )
    {
      my $name = $entry->{'hdr'};
      $name =~ s/\s.*$// if ( $stripheaders );
      die "Error, duplicate header \"$name\" while indexing\n"
        if ( defined $position_of{$name} );
      $position_of{$name} = $pos++;
    }
  return \%position_of;
} # sub indexfasta


# eof