-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d2ef60e
commit f9b84cd
Showing
3 changed files
with
211 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
use Getopt::Long qw(:config pass_through no_ignore_case); | ||
|
||
my $outfile = pop @ARGV; | ||
my $qryfile = pop @ARGV; | ||
my $reffile = pop @ARGV; | ||
|
||
my $num_cores = 4; | ||
my $num_lines = 10000; | ||
|
||
GetOptions ( | ||
"cpus|cores:i" => \$num_cores, | ||
"lines:i" => \$num_lines, | ||
); | ||
|
||
my $tmpdir = "$outfile\_" . int(rand(10000)); | ||
mkdir $tmpdir or die $!; | ||
my $parallel_command = ""; | ||
system ("fastaqual_multiline_to_singleline.pl $qryfile | split -l $num_lines -a 4 - $tmpdir/split_") and die $!; | ||
system ("parallel -j $num_cores blat @ARGV $reffile {} {}.psl ::: $tmpdir/split_*") and die $!; | ||
system ("cat $tmpdir/*.psl >$outfile; rm -rf $tmpdir") and die $!; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
#!/usr/bin/perl | ||
|
||
use strict; | ||
use warnings; | ||
use Getopt::Long qw(:config pass_through no_ignore_case); | ||
|
||
my ($fastafile,$sort,$numfasta,$length,$prefix,$regexp,$headerfile,$includefile,$excludefile,$delimiter,$case) = ("-","S","","","","","","",""," ",""); | ||
GetOptions ( | ||
"fastafile:s" => \$fastafile, | ||
"sort:s" => \$sort, | ||
"numfasta:i" => \$numfasta, | ||
"length:i" => \$length, | ||
"prefix:s" => \$prefix, | ||
"regexp:s" => \$regexp, | ||
"includefile:s" => \$includefile, | ||
"excludefile:s" => \$excludefile, | ||
"delimiter:s" => \$delimiter, | ||
"case:s" => \$case, | ||
); | ||
|
||
$case = uc(substr($case,0,1)) if $case; | ||
|
||
my (%include_headers, %exclude_headers); | ||
if ($includefile) | ||
{ | ||
open FILE,"<$includefile" or die "Couldn't open includes file $includefile\n"; | ||
while (<FILE>) | ||
{ | ||
chomp; | ||
if (/^>?(\S+)\s(\d+)\s(\d+)\b/ or /^>?(\S+)_(\d+)_(\d+)\b/) | ||
{ | ||
push @{$include_headers{$1}}, ($2 <= $3) ? [$2,$3] : [$3,$2]; | ||
} elsif (/^>?(\S+)/) | ||
{ | ||
$include_headers{$1}=1 | ||
} | ||
} | ||
close FILE; | ||
} | ||
if ($excludefile) | ||
{ | ||
open FILE,"<$excludefile" or die "Couldn't open excludes file $excludefile\n"; | ||
while (<FILE>) | ||
{ | ||
if (/^>?(\S+)/) | ||
{ | ||
$exclude_headers{$1}=1 | ||
} | ||
} | ||
close FILE; | ||
} | ||
|
||
my $seqs = &fastafile2hash($fastafile,"N",$sort); | ||
my @sortkeys; | ||
|
||
if (uc($sort) eq "R") { | ||
@sortkeys = sort {$$seqs{$a}{order} <=> $$seqs{$b}{order}} keys %{$seqs}; | ||
} elsif (uc($sort) eq "A") { | ||
@sortkeys = sort keys %{$seqs}; | ||
} else { | ||
@sortkeys = sort {length($$seqs{$b}{seq}) <=> length($$seqs{$a}{seq})} keys %{$seqs}; | ||
} | ||
|
||
my $num_printed = 0; | ||
for my $chrontig (@sortkeys) | ||
{ | ||
last if $numfasta and $numfasta == $num_printed; | ||
if ($$seqs{$chrontig}{seq}=~ /\d+/) { | ||
# it's a qual file | ||
my @qual = split(/\s+/,$$seqs{$chrontig}{seq}); | ||
next if $length and scalar @qual < $length; | ||
} | ||
else { | ||
next if $length and length($$seqs{$chrontig}{seq}) < $length; | ||
} | ||
next if $excludefile and exists $exclude_headers{$chrontig}; | ||
next if $includefile and not exists $include_headers{$chrontig}; | ||
my $toprint; | ||
if (%include_headers and $include_headers{$chrontig} =~ /array/i) | ||
{ | ||
for my $interval (@{$include_headers{$chrontig}}) | ||
{ | ||
$toprint = ">$prefix$chrontig"; | ||
my ($start, $stop) = @{$interval}; | ||
$toprint .= "$delimiter$start$delimiter$stop"; | ||
$toprint .= " $$seqs{$chrontig}{desc}" if $$seqs{$chrontig}{desc}; | ||
$toprint .= "\n" . substr($$seqs{$chrontig}{seq}, $start - 1, $stop - $start + 1) . "\n"; | ||
next if $regexp and $toprint !~ /$regexp/; | ||
print $toprint; | ||
$num_printed++; | ||
} | ||
} | ||
else | ||
{ | ||
$toprint = ">$prefix$chrontig"; | ||
$toprint .= " $$seqs{$chrontig}{desc}" if $$seqs{$chrontig}{desc}; | ||
$$seqs{$chrontig}{seq} = lc($$seqs{$chrontig}{seq}) if $case eq "L"; | ||
$$seqs{$chrontig}{seq} = uc($$seqs{$chrontig}{seq}) if $case eq "U"; | ||
$$seqs{$chrontig}{seq} =~ tr/[A-Z][a-z]/[a-z][A-Z]/ if $case eq "I"; | ||
$toprint .= "\n$$seqs{$chrontig}{seq}\n"; | ||
next if $regexp and $toprint !~ /$regexp/; | ||
print $toprint; | ||
$num_printed++; | ||
} | ||
} | ||
|
||
############################################################################# | ||
|
||
sub fastafile2hash | ||
{ | ||
my $fastafile = shift @_; | ||
my $changecase = "N"; | ||
my $order = "S"; # S = same as input, or R = random | ||
$changecase = substr(uc(shift @_),0,1) if @_; | ||
$order = substr(uc(shift @_),0,1) if @_; | ||
my %sequences; | ||
my $fh = &read_fh($fastafile); | ||
my $seqid; | ||
my $seq_counter; | ||
while (<$fh>) | ||
{ | ||
if (/^>(\S+)(.*)/) { | ||
$seqid = $1; | ||
$sequences{$seqid}{desc} = $2; | ||
$sequences{$seqid}{order} = $order eq "S" ? $seq_counter++ : rand; | ||
} | ||
else { | ||
if (/\d/) { | ||
chomp($sequences{$seqid}{seq} .= " $_"); # add space to sep qual values | ||
$sequences{$seqid}{seq} =~ s/^\s+//; | ||
$sequences{$seqid}{seq} =~ s/\s+$//; | ||
next; | ||
} | ||
chomp($sequences{$seqid}{seq} .= lc($_)) if $changecase eq "L"; | ||
chomp($sequences{$seqid}{seq} .= uc($_)) if $changecase eq "U"; | ||
chomp($sequences{$seqid}{seq} .= $_ ) if $changecase eq "N"; | ||
} | ||
} | ||
return \%sequences; | ||
} | ||
|
||
############################################################################# | ||
|
||
sub read_fh { | ||
my $filename = shift @_; | ||
my $filehandle; | ||
if ($filename =~ /gz$/) { | ||
open $filehandle, "gunzip -dc $filename |" or die $!; | ||
} | ||
else { | ||
open $filehandle, "<$filename" or die $!; | ||
} | ||
return $filehandle; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#!/usr/bin/perl | ||
|
||
use strict; | ||
use warnings; | ||
|
||
# takes as input file with: | ||
# chrontigname_st_en or chrontigname\tst\ten, and gives back chrontig with merged intervals: | ||
|
||
my %maskstrings; | ||
|
||
while (<>) { | ||
next unless /^>?\s*(\S+)[\t_ ](\d+)[\t_ ](\d+)\s*$/; | ||
|
||
my ($chrontig, $st, $en) = ($1, $2, $3); | ||
($st, $en) = ($en, $st) if $st > $en; | ||
|
||
if (exists $maskstrings{$chrontig}) { | ||
$maskstrings{$chrontig} .= "0" x ($en - length($maskstrings{$chrontig})) | ||
} | ||
else { | ||
$maskstrings{$chrontig} .= "0" x $en | ||
} | ||
substr($maskstrings{$chrontig}, $st - 1, $en - $st + 1, "1" x ($en - $st + 1)); | ||
} | ||
|
||
for my $id (keys %maskstrings) | ||
{ | ||
my $mask = $maskstrings{$id}; | ||
while ($mask =~ /(1+)/g) | ||
{ | ||
print $id . "\t" . (pos($mask) - length($1) + 1) . "\t" . pos($mask) . "\n"; | ||
} | ||
} |