-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[#6] Added convert_to_impute2_ids.pl; Improves handling of chr X; Req…
…uires modified legend files with chr col added; Fixes minor bug in convert_to_1000g_ids.pl
- Loading branch information
cfncluster user
committed
Mar 20, 2020
1 parent
43217c0
commit 4156c91
Showing
1 changed file
with
267 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
#!/usr/bin/perl | ||
|
||
use strict; | ||
use warnings; | ||
use Getopt::Long; | ||
use constant FALSE => 0; | ||
use constant TRUE => 1; | ||
|
||
# Autoflush STDOUT | ||
select((select(STDOUT), $|=1)[0]); | ||
|
||
# Assign defaults for legend columns | ||
my $legendIdCol = 0; | ||
my $legendChrCol = 1; | ||
my $legendPosCol = 2; | ||
my $legendA1Col = 3; | ||
my $legendA2Col = 4; | ||
|
||
my $fileIn = ''; | ||
my $fileOut = ''; | ||
my $legend = ''; | ||
my $fileInHeader = -1; | ||
my $fileInIdCol = -1; | ||
my $fileInChrCol = -1; | ||
my $fileInPosCol = -1; | ||
my $fileInA1Col = -1; | ||
my $fileInA2Col = -1; | ||
my $fileInMonomorphicAllele = ""; | ||
|
||
GetOptions ( | ||
'file_in=s' => \$fileIn, | ||
'file_in_header=i' => \$fileInHeader, | ||
'file_in_id_col=i' => \$fileInIdCol, | ||
'file_in_chr_col=i' => \$fileInChrCol, | ||
'file_in_pos_col=i' => \$fileInPosCol, | ||
'file_in_a1_col=i' => \$fileInA1Col, | ||
'file_in_a2_col=i' => \$fileInA2Col, | ||
'file_in_monomorphic_allele=s' => \$fileInMonomorphicAllele, | ||
'legend_with_chr=s' => \$legend, | ||
'file_out=s' => \$fileOut, | ||
); | ||
|
||
if ($fileIn eq '') { die "Please provide an input file\n"; } | ||
if ($fileInIdCol eq -1) { die "Please provide the ID column in the input file\n"; } | ||
if ($fileInChrCol eq -1) { die "Please provide the chromosome column in the input file\n"; } | ||
if ($fileInPosCol eq -1) { die "Please provide the position column in the input file\n"; } | ||
if ($fileInA1Col eq -1) { die "Please provide the allele 1 column in the input file\n"; } | ||
if ($fileInA2Col eq -1) { die "Please provide the allele 2 column in the input file\n"; } | ||
if ($fileInMonomorphicAllele eq '') { die "Please provide the monomorphic allele code used in the input file\n"; } | ||
if ($legend eq '') { die "Please provide a legend file\n"; } | ||
if ($fileOut eq '') { die "Please provide an output file\n"; } | ||
|
||
sub flip { | ||
|
||
my ($allele, $monomorphicAllele) = @_; | ||
my $alleleComplement = ""; | ||
|
||
if (uc($allele) eq uc($monomorphicAllele)) { | ||
|
||
$alleleComplement = $allele; | ||
|
||
} else { | ||
|
||
my %flipMap = ( | ||
"A" => "T", | ||
"T" => "A", | ||
"C" => "G", | ||
"G" => "C", | ||
"-" => "-" | ||
); | ||
|
||
foreach my $nt (reverse(split("", $allele))) { | ||
if (!exists($flipMap{uc($nt)})) { | ||
$alleleComplement = "error"; | ||
last; | ||
} else { | ||
$alleleComplement .= $flipMap{uc($nt)}; | ||
} | ||
} | ||
|
||
} | ||
|
||
return $alleleComplement; | ||
|
||
} | ||
|
||
my %xChr = ( | ||
"23" => 1, | ||
"25" => 1, | ||
"X" => 1, | ||
"XY" => 1, | ||
"X_NONPAR" => 1, | ||
"X_PAR" => 1, | ||
"X_PAR1" => 1, | ||
"X_PAR2" => 1 | ||
); | ||
my %variants = (); | ||
my %positionVariantCount = (); | ||
my %rsIndels = (); | ||
my %ridAlleles = ( | ||
'R' => 1, | ||
'I' => 1, | ||
'D' => 1 | ||
); | ||
my %indelRegExPatterns = ( | ||
'<DEL>' => 1, | ||
'^!.*' => 1, | ||
'NA' => 1 | ||
); | ||
if (exists($indelRegExPatterns{$fileInMonomorphicAllele})) { | ||
delete $indelRegExPatterns{$fileInMonomorphicAllele}; | ||
} | ||
|
||
# Read legend file | ||
print "Reading legend file...\n"; | ||
if ($legend =~ /\.gz$/) { | ||
open(LEGEND, "gunzip -c $legend |") or die "Cannot open ".$legend."\n"; | ||
} else { | ||
open(LEGEND, $legend) or die "Cannot open ".$legend."\n"; | ||
} | ||
my $junk = <LEGEND>; | ||
while (<LEGEND>) { | ||
|
||
chomp; | ||
my @F = split; | ||
|
||
# Convert alleles to uppercase | ||
$F[$legendA1Col] = uc($F[$legendA1Col]); | ||
$F[$legendA2Col] = uc($F[$legendA2Col]); | ||
|
||
# Increment count for position of variant by 1 in %positionVariantCount | ||
if (exists($positionVariantCount{$F[$legendPosCol]})) { | ||
$positionVariantCount{$F[$legendPosCol]}++; | ||
} else { | ||
$positionVariantCount{$F[$legendPosCol]} = 1; | ||
} | ||
|
||
# Add variant to %variants | ||
$variants{join('_', $F[$legendChrCol], $F[$legendPosCol], $F[$legendA1Col], $F[$legendA2Col])} = $F[$legendIdCol]; | ||
|
||
# Add monomorphic versions of variants to %variants | ||
$variants{join('_', $F[$legendChrCol], $F[$legendPosCol], $F[$legendA1Col], $fileInMonomorphicAllele)} = $F[$legendIdCol]; | ||
$variants{join('_', $F[$legendChrCol], $F[$legendPosCol], $F[$legendA2Col], $fileInMonomorphicAllele)} = $F[$legendIdCol]; | ||
|
||
# If an indel, add an entry for the alternative notation that uses "-" for one of the alleles to %variants | ||
if ($F[5] eq 'Biallelic_INDEL' || ($F[5] eq 'Multiallelic_INDEL' && (length($F[$legendA1Col]) == 1 || length($F[$legendA2Col]) == 1))) { | ||
my @alleles = ( | ||
$F[$legendA1Col], | ||
$F[$legendA2Col] | ||
); | ||
foreach my $allele (@alleles) { | ||
$allele =~ s/^.//; | ||
$allele = ($allele eq '') ? '-' : $allele; | ||
} | ||
$variants{join('_', $F[$legendChrCol], ($F[$legendPosCol] + 1), @alleles)} = $F[$legendIdCol]; | ||
} | ||
|
||
# If an indel that has an rs ID, add to %rsIndels if rsID is unique | ||
if ($F[5] eq 'Biallelic_INDEL' || ($F[5] eq 'Multiallelic_INDEL' && (length($F[$legendA1Col]) > 1 || length($F[$legendA2Col]) > 1))) { | ||
if ($F[$legendIdCol] =~ /^(rs\d+)/) { | ||
my $rsId = $1; | ||
if (exists($rsIndels{$rsId})) { | ||
# delete from hash because not unique | ||
delete $rsIndels{$rsId}; | ||
} else { | ||
$rsIndels{$rsId} = $F[$legendIdCol]; | ||
} | ||
} | ||
} | ||
|
||
} | ||
close LEGEND; | ||
|
||
|
||
# Process input file | ||
print "Processing input file...\n"; | ||
open(FILE_OUT, ">".$fileOut) or die "Cannot open ".$fileOut."\n"; | ||
if ($fileIn =~ /\.gz$/) { | ||
open(FILE_IN, "gunzip -c $fileIn |") or die "Cannot open ".$fileIn."\n"; | ||
} else { | ||
open(FILE_IN, $fileIn) or die "Cannot open ".$fileIn."\n"; | ||
} | ||
# Print header | ||
for (my $i=0; $i<$fileInHeader; $i++) { | ||
my $header = <FILE_IN>; | ||
$header =~ s/ /\t/g; | ||
print FILE_OUT $header; | ||
} | ||
while (<FILE_IN>) { | ||
|
||
chomp; | ||
my @F = split; | ||
my $chr = exists($xChr{$F[$fileInChrCol]}) ? "X" : $F[$fileInChrCol]; | ||
my $pos = $F[$fileInPosCol]; | ||
my @alleles = ( | ||
$F[$fileInA1Col], | ||
$F[$fileInA2Col] | ||
); | ||
|
||
# Standardize alleles | ||
foreach my $allele (@alleles) { | ||
$allele = uc($allele); | ||
foreach my $pattern (keys(%indelRegExPatterns)) { | ||
$allele =~ s/$pattern/-/; | ||
} | ||
} | ||
if (exists($ridAlleles{$alleles[0]}) && exists($ridAlleles{$alleles[1]}) && $F[$fileInIdCol] =~ /^\S+[_:]\d+[_:](\S*)[_:](\S*)$/) { | ||
$alleles[0] = ($1 ne '') ? $1 : '-'; | ||
$alleles[1] = ($2 ne '') ? $2 : '-'; | ||
} | ||
|
||
# Get allele complements | ||
my @alleleComplements = (); | ||
for (my $i=0; $i<2; $i++) { | ||
$alleleComplements[$i] = flip($alleles[$i], $fileInMonomorphicAllele); | ||
} | ||
|
||
my @possibleKeys = ( | ||
join("_", $chr, $pos, $alleles[0], $alleles[1]), | ||
join("_", $chr, $pos, $alleles[1], $alleles[0]), | ||
join("_", $chr, $pos, $alleleComplements[0], $alleleComplements[1]), | ||
join("_", $chr, $pos, $alleleComplements[1], $alleleComplements[0]) | ||
); | ||
|
||
my $createId = 1; | ||
my $monomorphic = ($alleles[0] eq $fileInMonomorphicAllele || $alleles[1] eq $fileInMonomorphicAllele); | ||
my $rid = (exists($ridAlleles{$alleles[0]}) && exists($ridAlleles{$alleles[1]})); | ||
if ($rid) { | ||
if ($F[$fileInIdCol] =~ /^(rs\d+)/ && exists($rsIndels{$1})) { | ||
$F[$fileInIdCol] = $rsIndels{$1}; | ||
$createId = 0; | ||
} | ||
} elsif (!$monomorphic || ($monomorphic && exists($positionVariantCount{$pos}) && $positionVariantCount{$pos} == 1)) { | ||
foreach my $possibleKey (@possibleKeys) { | ||
if (exists($variants{$possibleKey})) { | ||
$F[$fileInIdCol] = $variants{$possibleKey}; | ||
$createId = 0; | ||
} | ||
} | ||
} | ||
|
||
if ($createId) { | ||
|
||
if (!($alleles[0] eq 'A' || $alleles[0] eq 'C' || $alleles[1] eq 'A' || $alleles[1] eq 'C')) { | ||
$alleles[0] = flip($alleles[0], $fileInMonomorphicAllele); | ||
$alleles[1] = flip($alleles[1], $fileInMonomorphicAllele); | ||
} | ||
|
||
if ($alleles[0] eq $fileInMonomorphicAllele) { | ||
$F[$fileInIdCol] = $chr.":".$pos.":".uc($alleles[1]).":".uc($alleles[0]); | ||
} elsif ($alleles[1] eq $fileInMonomorphicAllele) { | ||
$F[$fileInIdCol] = $chr.":".$pos.":".uc($alleles[0]).":".uc($alleles[1]); | ||
} elsif ($alleles[0] lt $alleles[1]) { | ||
$F[$fileInIdCol] = $chr.":".$pos.":".uc($alleles[0]).":".uc($alleles[1]); | ||
} else { | ||
$F[$fileInIdCol] = $chr.":".$pos.":".uc($alleles[1]).":".uc($alleles[0]); | ||
} | ||
|
||
} | ||
|
||
print FILE_OUT join("\t", @F)."\n"; | ||
|
||
} | ||
close FILE_IN; | ||
close FILE_OUT; | ||
|
||
print "Done\n"; |