Skip to content

Commit

Permalink
Merge pull request #261 from RoanKanninga/master
Browse files Browse the repository at this point in the history
new way of VKGL comparison (using different tool for concordance)
  • Loading branch information
kdelange authored Dec 7, 2023
2 parents 0e96be8 + b48fe95 commit ab35234
Show file tree
Hide file tree
Showing 15 changed files with 279 additions and 30 deletions.
Empty file modified bin/CramConversion.sh
100644 → 100755
Empty file.
Empty file modified bin/checkDataLimitGroups.sh
100644 → 100755
Empty file.
Empty file modified bin/checkValidationNGS_DNA_v5.sh
100644 → 100755
Empty file.
52 changes: 22 additions & 30 deletions bin/compareWithVKGL_gatk4.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,36 +98,32 @@ function giabvshc(){
echo "comparing ${type}: ${sampleName}vsHC"
## sample vs HC callset (precision)
# shellcheck disable=SC2154
gatk VariantEval \
-R "${refGenome}" \
-O "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" \
--eval "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \
--comp "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf"
"${EBROOTNGSMINUTILS}/bin/vcf-compare-precision-sensitivity.sh" \
-1 "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \
-2 "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" \
-o "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/"


if [[ "${firstLine}" == "false" ]]
then
## print header too
head -4 "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" | tail -1 | awk '{OFS="\t"}{print "measurement","Type",$6,$7,$8,$9,$10,$11,"FinalConcordance"}' > "${outputFolder}/precision_output.txt"
awk '{if(NR==1){print "Measurement\tType\t"$0}}' "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/comparison.txt" > "${outputFolder}/precision_output.txt"
fi
head -5 "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" | tail -1 | awk -v type="${type}" '{OFS="\t"}{print "precision",type,$6,$7,$8,$9,$10,$11,(($10/$6)*100)}' >> "${outputFolder}/precision_output.txt"

echo "comparing ${type}: HCvs${sampleName}"
awk -v type=${type} '{if(NR>1){print "precision\t"type"\t"$0}}' "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/comparison.txt" >> "${outputFolder}/precision_output.txt"

##HC callset vs sample (sensitivity)
# shellcheck disable=SC2154
gatk VariantEval \
-R "${refGenome}" \
-O "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" \
--comp "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \
--eval "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf"

"${EBROOTNGSMINUTILS}/bin/vcf-compare-precision-sensitivity.sh" \
-1 "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" \
-2 "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \
-o "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/"

if [[ "${firstLine}" == "false" ]]
then
## print header too
head -4 "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" | tail -1 | awk '{OFS="\t"}{print "measurement","Type",$6,$7,$8,$9,$10,$11,"FinalConcordance"}' > "${outputFolder}/sensitivity_output.txt"
firstLine="true"
awk '{if(NR==1){print "Measurement\tType\t"$0}}' "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/comparison.txt" > "${outputFolder}/sensitivity_output.txt"
firstLine="True"
fi
awk -v type="${type}" '{if(NR>1){print "sensitivity\t"type"\t"$0}}' "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/comparison.txt" >> "${outputFolder}/sensitivity_output.txt"

head -5 "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" | tail -1 | awk -v type="${type}" '{OFS="\t"}{print "sensitivity",type,$6,$7,$8,$9,$10,$11,(($10/$6)*100)}' >> "${outputFolder}/sensitivity_output.txt"
done
}

Expand Down Expand Up @@ -171,16 +167,12 @@ else
fi

##precision
echo -e "Measurement\tType\tTP\tFP\tTP/TP+FP"
echo -e "Measurement\tType\tTP\tFP\tTP/TP+FP" >> "${outputFolder}/output.txt"
head -1 "${outputFolder}/precision_output.txt" > "${outputFolder}/output.txt"

#awk '{if (NR>1){print $1,$2"\t"$5"\t"$4"\t"$6}}' "${outputFolder}/precision_output.txt"
awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/precision_output.txt"
awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/precision_output.txt" >> "${outputFolder}/output.txt"

##sensitivity
echo -e "Measurement\tType\tTP\tFN\tTP/TP+FN"
echo -e "Measurement\tType\tTP\tFN\tTP/TP+FN" >> "${outputFolder}/output.txt"
awk '{if (NR>1){print $0}}' "${outputFolder}/precision_output.txt" >> "${outputFolder}/output.txt"
cat "${outputFolder}/precision_output.txt"
echo -e "\n" >> "${outputFolder}/output.txt"

awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/sensitivity_output.txt"
awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/sensitivity_output.txt" >> "${outputFolder}/output.txt"
cat "${outputFolder}/sensitivity_output.txt"
awk '{if (NR>1){print $0}}' "${outputFolder}/sensitivity_output.txt" >> "${outputFolder}/output.txt"
Empty file modified bin/concordanceCheck-ManyToManyVCFs.sh
100644 → 100755
Empty file.
Empty file modified bin/convertDos2Unix.sh
100644 → 100755
Empty file.
Empty file modified bin/countCoverage.sh
100644 → 100755
Empty file.
Empty file modified bin/create_per_base_intervals.sh
100644 → 100755
Empty file.
Empty file modified bin/getNgsFilesDiagnosticsPrmSystem.sh
100644 → 100755
Empty file.
Empty file modified bin/makeArrayReferenceSamplesheet.sh
100644 → 100755
Empty file.
Empty file modified bin/makeFinishedAndEnvFiles.sh
100644 → 100755
Empty file.
Empty file modified bin/parseManVarList.sh
100644 → 100755
Empty file.
Empty file modified bin/revertFromBamToFastQ.sh
100644 → 100755
Empty file.
Empty file modified bin/runGavin.sh
100644 → 100755
Empty file.
257 changes: 257 additions & 0 deletions bin/vcf-compare-precision-sensitivity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
#!/usr/bin/env bash

#
# Bash sanity
#
set -e # Exit if any subcommand or pipeline returns a non-zero status.
set -u # Exit if any uninitialised variable is used.

showHelp() {
cat <<EOH
##################################################################################################'
This tool compares 2 VCF files, based on the variants of vcf1 (or GZIPPED vcf) and shows only the different rows based on the following columns:
CHROM, POS, REF, ALT and SampleID:GT
Usage:
bash vcf-compare-precision-sensitivity.sh -1 <input_vcf_file> -2 <input_vcf_file> -o <output_folder>
Example:
bash vcf-compare-precision-sensitivity.sh \
-1 old_analysis.snps.final.vcf \
-2 new_analysis.snps.final.vcf \
-o ./results/
##################################################################################################
EOH
trap - EXIT
exit 0
}


declare VCF1=""
declare VCF2=""
declare OUT=""
#
# Take the parameters given on the commandline and assign them to variables.
#
while getopts ":1:2:d:o:h" opt;
do
case "${opt}" in 1) VCF1="${OPTARG}";; 2) VCF2="${OPTARG}";; o) OUT="${OPTARG}";; h) showHelp ;;
esac
done

#
# Check if all parameters are set.
#


if [[ -z "${VCF1:-}" ]]
then
echo -e '\nERROR: Must specify a first VCF!\n'
showHelp
exit 1
fi

if [[ -z "${VCF2:-}" ]]
then
echo -e '\nERROR: Must specify a second VCF!\n'
showHelp
exit 1
fi

if [[ -z "${OUT:-}" ]]
then
echo -e '\nERROR: Must specify an output folder!\n'
showHelp
exit 1
fi

#
# Create tmp folders.
#
SCRATCH="${OUT}/TMP/"
rm -Rf "${SCRATCH}"
mkdir -p "${SCRATCH}"

printf "vcf1:${VCF1}\nvcf2:${VCF2}"> "${OUT}/runparameters.txt"
#
# select filenames from given path and check if they are unique.
#

checkformatVCF1="${VCF1##*.}"
checkformatVCF2="${VCF2##*.}"

if [ "${checkformatVCF1}" != "${checkformatVCF2}" ]
then
echo "formats are not the same (${checkformatVCF1} vs ${checkformatVCF2})"
exit 1
fi

if [[ "${checkformatVCF1}" == "gz" ]]
then
inputVcf1="${VCF1%.*}"
gzip -c -d "${VCF1}" > "${inputVcf1}"

inputVcf2="${VCF2%.*}"
gzip -c -d "${VCF2}" > "${inputVcf2}"
elif [[ "${checkformatVCF1}" == 'vcf' ]]
then
inputVcf1="${VCF1}"
inputVcf2="${VCF2}"
else
echo 'not a correct format, please use vcf or vcf.gz'
exit 1
fi

vcf01="$(basename ${inputVcf1})"
vcf02="$(basename ${inputVcf2})"
if [ "${vcf01}" == "${vcf02}" ]; then
echo "files are identical, vcf02 gets a suffix"
t="${vcf02}.2"
vcf02="${t}"
fi

##Remove header
grep -v '^#' "${inputVcf1}" > "${SCRATCH}/${vcf01}.removedHeader.txt"
## Get only necessary columns
awk '{OFS="\t"} {print $1,$2,$4,$5,$10}' "${SCRATCH}/${vcf01}.removedHeader.txt" > "${SCRATCH}/${vcf01}.stripColumns_removedHeader.txt"
##get only genotype call
awk -F '[\t:]' '{OFS="-"}{print $1,$2,$3,$4,$5}' "${SCRATCH}/${vcf01}.stripColumns_removedHeader.txt" > "${SCRATCH}/${vcf01}.stripped.txt"
grep -v '0/0' "${SCRATCH}/${vcf01}.stripped.txt" | grep -v "\./\." > "${SCRATCH}/${vcf01}.strippedReferenceCalls.txt"

awk '{OFS="\t"} {print $1,$2,$4,$5,$10}' "${inputVcf2}" > "${SCRATCH}/${vcf02}.stripped.txt"
grep -v '^#' "${SCRATCH}/${vcf02}.stripped.txt" > "${SCRATCH}/${vcf02}.stripColumns_removedHeader.txt"
awk -F '[\t:]' '{OFS="-"}{print $1,$2,$3,$4,$5}' "${SCRATCH}/${vcf02}.stripColumns_removedHeader.txt" > "${SCRATCH}/${vcf02}.stripped.txt"
grep -v '0/0' "${SCRATCH}/${vcf02}.stripped.txt" | grep -v "\./\." > "${SCRATCH}/${vcf02}.strippedReferenceCalls.txt"


declare -A arrVCF1
declare -A arrVCF2


while read line
do
OLDIFS="${IFS}"
IFS="-"
set ${line}
chr="${1}"
pos="${2}"
ref="${3}"
alt="${4}"
gen="${5}"
IFS="${OLDIFS}"
gen="${gen/|//}"
myvalue="${ref}-${alt}-${gen}"
mykey="${chr}-${pos}"
arrVCF1["${mykey}"]="${myvalue}"

done<"${SCRATCH}/${vcf01}.strippedReferenceCalls.txt"

while read line
do
OLDIFS="${IFS}"
IFS="-"
set ${line}
chr="${1}"
pos="${2}"
ref="${3}"
alt="${4}"
gen="${5}"
IFS="${OLDIFS}"
gen="${gen/|//}"
arrVCF2["${chr}-${pos}"]="${ref}-${alt}-${gen}"

done<"${SCRATCH}/${vcf02}.strippedReferenceCalls.txt"
printf "" > "${SCRATCH}/differences.txt"
printf "" > "${SCRATCH}/diff.txt"

echo "comparing vcf1 with vcf2"
for i in "${!arrVCF1[@]}"
do
if [[ "${arrVCF2[$i]+abc}" ]]
then
if [ "${arrVCF1[$i]}" != "${arrVCF2[$i]}" ]
then
printf "${i} \t| ${arrVCF1[$i]}\t|\t${arrVCF2[$i]} \n" >> "${SCRATCH}/diff.txt"
printf "${i} \t| ${arrVCF1[$i]}\t|\t${arrVCF2[$i]} \n" >> "${SCRATCH}/inconsistent.txt"
else
printf "${i} ${arrVCF1[$i]}\n" >> "${SCRATCH}/truePos.txt"
fi
else
printf "${i}\t${arrVCF1[$i]}\n" >> "${SCRATCH}/diff.txt"
printf "${i}\t${arrVCF1[$i]}\n" >> "${SCRATCH}/notInVcf2.txt"
fi
done

sort -n -k1 "${SCRATCH}/diff.txt" > "${SCRATCH}/differences.txt"
perl -pi -e 's|-|\t|' "${SCRATCH}/differences.txt"
printf "" > "${OUT}/vcfStats.txt"

alarm=0
falseNegative=0
falsePositive=0

##NOT IN VCF2
if [[ -f "${SCRATCH}/notInVcf2.txt" ]]
then
falseNegative=$(cat "${SCRATCH}/notInVcf2.txt" | wc -l)
sort -V -k1 "${SCRATCH}/notInVcf2.txt" > "${SCRATCH}/notInVcf2.txt.sorted"
printf "chr\tpos\tref\talt\tgen\n" > "${OUT}/notInVcf2.txt"
cat "${SCRATCH}/notInVcf2.txt.sorted" >> "${OUT}/notInVcf2.txt"
perl -pi -e 's|-|\t|g' "${OUT}/notInVcf2.txt"
fi

## INCONSISTENT
if [[ -f "${SCRATCH}/inconsistent.txt" ]]
then
alarm=$(cat "${SCRATCH}/inconsistent.txt" | wc -l)
sort -V -k1 "${SCRATCH}/inconsistent.txt" > "${SCRATCH}/inconsistent.txt.sorted"
printf "\t\t\t|\tvcf1\t\t|\t\tvcf2\t\t\n" > "${OUT}/inconsistent.txt"
printf "chr\tposition\t| ref\talt\tgen\t|\tref\talt\tgen\n" >> "${OUT}/inconsistent.txt"
cat "${SCRATCH}/inconsistent.txt.sorted" >> "${OUT}/inconsistent.txt"
perl -pi -e 's|-|\t|g' "${OUT}/inconsistent.txt"

fi

##TRUE POS
truePos=$(cat "${SCRATCH}/truePos.txt" | wc -l)
sort -V -k1 "${SCRATCH}/truePos.txt" > "${SCRATCH}/truePos.txt.sorted"
printf "chr\tpos\tref\talt\tgen\n" > "${OUT}/truePos.txt"
cat "${SCRATCH}/truePos.txt.sorted" >> "${OUT}/truePos.txt"

perl -pi -e 's|-|\t|g' "${OUT}/truePos.txt"


total=$((truePos + alarm + falseNegative))
printf "TotalNumber:${total}\n" >> "${OUT}/vcfStats.txt"
##TRUE POS
tpr=$(awk "BEGIN {printf \"%.2f\n\", ((${truePos}/$total)*100)}")
printf "TP:$truePos, TP rate: ${tpr}%%\n" >> "${OUT}/vcfStats.txt"

if [[ "${falseNegative}" -ne 0 ]]
then
fnr=$(awk "BEGIN {printf \"%.2f\n\", ((${falseNegative}/$total)*100)}")
printf "FN:${falseNegative}, FN rate: ${fnr}%%\n" >> "${OUT}/vcfStats.txt"
else
printf "FP:${falseNegative}, FN rate: 0%%\n" >> "${OUT}/vcfStats.txt"

fi
if [[ "${alarm}" -ne 0 ]]
then
alarmRate=$(awk "BEGIN {printf \"%.2f\n\", ((${alarm}/$total)*100)}")
printf "Inconsistent:${alarm}, InconsistentRate: ${alarmRate}%%\n" >> "${OUT}/vcfStats.txt"
else
printf "Inconsistent:${alarm}, InconsistentRate: 0%%\n" >> "${OUT}/vcfStats.txt"
fi

totalDifferences=$((${falseNegative}+${alarm}))

printf "${total}\t${truePos}\t${totalDifferences}" > "${OUT}/simpleStats.txt"


concordant=$(awk '{print 100-(($3/$1)*100)}' "${OUT}/simpleStats.txt")
printf "Total\tTP\tdiff\tconcordant(100-((diff/total)*100)\n" > "${OUT}/comparison.txt"
printf "${total}\t${truePos}\t${totalDifferences}\t${concordant}\n" >> "${OUT}/comparison.txt"

printf "done..\nComparison can be found: ${OUT} \n"

exit 0

0 comments on commit ab35234

Please sign in to comment.