diff --git a/bin/CramConversion.sh b/bin/CramConversion.sh old mode 100644 new mode 100755 diff --git a/bin/checkDataLimitGroups.sh b/bin/checkDataLimitGroups.sh old mode 100644 new mode 100755 diff --git a/bin/checkValidationNGS_DNA_v5.sh b/bin/checkValidationNGS_DNA_v5.sh old mode 100644 new mode 100755 diff --git a/bin/compareWithVKGL_gatk4.sh b/bin/compareWithVKGL_gatk4.sh index 7771290..451f8fa 100755 --- a/bin/compareWithVKGL_gatk4.sh +++ b/bin/compareWithVKGL_gatk4.sh @@ -98,36 +98,32 @@ function giabvshc(){ echo "comparing ${type}: ${sampleName}vsHC" ## sample vs HC callset (precision) # shellcheck disable=SC2154 - gatk VariantEval \ - -R "${refGenome}" \ - -O "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" \ - --eval "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \ - --comp "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" + "${EBROOTNGSMINUTILS}/bin/vcf-compare-precision-sensitivity.sh" \ + -1 "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \ + -2 "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" \ + -o "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/" + if [[ "${firstLine}" == "false" ]] then - ## print header too - head -4 "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" | tail -1 | awk '{OFS="\t"}{print "measurement","Type",$6,$7,$8,$9,$10,$11,"FinalConcordance"}' > "${outputFolder}/precision_output.txt" + awk '{if(NR==1){print "Measurement\tType\t"$0}}' "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/comparison.txt" > "${outputFolder}/precision_output.txt" fi - head -5 "${outputFolder}/tmp/precision-${sampleName}vsHC_union.${type}.vcf" | tail -1 | awk -v type="${type}" '{OFS="\t"}{print "precision",type,$6,$7,$8,$9,$10,$11,(($10/$6)*100)}' >> "${outputFolder}/precision_output.txt" - - echo "comparing ${type}: HCvs${sampleName}" + awk -v type=${type} '{if(NR>1){print "precision\t"type"\t"$0}}' "${outputFolder}/tmp/precision-${sampleName}vsHC_union_${type}/comparison.txt" >> "${outputFolder}/precision_output.txt" + ##HC callset vs sample (sensitivity) # shellcheck disable=SC2154 - gatk VariantEval \ - -R "${refGenome}" \ - -O "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" \ - --comp "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \ - --eval "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" - + "${EBROOTNGSMINUTILS}/bin/vcf-compare-precision-sensitivity.sh" \ + -1 "/apps/data/NIST/${namingConvention}/GIAB_HC.${type}_20bp.vcf" \ + -2 "${outputFolder}/tmp/${sampleName}_${type}.union.20bp_filtered.vcf" \ + -o "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/" + if [[ "${firstLine}" == "false" ]] then - ## print header too - head -4 "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" | tail -1 | awk '{OFS="\t"}{print "measurement","Type",$6,$7,$8,$9,$10,$11,"FinalConcordance"}' > "${outputFolder}/sensitivity_output.txt" - firstLine="true" + awk '{if(NR==1){print "Measurement\tType\t"$0}}' "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/comparison.txt" > "${outputFolder}/sensitivity_output.txt" + firstLine="True" fi + awk -v type="${type}" '{if(NR>1){print "sensitivity\t"type"\t"$0}}' "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union_${type}/comparison.txt" >> "${outputFolder}/sensitivity_output.txt" - head -5 "${outputFolder}/tmp/sensitivity-HCvs${sampleName}_union.${type}.vcf" | tail -1 | awk -v type="${type}" '{OFS="\t"}{print "sensitivity",type,$6,$7,$8,$9,$10,$11,(($10/$6)*100)}' >> "${outputFolder}/sensitivity_output.txt" done } @@ -171,16 +167,12 @@ else fi ##precision -echo -e "Measurement\tType\tTP\tFP\tTP/TP+FP" -echo -e "Measurement\tType\tTP\tFP\tTP/TP+FP" >> "${outputFolder}/output.txt" +head -1 "${outputFolder}/precision_output.txt" > "${outputFolder}/output.txt" #awk '{if (NR>1){print $1,$2"\t"$5"\t"$4"\t"$6}}' "${outputFolder}/precision_output.txt" -awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/precision_output.txt" -awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/precision_output.txt" >> "${outputFolder}/output.txt" - -##sensitivity -echo -e "Measurement\tType\tTP\tFN\tTP/TP+FN" -echo -e "Measurement\tType\tTP\tFN\tTP/TP+FN" >> "${outputFolder}/output.txt" +awk '{if (NR>1){print $0}}' "${outputFolder}/precision_output.txt" >> "${outputFolder}/output.txt" +cat "${outputFolder}/precision_output.txt" +echo -e "\n" >> "${outputFolder}/output.txt" -awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/sensitivity_output.txt" -awk '{if (NR>1){print $1,$2"\t"$5"\t"($3-$7)"\t"(($7/$3)*100)"\t"}}' "${outputFolder}/sensitivity_output.txt" >> "${outputFolder}/output.txt" +cat "${outputFolder}/sensitivity_output.txt" +awk '{if (NR>1){print $0}}' "${outputFolder}/sensitivity_output.txt" >> "${outputFolder}/output.txt" diff --git a/bin/concordanceCheck-ManyToManyVCFs.sh b/bin/concordanceCheck-ManyToManyVCFs.sh old mode 100644 new mode 100755 diff --git a/bin/convertDos2Unix.sh b/bin/convertDos2Unix.sh old mode 100644 new mode 100755 diff --git a/bin/countCoverage.sh b/bin/countCoverage.sh old mode 100644 new mode 100755 diff --git a/bin/create_per_base_intervals.sh b/bin/create_per_base_intervals.sh old mode 100644 new mode 100755 diff --git a/bin/getNgsFilesDiagnosticsPrmSystem.sh b/bin/getNgsFilesDiagnosticsPrmSystem.sh old mode 100644 new mode 100755 diff --git a/bin/makeArrayReferenceSamplesheet.sh b/bin/makeArrayReferenceSamplesheet.sh old mode 100644 new mode 100755 diff --git a/bin/makeFinishedAndEnvFiles.sh b/bin/makeFinishedAndEnvFiles.sh old mode 100644 new mode 100755 diff --git a/bin/parseManVarList.sh b/bin/parseManVarList.sh old mode 100644 new mode 100755 diff --git a/bin/revertFromBamToFastQ.sh b/bin/revertFromBamToFastQ.sh old mode 100644 new mode 100755 diff --git a/bin/runGavin.sh b/bin/runGavin.sh old mode 100644 new mode 100755 diff --git a/bin/vcf-compare-precision-sensitivity.sh b/bin/vcf-compare-precision-sensitivity.sh new file mode 100755 index 0000000..0e60738 --- /dev/null +++ b/bin/vcf-compare-precision-sensitivity.sh @@ -0,0 +1,257 @@ +#!/usr/bin/env bash + +# +# Bash sanity +# +set -e # Exit if any subcommand or pipeline returns a non-zero status. +set -u # Exit if any uninitialised variable is used. + +showHelp() { + cat < -2 -o + +Example: +bash vcf-compare-precision-sensitivity.sh \ +-1 old_analysis.snps.final.vcf \ +-2 new_analysis.snps.final.vcf \ +-o ./results/ +################################################################################################## +EOH +trap - EXIT +exit 0 +} + + +declare VCF1="" +declare VCF2="" +declare OUT="" +# +# Take the parameters given on the commandline and assign them to variables. +# +while getopts ":1:2:d:o:h" opt; +do + case "${opt}" in 1) VCF1="${OPTARG}";; 2) VCF2="${OPTARG}";; o) OUT="${OPTARG}";; h) showHelp ;; + esac +done + +# +# Check if all parameters are set. +# + + +if [[ -z "${VCF1:-}" ]] +then + echo -e '\nERROR: Must specify a first VCF!\n' + showHelp + exit 1 +fi + +if [[ -z "${VCF2:-}" ]] +then + echo -e '\nERROR: Must specify a second VCF!\n' + showHelp + exit 1 +fi + +if [[ -z "${OUT:-}" ]] +then + echo -e '\nERROR: Must specify an output folder!\n' + showHelp + exit 1 +fi + +# +# Create tmp folders. +# +SCRATCH="${OUT}/TMP/" +rm -Rf "${SCRATCH}" +mkdir -p "${SCRATCH}" + +printf "vcf1:${VCF1}\nvcf2:${VCF2}"> "${OUT}/runparameters.txt" +# +# select filenames from given path and check if they are unique. +# + +checkformatVCF1="${VCF1##*.}" +checkformatVCF2="${VCF2##*.}" + +if [ "${checkformatVCF1}" != "${checkformatVCF2}" ] +then + echo "formats are not the same (${checkformatVCF1} vs ${checkformatVCF2})" + exit 1 +fi + +if [[ "${checkformatVCF1}" == "gz" ]] +then + inputVcf1="${VCF1%.*}" + gzip -c -d "${VCF1}" > "${inputVcf1}" + + inputVcf2="${VCF2%.*}" + gzip -c -d "${VCF2}" > "${inputVcf2}" +elif [[ "${checkformatVCF1}" == 'vcf' ]] +then + inputVcf1="${VCF1}" + inputVcf2="${VCF2}" +else + echo 'not a correct format, please use vcf or vcf.gz' + exit 1 +fi + +vcf01="$(basename ${inputVcf1})" +vcf02="$(basename ${inputVcf2})" +if [ "${vcf01}" == "${vcf02}" ]; then + echo "files are identical, vcf02 gets a suffix" + t="${vcf02}.2" + vcf02="${t}" +fi + +##Remove header +grep -v '^#' "${inputVcf1}" > "${SCRATCH}/${vcf01}.removedHeader.txt" +## Get only necessary columns +awk '{OFS="\t"} {print $1,$2,$4,$5,$10}' "${SCRATCH}/${vcf01}.removedHeader.txt" > "${SCRATCH}/${vcf01}.stripColumns_removedHeader.txt" +##get only genotype call +awk -F '[\t:]' '{OFS="-"}{print $1,$2,$3,$4,$5}' "${SCRATCH}/${vcf01}.stripColumns_removedHeader.txt" > "${SCRATCH}/${vcf01}.stripped.txt" +grep -v '0/0' "${SCRATCH}/${vcf01}.stripped.txt" | grep -v "\./\." > "${SCRATCH}/${vcf01}.strippedReferenceCalls.txt" + +awk '{OFS="\t"} {print $1,$2,$4,$5,$10}' "${inputVcf2}" > "${SCRATCH}/${vcf02}.stripped.txt" +grep -v '^#' "${SCRATCH}/${vcf02}.stripped.txt" > "${SCRATCH}/${vcf02}.stripColumns_removedHeader.txt" +awk -F '[\t:]' '{OFS="-"}{print $1,$2,$3,$4,$5}' "${SCRATCH}/${vcf02}.stripColumns_removedHeader.txt" > "${SCRATCH}/${vcf02}.stripped.txt" +grep -v '0/0' "${SCRATCH}/${vcf02}.stripped.txt" | grep -v "\./\." > "${SCRATCH}/${vcf02}.strippedReferenceCalls.txt" + + +declare -A arrVCF1 +declare -A arrVCF2 + + +while read line +do + OLDIFS="${IFS}" + IFS="-" + set ${line} + chr="${1}" + pos="${2}" + ref="${3}" + alt="${4}" + gen="${5}" + IFS="${OLDIFS}" + gen="${gen/|//}" + myvalue="${ref}-${alt}-${gen}" + mykey="${chr}-${pos}" + arrVCF1["${mykey}"]="${myvalue}" + +done<"${SCRATCH}/${vcf01}.strippedReferenceCalls.txt" + +while read line +do + OLDIFS="${IFS}" + IFS="-" + set ${line} + chr="${1}" + pos="${2}" + ref="${3}" + alt="${4}" + gen="${5}" + IFS="${OLDIFS}" + gen="${gen/|//}" + arrVCF2["${chr}-${pos}"]="${ref}-${alt}-${gen}" + +done<"${SCRATCH}/${vcf02}.strippedReferenceCalls.txt" +printf "" > "${SCRATCH}/differences.txt" +printf "" > "${SCRATCH}/diff.txt" + +echo "comparing vcf1 with vcf2" +for i in "${!arrVCF1[@]}" +do + if [[ "${arrVCF2[$i]+abc}" ]] + then + if [ "${arrVCF1[$i]}" != "${arrVCF2[$i]}" ] + then + printf "${i} \t| ${arrVCF1[$i]}\t|\t${arrVCF2[$i]} \n" >> "${SCRATCH}/diff.txt" + printf "${i} \t| ${arrVCF1[$i]}\t|\t${arrVCF2[$i]} \n" >> "${SCRATCH}/inconsistent.txt" + else + printf "${i} ${arrVCF1[$i]}\n" >> "${SCRATCH}/truePos.txt" + fi + else + printf "${i}\t${arrVCF1[$i]}\n" >> "${SCRATCH}/diff.txt" + printf "${i}\t${arrVCF1[$i]}\n" >> "${SCRATCH}/notInVcf2.txt" + fi +done + +sort -n -k1 "${SCRATCH}/diff.txt" > "${SCRATCH}/differences.txt" +perl -pi -e 's|-|\t|' "${SCRATCH}/differences.txt" +printf "" > "${OUT}/vcfStats.txt" + +alarm=0 +falseNegative=0 +falsePositive=0 + +##NOT IN VCF2 +if [[ -f "${SCRATCH}/notInVcf2.txt" ]] +then + falseNegative=$(cat "${SCRATCH}/notInVcf2.txt" | wc -l) + sort -V -k1 "${SCRATCH}/notInVcf2.txt" > "${SCRATCH}/notInVcf2.txt.sorted" + printf "chr\tpos\tref\talt\tgen\n" > "${OUT}/notInVcf2.txt" + cat "${SCRATCH}/notInVcf2.txt.sorted" >> "${OUT}/notInVcf2.txt" + perl -pi -e 's|-|\t|g' "${OUT}/notInVcf2.txt" +fi + +## INCONSISTENT +if [[ -f "${SCRATCH}/inconsistent.txt" ]] +then + alarm=$(cat "${SCRATCH}/inconsistent.txt" | wc -l) + sort -V -k1 "${SCRATCH}/inconsistent.txt" > "${SCRATCH}/inconsistent.txt.sorted" + printf "\t\t\t|\tvcf1\t\t|\t\tvcf2\t\t\n" > "${OUT}/inconsistent.txt" + printf "chr\tposition\t| ref\talt\tgen\t|\tref\talt\tgen\n" >> "${OUT}/inconsistent.txt" + cat "${SCRATCH}/inconsistent.txt.sorted" >> "${OUT}/inconsistent.txt" + perl -pi -e 's|-|\t|g' "${OUT}/inconsistent.txt" + +fi + +##TRUE POS +truePos=$(cat "${SCRATCH}/truePos.txt" | wc -l) +sort -V -k1 "${SCRATCH}/truePos.txt" > "${SCRATCH}/truePos.txt.sorted" +printf "chr\tpos\tref\talt\tgen\n" > "${OUT}/truePos.txt" +cat "${SCRATCH}/truePos.txt.sorted" >> "${OUT}/truePos.txt" + +perl -pi -e 's|-|\t|g' "${OUT}/truePos.txt" + + +total=$((truePos + alarm + falseNegative)) +printf "TotalNumber:${total}\n" >> "${OUT}/vcfStats.txt" +##TRUE POS +tpr=$(awk "BEGIN {printf \"%.2f\n\", ((${truePos}/$total)*100)}") +printf "TP:$truePos, TP rate: ${tpr}%%\n" >> "${OUT}/vcfStats.txt" + +if [[ "${falseNegative}" -ne 0 ]] +then + fnr=$(awk "BEGIN {printf \"%.2f\n\", ((${falseNegative}/$total)*100)}") + printf "FN:${falseNegative}, FN rate: ${fnr}%%\n" >> "${OUT}/vcfStats.txt" +else + printf "FP:${falseNegative}, FN rate: 0%%\n" >> "${OUT}/vcfStats.txt" + +fi +if [[ "${alarm}" -ne 0 ]] +then + alarmRate=$(awk "BEGIN {printf \"%.2f\n\", ((${alarm}/$total)*100)}") + printf "Inconsistent:${alarm}, InconsistentRate: ${alarmRate}%%\n" >> "${OUT}/vcfStats.txt" +else + printf "Inconsistent:${alarm}, InconsistentRate: 0%%\n" >> "${OUT}/vcfStats.txt" +fi + +totalDifferences=$((${falseNegative}+${alarm})) + +printf "${total}\t${truePos}\t${totalDifferences}" > "${OUT}/simpleStats.txt" + + +concordant=$(awk '{print 100-(($3/$1)*100)}' "${OUT}/simpleStats.txt") +printf "Total\tTP\tdiff\tconcordant(100-((diff/total)*100)\n" > "${OUT}/comparison.txt" +printf "${total}\t${truePos}\t${totalDifferences}\t${concordant}\n" >> "${OUT}/comparison.txt" + +printf "done..\nComparison can be found: ${OUT} \n" + +exit 0