Skip to content

Commit

Permalink
Merge pull request #194 from kdelange/master
Browse files Browse the repository at this point in the history
bamUtil tool to calculated coverage of specific position in BAM files
  • Loading branch information
pneerincx authored May 30, 2018
2 parents 9079960 + a70575b commit 4137ca6
Show file tree
Hide file tree
Showing 5 changed files with 3,489 additions and 0 deletions.
205 changes: 205 additions & 0 deletions bamUtil_Coverage_Tool/MVL_Coverage_Tool.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#!/bin/bash
set -u
set -e
underline=`tput smul`
normal=`tput sgr0`
bold=`tput bold`

function usage () {
echo "
${bold}This script is calculating coverage per position of pathogenic and likely pathogenic varaints from the MVL.
AvgCoverage, SD, avarage cov <1x,<10x,<30x,
${bold}Arguments${normal}
example sh MVL_Coverage_Tool.sh -s /groups/umcg-gd/prm02/projects/QXTR_90-Exoom_v1/run01/results/alignment/ OPTIONS
Required:
-s|--sampledir Directory of of bam files of the used samples
-t|--tmpdir Give tmpfolder location for inbetween and final files
Optional:
-b|--bamstat path to the ban stats (your/directory/to/bamUtil-1.0.14/bin/bam stats)
-m|--mvl location of MVL_per_bp_unique.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt)
-x|--mvltabix location of MVL_per_bp_unique_tabix.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt)
-q|--qcmvldir location of MerdgeQCofMVL_file.py necessary for final txt generation (default:$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/)
"
}

module load HTSlib
module load Python

PARSED_OPTIONS=$(getopt -n "$0" -o s:b:o:m:x:t:f:z: --long "sampledir:bamstat:outputdir:mvl:mvltabix:tmpdir:outputfilter:finaloutput:" -- "$@")

#
# Bad arguments, something has gone wrong with the getopt command.
#
if [ $? -ne 0 ]; then
usage
echo "FATAL: Wrong arguments."
exit 1
fi

eval set -- "$PARSED_OPTIONS"

#
# Now goes through all the options with a case and using shift to analyse 1 argument at a time.
# $1 identifies the first argument, and when we use shift we discard the first argument, so $2 becomes $1 and goes again through the case.
#
while true; do
case "$1" in
-s|--sampledir)
case "$2" in
"") shift 2 ;;
*) SAMPLEDIR=$2 ; shift 2 ;;
esac ;;
-b|--bamstat)
case "$2" in
*) BAMSTAT=$2 ; shift 2 ;;
esac ;;
-m|--mvl)
case "$2" in
*) MVL=$2 ; shift 2 ;;
esac ;;
-x|--mvltabix)
case "$2" in
*) MVLTABIX=$2 ; shift 2 ;;
esac ;;
-t|--tmpdir)
case "$2" in
*) TmpDirBam=$2 ; shift 2 ;;
esac ;;
-q|--qcmvl)
case "$2" in
*) QCMVLDIR=$2 ; shift 2 ;;
esac ;;
--) shift ; break ;;
*) echo "Internal error!" ; exit 1 ;;
esac
done


#
#Check required options were provided.
#
if [[ -z "${SAMPLEDIR-}" ]]; then
usage
exit 1
fi
if [[ -z "${BAMSTAT-}" ]]; then
#BAMSTAT="your/directry/to/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam"
usage
fi
if [[ -z "${MVL-}" ]]; then
MVL="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt"
fi
if [[ -z "${MVLTABIX-}" ]]; then
MVLTABIX="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt"
fi
if [[ -z "${TmpDirBam=-}" ]]; then
usage
exit 1
fi
if [[ -z "${QCMVLDIR-}" ]]; then
QCMVLDIR="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/"
fi
echo "SAMPLEDIR: ${SAMPLEDIR}"
echo "BAMSTAT: ${BAMSTAT}"
echo "MVL: ${MVL}"
echo "MVLTABIX: ${MVLTABIX}"
echo "TMPDIR: ${TmpDirBam}"
echo "QCMVLDIR: ${QCMVLDIR}"
echo "starting"


rm -rf "${TmpDirBam}"
mkdir -p "${TmpDirBam}"/input/
mkdir -p "${TmpDirBam}"/outputfilter/
mkdir -p "${TmpDirBam}"/output_final/
mkdir -p "${TmpDirBam}"/output_final_txt_files/


#runBamStatWithRegion3.sh run the BAM stat tool to generate QC report per position
echo "start Bam stats"

count=0

for i in $(ls "${SAMPLEDIR}"/*.bam)
do
echo "processingBamstat ${i}"
"${BAMSTAT}" stats \
--in "${i}" \
--cBaseQC "${TmpDirBam}"/input/outputBamUtil_${count}.txt \
--regionlist "${MVL}" \
--withinRegion \
--nophonehome

count=$((count+1))

done
echo "finisched Bam stats"


# test.sh, make QC report complete by adding also position with 0x coverage

echo "start tabix 0x coverage"
for i in $(ls "${TmpDirBam}"/input/output*.txt)
do
b=$(basename "${i}")
bedfile=${b%.*}.bed.txt
awk 'BEGIN{OFS="\t"}{print $1,$2,($3-1),$4}' ${i} > "${TmpDirBam}"/"${bedfile}"

sed -i '1d' "${TmpDirBam}"/"${bedfile}"
bgzip -c "${TmpDirBam}"/"${bedfile}" > "${TmpDirBam}"/"${bedfile}".gz
tabix -0 -p bed "${TmpDirBam}"/"${bedfile}".gz
rm -f "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt
while read line
do
if tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line} | grep -q "^"
then
tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line}
else
echo $line | awk -F":" '{print $1"\t"$2}' | awk -F"-" '{print $1"\t"$2"\t0"}'
fi
done < "${MVLTABIX}" >> "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt
echo "${i} done tabix 0x coverage"
done

# test_chr_pos_cov.sh, make files usable for python script including 0x
echo "starting making final txt files per patient"
for i in $(ls ${TmpDirBam}/outputfilter/output*.txt)
do
b=$(basename ${i})
filename=${b%.*}

awk '{
if ($4==0) {
print $1"\t"($2)"\t"$4
}
else{
print $1"\t"$2"\t"$4
}
}' $i > "${TmpDirBam}/output_final/${filename}_final.txt"
echo "${i} done"
done
echo "finisched making final txt per patient"

#add gene name to coverage file.
echo "add gene name"
sort -V "${MVL}" > "${MVL}".sorted
awk '{print $4}' "${MVL}" > "${MVL}".genes

for i in $(ls "${TmpDirBam}"/output_final/outputBamUtil_*_filter_final.txt)
do

sortedFile=${i%.*}.sorted.txt
sort -V ${i} > ${sortedFile}
a=$(diff <(awk '{print $2}' "${MVL}.sorted") <(awk '{print $2}' "${sortedFile}"))
if [ "${a}" == "" ]
then
paste -d"\t" $i "${MVL}".genes > ${i%.*}_gene.txt
else
echo 'positions in MVL and BAMutil output do not match'
fi
echo "${i} done adding gene name"
done
echo "start python script"

python "${QCMVLDIR}"/MerdgeQCofMVL_file.py --in "${TmpDirBam}"/output_final/ --ff "${TmpDirBam}"/output_final_txt_files/

Loading

0 comments on commit 4137ca6

Please sign in to comment.