Skip to content

Commit

Permalink
ajusted comments from Pieter
Browse files Browse the repository at this point in the history
  • Loading branch information
kdelange committed May 29, 2018
2 parents 5c70a17 + 9079960 commit a70575b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 44 deletions.
73 changes: 36 additions & 37 deletions bamUtil_Coverage_Tool/MVL_Coverage_Tool.sh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ ${bold}Arguments${normal}
example sh MVL_Coverage_Tool.sh -s /groups/umcg-gd/prm02/projects/QXTR_90-Exoom_v1/run01/results/alignment/ OPTIONS
Required:
-s|--sampledir Directory of of bam files of the used samples
-t|--tmpdir Give tmpfolder location for inbetween and final files
-t|--tmpdir Give tmpfolder location for inbetween and final files
Optional:
-b|--bamstat path to the ban stats (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam stats)
-b|--bamstat path to the ban stats (your/directory/to/bamUtil-1.0.14/bin/bam stats)
-m|--mvl location of MVL_per_bp_unique.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt)
-x|--mvltabix location of MVL_per_bp_unique_tabix.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt)
-q|--qcmvldir location of MerdgeQCofMVL_file.py necessary for final txt generation (default:$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/)
-x|--mvltabix location of MVL_per_bp_unique_tabix.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt)
-q|--qcmvldir location of MerdgeQCofMVL_file.py necessary for final txt generation (default:$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/)
"
}

Expand Down Expand Up @@ -63,7 +63,7 @@ while true; do
esac ;;
-t|--tmpdir)
case "$2" in
*) TMPDIR=$2 ; shift 2 ;;
*) TmpDirBam=$2 ; shift 2 ;;
esac ;;
-q|--qcmvl)
case "$2" in
Expand All @@ -75,8 +75,6 @@ while true; do
done



empty=""
#
#Check required options were provided.
#
Expand All @@ -85,15 +83,16 @@ if [[ -z "${SAMPLEDIR-}" ]]; then
exit 1
fi
if [[ -z "${BAMSTAT-}" ]]; then
BAMSTAT="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam"
#BAMSTAT="your/directry/to/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam"
usage
fi
if [[ -z "${MVL-}" ]]; then
MVL="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt"
fi
if [[ -z "${MVLTABIX-}" ]]; then
MVLTABIX="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt"
fi
if [[ -z "${TMPDIR-}" ]]; then
if [[ -z "${TmpDirBam=-}" ]]; then
usage
exit 1
fi
Expand All @@ -104,30 +103,30 @@ echo "SAMPLEDIR: ${SAMPLEDIR}"
echo "BAMSTAT: ${BAMSTAT}"
echo "MVL: ${MVL}"
echo "MVLTABIX: ${MVLTABIX}"
echo "TMPDIR: ${TMPDIR}"
echo "TMPDIR: ${TmpDirBam}"
echo "QCMVLDIR: ${QCMVLDIR}"
echo "starting"


rm -rf ${TMPDIR}
mkdir -p ${TMPDIR}/input/
mkdir -p ${TMPDIR}/outputfilter/
mkdir -p ${TMPDIR}/output_final/
mkdir -p ${TMPDIR}/output_final_txt_files/
rm -rf "${TmpDirBam}"
mkdir -p "${TmpDirBam}"/input/
mkdir -p "${TmpDirBam}"/outputfilter/
mkdir -p "${TmpDirBam}"/output_final/
mkdir -p "${TmpDirBam}"/output_final_txt_files/


#runBamStatWithRegion3.sh run the BAM stat tool to generate QC report per position
echo "start Bam stats"

count=0

for i in $(ls ${SAMPLEDIR}/*.bam)
for i in $(ls "${SAMPLEDIR}"/*.bam)
do
echo "processingBamstat ${i}"
${BAMSTAT} stats \
--in ${i} \
--cBaseQC ${TMPDIR}/input/outputBamUtil_${count}.txt \
--regionlist ${MVL} \
"${BAMSTAT}" stats \
--in "${i}" \
--cBaseQC "${TmpDirBam}"/input/outputBamUtil_${count}.txt \
--regionlist "${MVL}" \
--withinRegion \
--nophonehome

Expand All @@ -140,31 +139,31 @@ echo "finisched Bam stats"
# test.sh, make QC report complete by adding also position with 0x coverage

echo "start tabix 0x coverage"
for i in $(ls ${TMPDIR}/input/output*.txt)
for i in $(ls "${TmpDirBam}"/input/output*.txt)
do
b=$(basename ${i})
b=$(basename "${i}")
bedfile=${b%.*}.bed.txt
awk 'BEGIN{OFS="\t"}{print $1,$2,($3-1),$4}' ${i} > ${TMPDIR}/${bedfile}
awk 'BEGIN{OFS="\t"}{print $1,$2,($3-1),$4}' ${i} > "${TmpDirBam}"/"${bedfile}"

sed -i '1d' ${TMPDIR}/${bedfile}
bgzip -c ${TMPDIR}/${bedfile} > ${TMPDIR}/${bedfile}.gz
tabix -0 -p bed ${TMPDIR}/${bedfile}.gz
rm -f ${TMPDIR}/outputfilter/${b%.*}_filter.txt
sed -i '1d' "${TmpDirBam}"/"${bedfile}"
bgzip -c "${TmpDirBam}"/"${bedfile}" > "${TmpDirBam}"/"${bedfile}".gz
tabix -0 -p bed "${TmpDirBam}"/"${bedfile}".gz
rm -f "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt
while read line
do
if tabix -0 ${TMPDIR}/${bedfile}.gz ${line} | grep -q "^"
if tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line} | grep -q "^"
then
tabix -0 ${TMPDIR}/${bedfile}.gz ${line}
tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line}
else
echo $line | awk -F":" '{print $1"\t"$2}' | awk -F"-" '{print $1"\t"$2"\t0"}'
fi
done < ${MVLTABIX} >> ${TMPDIR}/outputfilter/${b%.*}_filter.txt
done < "${MVLTABIX}" >> "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt
echo "${i} done tabix 0x coverage"
done

# test_chr_pos_cov.sh, make files usable for python script including 0x
echo "starting making final txt files per patient"
for i in $(ls ${TMPDIR}/outputfilter/output*.txt)
for i in $(ls ${TmpDirBam}/outputfilter/output*.txt)
do
b=$(basename ${i})
filename=${b%.*}
Expand All @@ -176,31 +175,31 @@ do
else{
print $1"\t"$2"\t"$4
}
}' $i > "${TMPDIR}/output_final/${filename}_final.txt"
}' $i > "${TmpDirBam}/output_final/${filename}_final.txt"
echo "${i} done"
done
echo "finisched making final txt per patient"

#add gene name to coverage file.
echo "add gene name"
sort -V ${MVL} > ${MVL}.sorted
awk '{print $4}' ${MVL} > ${MVL}.genes
sort -V "${MVL}" > "${MVL}".sorted
awk '{print $4}' "${MVL}" > "${MVL}".genes

for i in $(ls ${TMPDIR}/output_final/outputBamUtil_*_filter_final.txt)
for i in $(ls "${TmpDirBam}"/output_final/outputBamUtil_*_filter_final.txt)
do

sortedFile=${i%.*}.sorted.txt
sort -V ${i} > ${sortedFile}
a=$(diff <(awk '{print $2}' "${MVL}.sorted") <(awk '{print $2}' "${sortedFile}"))
if [ "${a}" == "" ]
then
paste -d"\t" $i ${MVL}.genes > ${i%.*}_gene.txt
paste -d"\t" $i "${MVL}".genes > ${i%.*}_gene.txt
else
echo 'positions in MVL and BAMutil output do not match'
fi
echo "${i} done adding gene name"
done
echo "start python script"

python ${QCMVLDIR}/MerdgeQCofMVL_file.py --in ${TMPDIR}/output_final/ --ff ${TMPDIR}/output_final_txt_files/
python "${QCMVLDIR}"/MerdgeQCofMVL_file.py --in "${TmpDirBam}"/output_final/ --ff "${TmpDirBam}"/output_final_txt_files/

22 changes: 15 additions & 7 deletions renameFastQs.bash
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ function _RenameFastQ() {
then
echo "DEBUG: Found _firstReadID ............ = ${_firstReadID}"
fi
local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*XX):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
if [[ "${_firstReadID}" =~ ${_regex} ]]
then
local _sequencer="${BASH_REMATCH[1]}"
Expand All @@ -126,8 +126,7 @@ function _RenameFastQ() {
local _sequenceReadOfPair="${BASH_REMATCH[5]}"
#
# Note: we do require the ID of the first read to contain a DNA barcode at the end,
# but we don't parse barcodes here. Due to sequencing errors the barcode may deviate slightly.
# Instead we parse the barcodes from the first reads and determine the most abundant one later on.
# but we don't parse barcodes here. See below for why...
#

#
Expand All @@ -150,11 +149,20 @@ function _RenameFastQ() {
echo "DEBUG: Found _sequenceReadOfPair ..... = ${_sequenceReadOfPair}"
fi
else
echo "FATAL: Failed to required meta-data values from ID of first read of ${_fastqPath}"
echo "FATAL: Failed to parse required meta-data values from ID of first read of ${_fastqPath}"
exit 1
fi

local _mostAbundandBarcode=$(zcat "${_fastqPath}" | head -4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')
#
# Due to sequencing errors the barcode may deviate slightly, so looking only at the first one won't fly.
# In addition the start and end of a FastQ file tends to be enriched for sequencing errors / low quality.
# Therefore we:
# A. use a combination of head and tail commands to skip the first 10,000 reads (40,000 lines)
# and get data from in the middle of the FastQ file.
# (assuming the yield was high enough; otherwise you get the last 1000 reads).
# B. Next we parse the barcodes from the read ID lines, sort, count and determine the most abundant barcode.
#
local _mostAbundandBarcode=$(zcat "${_fastqPath}" | head -n 44000 | tail -n 4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')
local _barcodeRegex='^([ATCG][ATCG+]*)$'
if [[ "${_mostAbundandBarcode}" =~ ${_barcodeRegex} ]]
then
Expand All @@ -169,11 +177,11 @@ function _RenameFastQ() {
fi
elif [[ "${_mostAbundandBarcode}" =~ N ]]
then
echo "ERROR: Most abundant barcode(s) in max first 1000 reads contains Ns: ${_barcodes}"
echo "ERROR: Most abundant barcode(s) in max 1000 reads from middle of FastQ contains Ns: ${_barcodes}"
echo "ERROR: Skipping discarded FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
return
else
echo "ERROR: Failed to determine the most abundant barcode(s) from the max first 1000 reads."
echo "ERROR: Failed to determine the most abundant barcode(s) from max 1000 reads from middle of FastQ."
echo "FATAL: Failed to process FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
exit 1
fi
Expand Down

0 comments on commit a70575b

Please sign in to comment.