ajusted comments from Pieter

molgenis · May 29, 2018 · a70575b · a70575b
2 parents 5c70a17 + 9079960
commit a70575b
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 44 deletions.
diff --git a/bamUtil_Coverage_Tool/MVL_Coverage_Tool.sh b/bamUtil_Coverage_Tool/MVL_Coverage_Tool.sh
@@ -13,12 +13,12 @@ ${bold}Arguments${normal}
 	example sh MVL_Coverage_Tool.sh -s /groups/umcg-gd/prm02/projects/QXTR_90-Exoom_v1/run01/results/alignment/ OPTIONS
 	Required:
 	-s|--sampledir		Directory of of bam files of the used samples
-	-t|--tmpdir         Give tmpfolder location for inbetween and final files
+	-t|--tmpdir 	        Give tmpfolder location for inbetween and final files
 	Optional:
-	-b|--bamstat		path to the ban stats (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam stats)
+	-b|--bamstat		path to the ban stats (your/directory/to/bamUtil-1.0.14/bin/bam stats)
 	-m|--mvl	        location of MVL_per_bp_unique.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt)
-	-x|--mvltabix       location of MVL_per_bp_unique_tabix.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt)
-	-q|--qcmvldir          location of MerdgeQCofMVL_file.py necessary for final txt generation (default:$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/)
+	-x|--mvltabix	        location of MVL_per_bp_unique_tabix.txt (default: $EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt)
+	-q|--qcmvldir           location of MerdgeQCofMVL_file.py necessary for final txt generation (default:$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/)
 "
 }
 
@@ -63,7 +63,7 @@ while true; do
             esac ;;
     -t|--tmpdir)
                 case "$2" in
-                *) TMPDIR=$2 ; shift 2 ;;
+                *) TmpDirBam=$2 ; shift 2 ;;
             esac ;; 
     -q|--qcmvl)
                 case "$2" in
@@ -75,8 +75,6 @@ while true; do
 done
 
 
-
-empty=""
 #
 #Check required options were provided.
 #
@@ -85,15 +83,16 @@ if [[ -z "${SAMPLEDIR-}" ]]; then
 	exit 1
 fi
 if [[ -z "${BAMSTAT-}" ]]; then
-	BAMSTAT="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam"
+	#BAMSTAT="your/directry/to/bamUtil_Coverage_Tool/bamUtil-1.0.14/bin/bam"
+	usage
 fi
 if [[ -z "${MVL-}" ]]; then
         MVL="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique.txt"
 fi
 if [[ -z "${MVLTABIX-}" ]]; then
         MVLTABIX="$EBROOTNGSMINUTILS/bamUtil_Coverage_Tool/MVL_per_bp_unique_tabix.txt"
 fi
-if [[ -z "${TMPDIR-}" ]]; then
+if [[ -z "${TmpDirBam=-}" ]]; then
 	usage
 	exit 1
 fi
@@ -104,30 +103,30 @@ echo "SAMPLEDIR: ${SAMPLEDIR}"
 echo "BAMSTAT: ${BAMSTAT}"
 echo "MVL: ${MVL}"
 echo "MVLTABIX: ${MVLTABIX}"
-echo "TMPDIR: ${TMPDIR}"
+echo "TMPDIR: ${TmpDirBam}"
 echo "QCMVLDIR: ${QCMVLDIR}"
 echo "starting"
 
 
-rm -rf ${TMPDIR}
-mkdir -p ${TMPDIR}/input/
-mkdir -p ${TMPDIR}/outputfilter/
-mkdir -p ${TMPDIR}/output_final/
-mkdir -p ${TMPDIR}/output_final_txt_files/
+rm -rf "${TmpDirBam}"
+mkdir -p "${TmpDirBam}"/input/
+mkdir -p "${TmpDirBam}"/outputfilter/
+mkdir -p "${TmpDirBam}"/output_final/
+mkdir -p "${TmpDirBam}"/output_final_txt_files/
 
 
 #runBamStatWithRegion3.sh run the BAM stat tool to generate QC report per position
 echo "start Bam stats"
 
 count=0
 
-for i in $(ls ${SAMPLEDIR}/*.bam)
+for i in $(ls "${SAMPLEDIR}"/*.bam)
 do
 	echo "processingBamstat ${i}"
-	${BAMSTAT} stats \
-	--in ${i} \
-	--cBaseQC ${TMPDIR}/input/outputBamUtil_${count}.txt \
-	--regionlist ${MVL} \
+	"${BAMSTAT}" stats \
+	--in "${i}" \
+	--cBaseQC "${TmpDirBam}"/input/outputBamUtil_${count}.txt \
+	--regionlist "${MVL}" \
 	--withinRegion \
 	--nophonehome
 
@@ -140,31 +139,31 @@ echo "finisched Bam stats"
 # test.sh, make QC report complete by adding also position with 0x coverage
 
 echo "start tabix 0x coverage"
-for i in $(ls ${TMPDIR}/input/output*.txt)
+for i in $(ls "${TmpDirBam}"/input/output*.txt)
 do 
-    b=$(basename ${i})
+    b=$(basename "${i}")
     bedfile=${b%.*}.bed.txt
-    awk 'BEGIN{OFS="\t"}{print $1,$2,($3-1),$4}' ${i} > ${TMPDIR}/${bedfile}
+    awk 'BEGIN{OFS="\t"}{print $1,$2,($3-1),$4}' ${i} > "${TmpDirBam}"/"${bedfile}"
 
-sed -i '1d' ${TMPDIR}/${bedfile}    
-bgzip -c ${TMPDIR}/${bedfile} > ${TMPDIR}/${bedfile}.gz
-    tabix -0 -p bed ${TMPDIR}/${bedfile}.gz
-rm -f ${TMPDIR}/outputfilter/${b%.*}_filter.txt    
+sed -i '1d' "${TmpDirBam}"/"${bedfile}"    
+bgzip -c "${TmpDirBam}"/"${bedfile}" > "${TmpDirBam}"/"${bedfile}".gz
+    tabix -0 -p bed "${TmpDirBam}"/"${bedfile}".gz
+rm -f "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt    
     while read line
     do 
-        if tabix -0 ${TMPDIR}/${bedfile}.gz ${line} | grep -q "^"
+        if tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line} | grep -q "^"
         then
-            tabix -0 ${TMPDIR}/${bedfile}.gz ${line}
+            tabix -0 "${TmpDirBam}"/"${bedfile}".gz ${line}
         else
             echo $line | awk -F":" '{print $1"\t"$2}' | awk -F"-" '{print $1"\t"$2"\t0"}' 
         fi
-    done < ${MVLTABIX} >> ${TMPDIR}/outputfilter/${b%.*}_filter.txt
+    done < "${MVLTABIX}" >> "${TmpDirBam}"/outputfilter/${b%.*}_filter.txt
 echo "${i} done tabix 0x coverage"
 done
 
 # test_chr_pos_cov.sh, make files usable for python script including 0x 
 echo "starting making final txt files per patient"
-for i in $(ls ${TMPDIR}/outputfilter/output*.txt)
+for i in $(ls ${TmpDirBam}/outputfilter/output*.txt)
 do
         b=$(basename ${i})
         filename=${b%.*}
@@ -176,31 +175,31 @@ do
                         else{
                                 print $1"\t"$2"\t"$4
                         }
-                }' $i > "${TMPDIR}/output_final/${filename}_final.txt" 
+                }' $i > "${TmpDirBam}/output_final/${filename}_final.txt" 
 echo "${i} done"
 done
 echo "finisched making final txt per patient"
 
 #add gene name to coverage file. 
 echo "add gene name"
-sort -V ${MVL} > ${MVL}.sorted
-awk '{print $4}' ${MVL} > ${MVL}.genes
+sort -V "${MVL}" > "${MVL}".sorted
+awk '{print $4}' "${MVL}" > "${MVL}".genes
 
-for i in $(ls ${TMPDIR}/output_final/outputBamUtil_*_filter_final.txt)
+for i in $(ls "${TmpDirBam}"/output_final/outputBamUtil_*_filter_final.txt)
 do
 
     sortedFile=${i%.*}.sorted.txt
     sort -V ${i} > ${sortedFile}
     a=$(diff <(awk '{print $2}' "${MVL}.sorted") <(awk '{print $2}' "${sortedFile}"))
     if [ "${a}" == "" ]
     then
-        paste -d"\t" $i ${MVL}.genes > ${i%.*}_gene.txt
+        paste -d"\t" $i "${MVL}".genes > ${i%.*}_gene.txt
     else 
         echo 'positions in MVL and BAMutil output do not match'
     fi
 echo "${i} done adding gene name"    
 done
 echo "start python script"
 
-python ${QCMVLDIR}/MerdgeQCofMVL_file.py --in ${TMPDIR}/output_final/ --ff ${TMPDIR}/output_final_txt_files/
+python "${QCMVLDIR}"/MerdgeQCofMVL_file.py --in "${TmpDirBam}"/output_final/ --ff "${TmpDirBam}"/output_final_txt_files/
 
diff --git a/renameFastQs.bash b/renameFastQs.bash
@@ -116,7 +116,7 @@ function _RenameFastQ() {
 	then
 		echo "DEBUG:    Found _firstReadID ............ = ${_firstReadID}"
 	fi
-	local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*XX):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
+	local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
 	if [[ "${_firstReadID}" =~ ${_regex} ]]
 	then
 		local _sequencer="${BASH_REMATCH[1]}"
@@ -126,8 +126,7 @@ function _RenameFastQ() {
 		local _sequenceReadOfPair="${BASH_REMATCH[5]}"
 		#
 		# Note: we do require the ID of the first read to contain a DNA barcode at the end,
-		# but we don't parse barcodes here. Due to sequencing errors the barcode may deviate slightly.
-		# Instead we parse the barcodes from the first reads and determine the most abundant one later on.
+		# but we don't parse barcodes here. See below for why...
 		#
 
 		#
@@ -150,11 +149,20 @@ function _RenameFastQ() {
 			echo "DEBUG:    Found _sequenceReadOfPair ..... = ${_sequenceReadOfPair}"
 		fi
 	else
-		echo "FATAL: Failed to required meta-data values from ID of first read of ${_fastqPath}"
+		echo "FATAL: Failed to parse required meta-data values from ID of first read of ${_fastqPath}"
 		exit 1
 	fi
 
-	local _mostAbundandBarcode=$(zcat "${_fastqPath}" | head -4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')
+	#
+	# Due to sequencing errors the barcode may deviate slightly, so looking only at the first one won't fly.
+	# In addition the start and end of a FastQ file tends to be enriched for sequencing errors / low quality.
+	# Therefore we:
+	#   A. use a combination of head and tail commands to skip the first 10,000 reads (40,000 lines)
+	#      and get data from in the middle of the FastQ file.
+	#      (assuming the yield was high enough; otherwise you get the last 1000 reads).
+	#   B. Next we parse the barcodes from the read ID lines, sort, count and determine the most abundant barcode.
+	#
+	local _mostAbundandBarcode=$(zcat "${_fastqPath}" | head -n 44000 | tail -n 4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')
 	local _barcodeRegex='^([ATCG][ATCG+]*)$'
 	if [[ "${_mostAbundandBarcode}" =~ ${_barcodeRegex} ]]
 	then
@@ -169,11 +177,11 @@ function _RenameFastQ() {
 		fi
 	elif [[ "${_mostAbundandBarcode}" =~ N ]]
 	then
-		echo "ERROR: Most abundant barcode(s) in max first 1000 reads contains Ns: ${_barcodes}"
+		echo "ERROR: Most abundant barcode(s) in max 1000 reads from middle of FastQ contains Ns: ${_barcodes}"
 		echo "ERROR: Skipping discarded FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
 		return
 	else
-		echo "ERROR: Failed to determine the most abundant barcode(s) from the max first 1000 reads."
+		echo "ERROR: Failed to determine the most abundant barcode(s) from max 1000 reads from middle of FastQ."
 		echo "FATAL: Failed to process FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
 		exit 1
 	fi