diff --git a/checkValidationNGS_DNA.sh b/checkValidationNGS_DNA.sh index c62d3db..d9877a0 100755 --- a/checkValidationNGS_DNA.sh +++ b/checkValidationNGS_DNA.sh @@ -47,7 +47,7 @@ then else scp calculon.hpc.rug.nl:${validationFolder}/* "${inputFolder}/validationVcfs/" fi - validationFolder=${inputFolder}/validationVcfs/ + validationFolder="${inputFolder}/validationVcfs/" fi for i in $(ls ${validationFolder}/*.vcf.gz) do @@ -57,17 +57,17 @@ do -T VariantEval \ -R /apps/data/1000G/phase1/human_g1k_v37_phiX.fasta \ -o "${outputFolder}/output.${name}.eval.grp" \ - --eval ${inputFolder}/*${name}*.${inputType} \ + --eval "${inputFolder}/"*"${name}"*".${inputType}" \ --comp "${i}" done echo "Sample Chr Pos Ref Alt Found?" -for i in $(ls ${validationFolder}/*.vcf.gz) +for i in $(ls "${validationFolder}/"*".vcf.gz") do name=$(basename "${i}" ".vcf.gz") referenceCall=$(grep "0/0" "${i}" | wc -l) - referenceCallMale=$(grep -P "\t0:" ${i} | wc -l) + referenceCallMale=$(grep -P "\t0:" "${i}" | wc -l) check=$(awk '{if (NR==5){if ($11 == "100.00"){print "correct"}}}' "${outputFolder}/output.${name}.eval.grp") diff --git a/countCoverage.sh b/countCoverage.sh index d63f048..c83264e 100644 --- a/countCoverage.sh +++ b/countCoverage.sh @@ -94,15 +94,15 @@ if [[ -z "${PRMDIR-}" ]]; then PRMDIR="/groups/umcg-gd/prm0*/projects/" fi if [[ -z "${TMP-}" ]]; then - TMP=${WORKDIR}/tmp/ - mkdir -p ${TMP} + TMP="${WORKDIR}/tmp/" + mkdir -p "${TMP}" echo "makedir ${TMP}" fi echo "starting, first cleaning up old run in ${WORKDIR}" -rm -rf ${WORKDIR} -mkdir -p ${WORKDIR}/coverage/ -mkdir -p ${WORKDIR}/tmp/ +rm -rf "${WORKDIR}" +mkdir -p "${WORKDIR}/coverage/" +mkdir -p "${WORKDIR}/tmp/" echo "starting to extract all the ${PANEL} projects and retreive the coverage for each sample" echo "ls ${PRMDIR}/*${PANEL}/${STRUCTURE}/*${PANEL}*.coveragePerTarget.txt" @@ -116,29 +116,29 @@ then do sampleName="$(basename "${i%%.*}")" SAMPLES+=("${sampleName}") - if [ $count == 0 ] + if [ "${count}" == 0 ] then - awk '{print $2,$3,$4,$6}' $i > ${TMP}/firstcolumns.txt + awk '{print $2,$3,$4,$6}' "${i}" > "${TMP}/firstcolumns.txt" count=$((count+1)) fi - NAMEFILE=$(basename $i) - DIRNAME=$(dirname $i) + NAMEFILE=$(basename "${i}") + DIRNAME=$(dirname "${i}") SAMPLENAME=${NAMEFILE%%.*} #awk '{sum+=$5}END {print sum/210414}' $i > $DIRNAME/$SAMPLENAME.averageCoverage.txt - awk '{print $5}' $i > ${WORKDIR}/coverage/${SAMPLENAME}.coverage - totalcount=$(($(cat ${WORKDIR}/coverage/${SAMPLENAME}.coverage | wc -l)-1)) + awk '{print $5}' "${i}" > "${WORKDIR}/coverage/${SAMPLENAME}.coverage" + totalcount=$(($(cat "${WORKDIR}/coverage/${SAMPLENAME}.coverage" | wc -l)-1)) count=0 - count=$(awk 'BEGIN{sum=0}{if($1 < 20){sum++}} END {print sum}' ${WORKDIR}/coverage/${SAMPLENAME}.coverage) + count=$(awk 'BEGIN{sum=0}{if($1 < 20){sum++}} END {print sum}' "${WORKDIR}/coverage/${SAMPLENAME}.coverage") - if [ $count == 0 ] + if [ "${count}" == 0 ] then percentage=0 else percentage=$(echo $((count*100/totalcount))) if [ ${percentage%%.*} -gt 10 ] then - echo "${SAMPLENAME}: percentage $percentage ($count/$totalcount) is more than 10 procent, skipped" + echo "${SAMPLENAME}: percentage $percentage (${count}/${totalcount}) is more than 10 procent, skipped" REJECTEDSAMPLES+=("${SAMPLENAME}") continue @@ -155,18 +155,18 @@ then for i in "${REJECTEDSAMPLES[@]}" do echo "removed ${WORKDIR}/coverage/${i}.coverage" - echo "removed ${WORKDIR}/coverage/${i}.coverage" > ${WORKDIR}/rejectedSamples.txt - rm -f ${WORKDIR}/coverage/${i}.coverage + echo "removed ${WORKDIR}/coverage/${i}.coverage" > "${WORKDIR}/rejectedSamples.txt" + rm -f "${WORKDIR}/coverage/${i}.coverage" done fi -echo ${SAMPLES[@]} > ${TMP}/headers.txt +echo ${SAMPLES[@]} > ${TMP}/headers.txt" echo "${WORKDIR}/coverage/" -total=$(ls ${WORKDIR}/coverage/*.coverage | wc -l) +total=$(ls "${WORKDIR}/coverage/"*".coverage" | wc -l) -paste ${WORKDIR}/coverage/*.coverage > ${TMP}/coverageAllSamples.txt +paste "${WORKDIR}/coverage/"*".coverage" > "${TMP}/coverageAllSamples.txt" echo "## calculate MEDIAN ##" @@ -185,39 +185,39 @@ awk -v max=1 ' } END { print median(count,values); -}' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_Median.txt +}' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_Median.txt" echo "## calculate SD ##" ## calculate SD -awk '{ A=0; V=0; for(N=1; N<=NF; N++) A+=$N ; A/=NF ; for(N=1; N<=NF; N++) V+=(($N-A)*($N-A))/(NF-1); print sqrt(V) }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_SD.txt +awk '{ A=0; V=0; for(N=1; N<=NF; N++) A+=$N ; A/=NF ; for(N=1; N<=NF; N++) V+=(($N-A)*($N-A))/(NF-1); print sqrt(V) }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_SD.txt" echo "## calculate AVG ##" ## CALCULATE AVG -awk '{ for(i = 1; i <= NF; i++) sum+=$i;print sum/NF;sum=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_AVG.txt +awk '{ for(i = 1; i <= NF; i++) sum+=$i;print sum/NF;sum=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_AVG.txt" echo "## Calculate percentage under 10,20,30,50 and 100x ##" ## Calculate percentage under 10,20,50 and 100x ## -awk '{ for(i = 1; i <= NF; i++) if ($i < 10 )counter+=1;print 100-((counter/NF))*100;counter=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_moreThan10x.txt -awk '{ for(i = 1; i <= NF; i++) if ($i < 20 )counter+=1;print 100-((counter/NF))*100;counter=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_moreThan20x.txt -awk '{ for(i = 1; i <= NF; i++) if ($i < 30 )counter+=1;print 100-((counter/NF))*100;counter=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_moreThan30x.txt -awk '{ for(i = 1; i <= NF; i++) if ($i < 50 )counter+=1;print 100-((counter/NF))*100;counter=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_moreThan50x.txt -awk '{ for(i = 1; i <= NF; i++) if ($i < 100 )counter+=1;print 100-((counter/NF))*100;counter=0 }' ${TMP}/coverageAllSamples.txt > ${TMP}/coverageAllSamples_moreThan100x.txt +awk '{ for(i = 1; i <= NF; i++) if ($i < 10 )counter+=1;print 100-((counter/NF))*100;counter=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_moreThan10x.txt" +awk '{ for(i = 1; i <= NF; i++) if ($i < 20 )counter+=1;print 100-((counter/NF))*100;counter=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_moreThan20x.txt" +awk '{ for(i = 1; i <= NF; i++) if ($i < 30 )counter+=1;print 100-((counter/NF))*100;counter=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_moreThan30x.txt" +awk '{ for(i = 1; i <= NF; i++) if ($i < 50 )counter+=1;print 100-((counter/NF))*100;counter=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_moreThan50x.txt" +awk '{ for(i = 1; i <= NF; i++) if ($i < 100 )counter+=1;print 100-((counter/NF))*100;counter=0 }' "${TMP}/coverageAllSamples.txt" > "${TMP}/coverageAllSamples_moreThan100x.txt" -awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4}' /apps/data/Agilent/${PANEL}/human_g1k_v37/captured.merged.bed > ${TMP}/first4columns.txt +awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4}' "/apps/data/Agilent/${PANEL}/human_g1k_v37/captured.merged.bed" > "${TMP}/first4columns.txt" -echo "## update column gene with only one annotation possible" +echo "## update column gene with only one annotation possible" ## update column gene with only one annotation possible echo "pasting median,avg,10x,20x,30x,50x and 100x" -rm -f ${WORKDIR}/CoverageOverview.txt +rm -f "${WORKDIR}/CoverageOverview.txt" firstPartOfLink="https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=" secondPartOfLink="&hgsid=653811211_bIwQegXO9Zbd8eoOt7J1cdi7D9zi" -paste -d '\t' ${TMP}/first4columns.txt ${TMP}/coverageAllSamples_Median.txt ${TMP}/coverageAllSamples_AVG.txt ${TMP}/coverageAllSamples_SD.txt ${TMP}/coverageAllSamples_moreThan10x.txt ${TMP}/coverageAllSamples_moreThan20x.txt ${TMP}/coverageAllSamples_moreThan20x.txt ${TMP}/coverageAllSamples_moreThan50x.txt ${TMP}/coverageAllSamples_moreThan100x.txt > ${TMP}/pasteAllInfoTogether.txt -echo -e "Chr\tStart\tStop\tGene\tMedian\tAvgCoverage\tSD\tmoreThan10x\tmoreThan20x\tmoreThan30x\tmoreThan50x\tmoreThan100x\tgenomeBrowse" > ${WORKDIR}/CoverageOverview.txt -tail -n+2 ${TMP}/pasteAllInfoTogether.txt >> ${WORKDIR}/CoverageOverview.txt -head -n -1 ${WORKDIR}/CoverageOverview.txt > ${TMP}/CoverageOverview.txt.tmp +paste -d '\t' "${TMP}/first4columns.txt" "${TMP}/coverageAllSamples_Median.txt" "${TMP}/coverageAllSamples_AVG.txt" "${TMP}/coverageAllSamples_SD.txt" "${TMP}/coverageAllSamples_moreThan10x.txt" "${TMP}/coverageAllSamples_moreThan20x.txt ${TMP}/coverageAllSamples_moreThan20x.txt ${TMP}/coverageAllSamples_moreThan50x.txt ${TMP}/coverageAllSamples_moreThan100x.txt > ${TMP}/pasteAllInfoTogether.txt" +echo -e "Chr\tStart\tStop\tGene\tMedian\tAvgCoverage\tSD\tmoreThan10x\tmoreThan20x\tmoreThan30x\tmoreThan50x\tmoreThan100x\tgenomeBrowse" > "${WORKDIR}/CoverageOverview.txt" +tail -n+2 "${TMP}/pasteAllInfoTogether.txt" >> "${WORKDIR}/CoverageOverview.txt" +head -n -1 "${WORKDIR}/CoverageOverview.txt" > "${TMP}/CoverageOverview.txt.tmp" awk -v link1="${firstPartOfLink}" -v link2="${secondPartOfLink}" '{OFS="\t"}{OFMT="%.2f"; if (NR==1){print $0}else{print $1,$2,$3,$4,$5/1,$6/1,$7/1,$8/1,$9/1,$10/1,$11/1,$12/1,link1""$1"%3A"$2""link2}}' ${TMP}/CoverageOverview.txt.tmp > ${WORKDIR}/CoverageOverview_basedOn_${total}_Samples.txt echo "DONE, final file is ${WORKDIR}/CoverageOverview_basedOn_${total}_Samples.txt" @@ -236,14 +236,14 @@ ml ngs-utils mkdir -p "${TMP}/perGene/" echo "writing all the targets to the genes file, this will take some time" awk -v tmpDirectory="${TMP}/perGene/" '{if (NR>1){print $0 > tmpDirectory$4".txt"}}' "${WORKDIR}/CoverageOverview_basedOn_${total}_SamplesFinal.txt" -echo "genomeBrowser" > ${TMP}/perGene/allGenes.startStop +echo "genomeBrowser > ${TMP}/perGene/allGenes.startStop" for i in $(ls ${TMP}/perGene/*.txt) do geneName="$(basename "${i%.txt}")" echo "working on $geneName" - head -1 $i | awk -v g="${geneName}" -v link="${firstPartOfLink}" '{print link""$1"%3A"$2}' > ${i}.start - tail -1 $i | awk -v link="${secondPartOfLink}" '{print $3""link}' > ${i}.stop - paste -d"-" ${i}.start ${i}.stop >> ${TMP}/perGene/allGenes.startStop + head -1 "${i}" | awk -v g="${geneName}" -v link="${firstPartOfLink}" '{print link""$1"%3A"$2}' > "${i}.start" + tail -1 "${i}" | awk -v link="${secondPartOfLink}" '{print $3""link}' > "${i}.stop" + paste -d"-" "${i}.start" "${i}.stop" >> "${TMP}/perGene/allGenes.startStop" done echo "python ~/github/ngs-utils/countCoveragePerGene.py ${WORKDIR}/CoverageOverview_basedOn_${total}_SamplesFinal.txt > ${WORKDIR}/CoverageOverview_PerGene.txt" @@ -253,6 +253,6 @@ awk '{OFS="\t"}{OFMT="%.2f"; print $1,$2/1,$3,$4/1,$5/1,$6/1,$7/1,$8/1,$9/1,$10/ echo -e "Gene\tAvgCoverage\tNo_of_Targets\tMedian\tSD\tmoreThan10x\tmoreThan20x\tmoreThan30x\tmoreThan50x\tmoreThan100x" > ${TMP}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt.tmp -sort -V -k1 ${WORKDIR}/CoverageOverview_PerGene.txt.tmp >> ${TMP}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt.tmp -paste -d"\t" ${TMP}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt.tmp ${TMP}/perGene/allGenes.startStop > ${WORKDIR}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt +sort -V -k1 "${WORKDIR}/CoverageOverview_PerGene.txt.tmp" >> "${TMP}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt.tmp" +paste -d"\t" "${TMP}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt.tmp" "${TMP}/perGene/allGenes.startStop" > "${WORKDIR}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt" echo "done, gene file can be found here: ${WORKDIR}/CoverageOverview_PerGene_basedOn_${total}_Samples.txt"