From e5450d08aabbe472f15b9338fde9ebd3964d8864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Nordstr=C3=B6m?= Date: Fri, 8 Mar 2019 15:25:09 +0100 Subject: [PATCH] Improvements: - Better documentation - Made sure the AWK variable is correctly transferred to the subroutines --- README.md | 30 +++++++++++++++++++++++++++- gNOMeHMM.sh | 4 ++-- subroutines/gNOMePeaks_splitOnChr.sh | 2 +- subroutines/preProcessing.sh | 7 ++++--- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7b8bb6d..56ca9d1 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Peak calling in whole-ganome NOMe data * snow * parallel * java -* awk or mawk +* gawk or mawk (macOS comes with nawk ==> use brew to install gawk) ### create jar-file from source @@ -33,6 +33,34 @@ gNOMePeaks utilizes the R package HiddenMarkov for hidden Markov models. This pa R CMD INSTALL ...PATH.../gNOMePeaks/thirdParty/HiddenMarkov.mod ``` +## Input format + +gNOMePeaks works with methylation calls in Bis-SNP format. The first line of the this format contains track information and will be ignored. In essence it is enough to provide a tab-separated file with six columns, and an arbitrary or empty first line: + +- chr, chromosome +- start, start of Cytosine in GC context +- end, end of Cytosine in GC context (start+1) +- methylation ratio, double value scaled from 0 to 100 +- coverage, number of reads covering the position +- strand, +/- + +Example: + +``` +track name=sampleFile type=bedDetail description="GCH methylation level" visibility=3 +1 10481 10482 0.00 1 - +1 10482 10483 0.00 4 + +1 10485 10486 0.00 1 - +1 10486 10487 0.00 4 + +1 10490 10491 0.00 6 + +1 10494 10495 0.00 5 + +1 10520 10521 0.00 5 - +1 10521 10522 12.50 8 + +1 10526 10527 0.00 8 + +1 10551 10552 0.00 4 - +1 10552 10553 12.50 8 + +``` + ## Output format The gNOMePeaks output table contains the following fields: diff --git a/gNOMeHMM.sh b/gNOMeHMM.sh index 8906f38..6224889 100755 --- a/gNOMeHMM.sh +++ b/gNOMeHMM.sh @@ -4,7 +4,7 @@ name="gNOMeHMM.sh" # Local configuration #AWK=mawk -AWK=awk +export AWK=gawk Rscript=Rscript java=java @@ -186,7 +186,7 @@ postDecodeCol=7 # pre-processing = putative merge of GcPs and coverage filter # prepare pre-processing for parallelization for raw in $TMPWD/sep/raw*bed; do - echo "bash $SCRIPTFOLDER/subroutines/preProcessing.sh $raw $mergeGpC $covCutoff $TMPWD `dirname $raw`/`basename $raw |sed s/raw_//`" + echo "bash $SCRIPTFOLDER/subroutines/preProcessing.sh $raw $mergeGpC $covCutoff $TMPWD `dirname $raw`/`basename $raw |sed s/raw_//` $AWK" done > $TMPWD/job20.preProcessing.txt diff --git a/subroutines/gNOMePeaks_splitOnChr.sh b/subroutines/gNOMePeaks_splitOnChr.sh index 8148d7a..740eb00 100644 --- a/subroutines/gNOMePeaks_splitOnChr.sh +++ b/subroutines/gNOMePeaks_splitOnChr.sh @@ -11,7 +11,7 @@ then zcat $LOCALINPUTFILE else cat $LOCALINPUTFILE -fi | awk -vWITHHEADER=$WITHHEADER -vFOLDER=$LOCALOUTPUTPREFIX ' +fi | ${AWK:-gawk} -vWITHHEADER=$WITHHEADER -vFOLDER=$LOCALOUTPUTPREFIX ' NR==1 && WITHHEADER==1 { header=$0 print header > FOLDER"rest.bed" diff --git a/subroutines/preProcessing.sh b/subroutines/preProcessing.sh index 8ab558b..0ee48ea 100644 --- a/subroutines/preProcessing.sh +++ b/subroutines/preProcessing.sh @@ -10,16 +10,17 @@ mergeGpC=$2 covCutoff=$3 TMPWD=$4 output=$5 +AWK=$6 mkdir -p $TMPWD/tmp if [[ "$mergeGpC" == "T" ]] then - join -1 1 -2 1 -a 1 -a 2 <(awk '$6=="+" {print $1"_"($2-1),$6,$1,($2-1),$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) <(awk '$6=="-" {print $1"_"$2,$6,$1,$2,$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) | - awk -vOFS='\t' 'NF==11 {print $3,$4,$4+2,($5*$6+$10*$11)/($6+$11),$6+$11; next} {print $3,$4,$4+2,$5,$6}' - + join -1 1 -2 1 -a 1 -a 2 <(${AWK:-gawk} '$6=="+" {print $1"_"($2-1),$6,$1,($2-1),$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) <(${AWK:-gawk} '$6=="-" {print $1"_"$2,$6,$1,$2,$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) | + ${AWK:-gawk} -vOFS='\t' 'NF==11 {print $3,$4,$4+2,($5*$6+$10*$11)/($6+$11),$6+$11; next} {print $3,$4,$4+2,$5,$6}' - else cut -f 1-5 $inputFile fi | -awk -vcovCutoff=$covCutoff '$5>covCutoff' - | +${AWK:-gawk} -vcovCutoff=$covCutoff '$5>covCutoff' - | sort -T $TMPWD/tmp -k1,1 -k2,2g -k3,3g - > $output