From e5450d08aabbe472f15b9338fde9ebd3964d8864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karl=20Nordstr=C3=B6m?= <karl.nordstrom@gmail.com>
Date: Fri, 8 Mar 2019 15:25:09 +0100
Subject: [PATCH] Improvements: - Better documentation - Made sure the AWK
 variable is correctly transferred to the subroutines

---
 README.md                            | 30 +++++++++++++++++++++++++++-
 gNOMeHMM.sh                          |  4 ++--
 subroutines/gNOMePeaks_splitOnChr.sh |  2 +-
 subroutines/preProcessing.sh         |  7 ++++---
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 7b8bb6d..56ca9d1 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Peak calling in whole-ganome NOMe data
   * snow
   * parallel
 * java
-* awk or mawk
+* gawk or mawk (macOS comes with nawk ==> use brew to install gawk)
 
 ### create jar-file from source
 
@@ -33,6 +33,34 @@ gNOMePeaks utilizes the R package HiddenMarkov for hidden Markov models. This pa
 R CMD INSTALL ...PATH.../gNOMePeaks/thirdParty/HiddenMarkov.mod
 ```
 
+## Input format
+
+gNOMePeaks works with methylation calls in Bis-SNP format. The first line of the this format contains track information and will be ignored. In essence it is enough to provide a tab-separated file with six columns, and an arbitrary or empty first line:
+
+- chr, chromosome
+- start, start of Cytosine in GC context
+- end, end of Cytosine in GC context (start+1)
+- methylation ratio, double value scaled from 0 to 100
+- coverage, number of reads covering the position
+- strand, +/-
+
+Example:
+
+```
+track name=sampleFile type=bedDetail description="GCH methylation level" visibility=3
+1       10481   10482   0.00    1       -
+1       10482   10483   0.00    4       +
+1       10485   10486   0.00    1       -
+1       10486   10487   0.00    4       +
+1       10490   10491   0.00    6       +
+1       10494   10495   0.00    5       +
+1       10520   10521   0.00    5       -
+1       10521   10522   12.50   8       +
+1       10526   10527   0.00    8       +
+1       10551   10552   0.00    4       -
+1       10552   10553   12.50   8       +
+```
+
 ## Output format
 
 The gNOMePeaks output table contains the following fields:
diff --git a/gNOMeHMM.sh b/gNOMeHMM.sh
index 8906f38..6224889 100755
--- a/gNOMeHMM.sh
+++ b/gNOMeHMM.sh
@@ -4,7 +4,7 @@ name="gNOMeHMM.sh"
 
 # Local configuration
 #AWK=mawk
-AWK=awk
+export AWK=gawk
 Rscript=Rscript
 java=java
 
@@ -186,7 +186,7 @@ postDecodeCol=7
 # pre-processing = putative merge of GcPs and coverage filter
 # prepare pre-processing for parallelization
 for raw in $TMPWD/sep/raw*bed; do
-  echo "bash $SCRIPTFOLDER/subroutines/preProcessing.sh $raw $mergeGpC $covCutoff $TMPWD `dirname $raw`/`basename $raw |sed s/raw_//`"
+  echo "bash $SCRIPTFOLDER/subroutines/preProcessing.sh $raw $mergeGpC $covCutoff $TMPWD `dirname $raw`/`basename $raw |sed s/raw_//` $AWK"
 done > $TMPWD/job20.preProcessing.txt
 
 
diff --git a/subroutines/gNOMePeaks_splitOnChr.sh b/subroutines/gNOMePeaks_splitOnChr.sh
index 8148d7a..740eb00 100644
--- a/subroutines/gNOMePeaks_splitOnChr.sh
+++ b/subroutines/gNOMePeaks_splitOnChr.sh
@@ -11,7 +11,7 @@ then
  zcat $LOCALINPUTFILE
 else
  cat $LOCALINPUTFILE
-fi | awk -vWITHHEADER=$WITHHEADER -vFOLDER=$LOCALOUTPUTPREFIX '
+fi | ${AWK:-gawk} -vWITHHEADER=$WITHHEADER -vFOLDER=$LOCALOUTPUTPREFIX '
 NR==1 && WITHHEADER==1 {
  header=$0
  print header > FOLDER"rest.bed"
diff --git a/subroutines/preProcessing.sh b/subroutines/preProcessing.sh
index 8ab558b..0ee48ea 100644
--- a/subroutines/preProcessing.sh
+++ b/subroutines/preProcessing.sh
@@ -10,16 +10,17 @@ mergeGpC=$2
 covCutoff=$3
 TMPWD=$4
 output=$5
+AWK=$6
 
 mkdir -p $TMPWD/tmp
 
 if [[ "$mergeGpC" == "T" ]]
 then
- join -1 1 -2 1 -a 1 -a 2 <(awk '$6=="+" {print $1"_"($2-1),$6,$1,($2-1),$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) <(awk '$6=="-" {print $1"_"$2,$6,$1,$2,$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) | 
- awk -vOFS='\t' 'NF==11 {print $3,$4,$4+2,($5*$6+$10*$11)/($6+$11),$6+$11; next} {print $3,$4,$4+2,$5,$6}' -
+ join -1 1 -2 1 -a 1 -a 2 <(${AWK:-gawk} '$6=="+" {print $1"_"($2-1),$6,$1,($2-1),$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) <(${AWK:-gawk} '$6=="-" {print $1"_"$2,$6,$1,$2,$4,$5}' $inputFile |sort -T $TMPWD/tmp -k1b,1) | 
+ ${AWK:-gawk} -vOFS='\t' 'NF==11 {print $3,$4,$4+2,($5*$6+$10*$11)/($6+$11),$6+$11; next} {print $3,$4,$4+2,$5,$6}' -
 else
  cut -f 1-5 $inputFile
 fi |
-awk -vcovCutoff=$covCutoff '$5>covCutoff' - |
+${AWK:-gawk} -vcovCutoff=$covCutoff '$5>covCutoff' - |
 sort -T $TMPWD/tmp -k1,1 -k2,2g -k3,3g - > $output