improved sensitivity on interval stats
Katarzyna Sikora committed May 9, 2018
1 parent ef7dab9 commit 7839b64
Showing 12 changed files with 205 additions and 11 deletions.
Binary file modified BSmapWGBS.pyc
Binary file modified BSmethXT_WGBS.pyc
BSmetricsWGBS.py: 27 changes (26 additions, 1 deletion)
@@ -96,6 +96,31 @@ def BS_doc(INfile,OUTfileList,refG,auxdir,GATKpath,metDir,my_session,logobject):
             logobject.info('Depth of coverage calculation complete')
     return
 
+
+def BS_downsample_reads(numOut,nthreads,R1in,R1out,R2in,R2out,pipev,outdir,my_session,logobject):
+    read_root=os.path.basename(R1in)
+    cmd='/data/manke/repository/scripts/DNA_methylation/WGBS_pipe/'+ pipev + '/downsample_se_pe.sh ' + str(numOut) + ' ' + str(nthreads) + ' ' + R1in + ' ' + R1out + ' ' + R2in + ' ' + R2out
+    logobject.info(cmd)
+    with open(os.path.join(outdir,"logs","%s.downsample_reads.out.log" % read_root),'w') as stdoutF, open(os.path.join(outdir,"logs","%s.downsample_reads.err.log" % read_root),'w') as stderrF:
+        try:
+            stdout_res, stderr_res = run_job(cmd_str = cmd,
+                                             job_name = 'r_dwnsmpl',
+                                             logger = logobject,
+                                             drmaa_session = my_session,
+                                             run_locally = False,
+                                             working_directory = os.getcwd(),
+                                             job_other_options = '-p bioinfo ')
+            stdoutF.write("".join(stdout_res))
+            stderrF.write("".join(stderr_res))
+
+        # relay all the stdout, stderr, drmaa output to diagnose failures
+        except Exception as err:
+            logobject.error("Read downsampling error: %s" % err)
+            raise
+        else:
+            logobject.info('Read downsampling complete')
+    return
+
 def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
     read_root=os.path.basename(ii1sub)
     CR_cmd='/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh '+ ii1sub + ' ' + oo
@@ -108,7 +133,7 @@ def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo')
+                                             job_other_options = '-p bioinfo --mem-per-cpu=6000')
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
 
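For context, every wrapper in this module submits its shell command through ruffus's DRMAA helper and mirrors the job's stdout/stderr into per-sample log files. A minimal, self-contained sketch of that pattern, assuming ruffus and the drmaa bindings are installed and a DRMAA-capable scheduler is reachable; the command string and partition name are placeholders, not pipeline values:

```python
import os
import logging

import drmaa                                   # DRMAA bindings for the cluster scheduler
from ruffus.drmaa_wrapper import run_job, error_drmaa_job

logger = logging.getLogger("wgbs_pipe")
session = drmaa.Session()
session.initialize()

# placeholder command; the pipeline assembles this from downsample_se_pe.sh
cmd = "downsample_se_pe.sh 5000000 8 in_R1.fastq.gz out_R1.fastq.gz in_R2.fastq.gz out_R2.fastq.gz"

try:
    # submit to the cluster and block until the job finishes;
    # the job's stdout/stderr come back as lists of lines
    stdout_res, stderr_res = run_job(cmd_str=cmd,
                                     job_name="r_dwnsmpl",
                                     logger=logger,
                                     drmaa_session=session,
                                     run_locally=False,
                                     working_directory=os.getcwd(),
                                     job_other_options="-p bioinfo")
except error_drmaa_job as err:
    logger.error("Read downsampling error: %s" % err)
    raise
finally:
    session.exit()
```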
Binary file modified BSmetricsWGBS.pyc
BSmetricsWGBS.py~: 29 changes (27 additions, 2 deletions)
@@ -96,6 +96,31 @@ def BS_doc(INfile,OUTfileList,refG,auxdir,GATKpath,metDir,my_session,logobject):
             logobject.info('Depth of coverage calculation complete')
     return
 
+
+def BS_downsample_reads(numOut,nthreads,R1in,R1out,R2in,R2out,pipev,outdir,my_session,logobject):
+    read_root=os.path.basename(R1in)
+    cmd='/data/manke/repository/scripts/DNA_methylation/WGBS_pipe/'+ pipev + '/downsample_se_pe.sh ' + str(numOut) + ' ' + str(nthreads) + ' ' + R1in + ' ' + R1out + ' ' + R2in + ' ' + R2out
+    logobject.info(cmd)
+    with open(os.path.join(outdir,"logs","%s.downsample_reads.out.log" % read_root),'w') as stdoutF, open(os.path.join(outdir,"logs","%s.downsample_reads.err.log" % read_root),'w') as stderrF:
+        try:
+            stdout_res, stderr_res = run_job(cmd_str = cmd,
+                                             job_name = 'r_dwnsmpl',
+                                             logger = logobject,
+                                             drmaa_session = my_session,
+                                             run_locally = False,
+                                             working_directory = os.getcwd(),
+                                             job_other_options = '-p bioinfo ')
+            stdoutF.write("".join(stdout_res))
+            stderrF.write("".join(stderr_res))
+
+        # relay all the stdout, stderr, drmaa output to diagnose failures
+        except Exception as err:
+            logobject.error("Read downsampling error: %s" % err)
+            raise
+        else:
+            logobject.info('Read downsampling complete')
+    return
+
 def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
     read_root=os.path.basename(ii1sub)
     CR_cmd='/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh '+ ii1sub + ' ' + oo
@@ -108,7 +133,7 @@ def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo')
+                                             job_other_options = '-p bioinfo --mem-per-cpu=30000')
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
 
@@ -132,7 +157,7 @@ def BS_Mbias(INfile,OUTfile,POMpath,refG,metDir,nthreads,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo --mem-per-cpu=3000 --nodes 1=1 --mincpus='+str(nthreads))
+                                             job_other_options = '-p bioinfo --mem-per-cpu=3000 --nodes=1=1 --mincpus='+str(nthreads))
 
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
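The memory bumps above (6000 MB for the conversion-rate job in the live file, 30000 MB in this backup copy, plus node constraints for the M-bias job) are plain Slurm flags forwarded untouched through `job_other_options`. As a reading aid only, here is what each flag asks the scheduler for, per standard Slurm semantics; the values are illustrative and `--nodes` is written here as `--nodes=1`:

```python
nthreads = 4  # illustration only

# Slurm options are forwarded verbatim by run_job to the scheduler:
job_other_options = (
    "-p bioinfo "                  # submit to the 'bioinfo' partition
    "--mem-per-cpu=3000 "          # request 3000 MB of RAM per allocated CPU
    "--nodes=1 "                   # keep the whole job on a single node
    "--mincpus=" + str(nthreads)   # require at least nthreads CPUs on that node
)
```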
Binary file modified BSprecut.pyc
Binary file modified BSstats_WGBS.pyc
WGBS_pipe_lite.py: 28 changes (24 additions, 4 deletions)
@@ -1,4 +1,4 @@
-__version__="v1.2.0"
+__version__="v1.2.1"
 
 
 import os
@@ -319,6 +319,7 @@ def PCRdup_rm(input_file,output_file):
 #RUN VARIOUS COVERAGE AND QUALITY METRICS#######################################################################
 metout=os.path.join(wdir,'QC_metrics')
 auxdir=os.path.join(wdir,'aux_files')
+reads_downsampled=os.path.join(wdir,'downsampled_reads')
 
 #prepare bed file with random CpGs
 @mkdir(auxdir,os.path.join(auxdir,'logs'))
@@ -369,19 +370,38 @@ def depth_of_cov(input_file,output_files):
 
 #conversion rate: currently from fastq, implement phiX control!
 if args.trimReads :
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(trim_reads,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
+
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(trim_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
         BS_conv_rate(ii1sub,oo,metout,mySession,logger)
 else:
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(IN_files,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(IN_files,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
-        ii1=input_files[0][0]
+        ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
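In both branches the conversion-rate task is rewired to consume the output of the new downsampling task, so the conversion rate is now estimated from a fixed 5M-read subsample rather than the full library. A toy, self-contained sketch of that ruffus chaining, where the file names and task bodies are placeholders and not pipeline code:

```python
from ruffus import transform, suffix, mkdir, pipeline_run

# toy inputs so the pipeline can actually run
for f in ("sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"):
    open(f, "w").close()

@mkdir("downsampled_reads")
@transform(["sampleA_R1.fastq.gz"], suffix("_R1.fastq.gz"),
           ["_R1.fastq.gz", "_R2.fastq.gz"], output_dir="downsampled_reads")
def dsmpl_reads(input_files, output_files):
    # placeholder for downsample_se_pe.sh: emit both downsampled mates
    for oo in output_files:
        open(oo, "w").close()

# chaining on dsmpl_reads (instead of the trimming task) is the whole change:
# conv_rate now receives the downsampled pair as its input
@transform(dsmpl_reads, suffix("_R1.fastq.gz"), ".conv.rate.txt")
def conv_rate(input_files, output_file):
    ii1 = input_files[0]  # R1 of the downsampled pair
    open(output_file, "w").close()

pipeline_run([conv_rate])
```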
WGBS_pipe_lite.py~: 28 changes (24 additions, 4 deletions)
@@ -1,4 +1,4 @@
-__version__="v1.2.0"
+__version__="v1.2.1"
 
 
 import os
@@ -319,6 +319,7 @@ def PCRdup_rm(input_file,output_file):
 #RUN VARIOUS COVERAGE AND QUALITY METRICS#######################################################################
 metout=os.path.join(wdir,'QC_metrics')
 auxdir=os.path.join(wdir,'aux_files')
+reads_downsampled=os.path.join(wdir,'downsampled_reads')
 
 #prepare bed file with random CpGs
 @mkdir(auxdir,os.path.join(auxdir,'logs'))
@@ -369,17 +370,36 @@ else:
 
 #conversion rate: currently from fastq, implement phiX control!
 if args.trimReads :
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(trim_reads,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
+
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(trim_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
         BS_conv_rate(ii1sub,oo,metout,mySession,logger)
 else:
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(IN_files,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(IN_files,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0][0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
@@ -470,7 +490,7 @@ if ( args.intList and args.sampleInfo ):
         oo = output_files
         auxList=[os.path.join(auxdir,re.sub('.fa',re.sub('.bed','.CpGlist.bed',os.path.basename(x)),os.path.basename(refG))) for x in args.intList]
         from BSstats_WGBS import int_stats_limma
-        int_stats_limma(ii,args.intList,auxList,args.sampleInfo,intStat_out,Rpath,pipev,mySession,logger)
+        int_stats_limma(ii,args.intList,auxList,args.sampleInfo,intStat_out,Rpath,Rlib,pipev,mySession,logger)
 #####################################################################################################
 
 ####RUN DMR calling ################################################################################
WGBSpipe.interval_stats.limma.R: 4 changes (4 additions, 0 deletions)
@@ -53,6 +53,10 @@ auxmtch<-as.data.frame(findOverlaps(query=auxGR, subject=bedGR))
 
 limdat.LG.inCGI<-auxdat[auxmtch$queryHits,] #for compatibility with previous method; not really necessary as auxdat comes from a pre-intersected bed file
 limdat.LG.inCGI$IntID<-names(ranges(bedGR))[auxmtch$subjectHits] ##needed for grouping per interval
+##
+noNA2<-apply(limdat.LG.inCGI[,!colnames(limdat.LG.inCGI) %in% c("Name","NAf","IntID"),with=FALSE],1,function(X)sum(is.na(X)))
+NAf2<-ifelse(noNA2>1,1,0)
+limdat.LG.inCGI$NAf<-NAf2
 ########################### END UPDATED CODE
 NA.inCGI<-with(limdat.LG.inCGI,ave(NAf,factor(IntID),FUN=sum))
 limdat.LG.inCGI$NA.inCGI<-NA.inCGI
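These added lines carry the sensitivity improvement this commit is named for: instead of inheriting the NA flag computed upstream, each CpG row is re-flagged after the interval intersection, and a row counts against its interval only when more than one sample value is missing. For illustration only, the same filter expressed in Python/pandas on hypothetical toy data; the pipeline itself does this in R on a data.table:

```python
import pandas as pd

# toy beta-value matrix: rows = CpGs, columns = samples, plus grouping metadata
limdat = pd.DataFrame({
    "Name":  ["cg1", "cg2", "cg3", "cg4"],
    "IntID": ["CGI_1", "CGI_1", "CGI_2", "CGI_2"],
    "s1":    [0.8, None, 0.5, None],
    "s2":    [0.7, None, None, 0.6],
})

meta_cols = ["Name", "NAf", "IntID"]
value_cols = [c for c in limdat.columns if c not in meta_cols]

# flag a CpG only once MORE THAN ONE sample value is missing (noNA2 > 1)
no_na = limdat[value_cols].isna().sum(axis=1)
limdat["NAf"] = (no_na > 1).astype(int)

# downstream step: count flagged CpGs per interval, like ave(NAf, IntID, FUN=sum)
na_in_cgi = limdat.groupby("IntID")["NAf"].transform("sum")
print(limdat.assign(NA_inCGI=na_in_cgi))
```

Here only cg2 (both samples missing) is flagged; cg3 and cg4, each missing a single sample, still contribute to their interval.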
WGBSpipe.metilene_stats.limma.R: 5 changes (5 additions, 0 deletions)
@@ -50,6 +50,11 @@ auxmtch<-as.data.frame(findOverlaps(query=auxGR, subject=bedGR))
 
 limdat.LG.inCGI<-auxdat[auxmtch$queryHits,] #for compatibility with previous method; not really necessary as auxdat comes from a pre-intersected bed file
 limdat.LG.inCGI$IntID<-names(ranges(bedGR))[auxmtch$subjectHits] ##needed for grouping per interval
+##
+noNA2<-apply(limdat.LG.inCGI[,!colnames(limdat.LG.inCGI) %in% c("Name","NAf","IntID"),with=FALSE],1,function(X)sum(is.na(X)))
+NAf2<-ifelse(noNA2>1,1,0)
+limdat.LG.inCGI$NAf<-NAf2
+
 ########################### END UPDATED CODE
 
 NA.inCGI<-with(limdat.LG.inCGI,ave(NAf,factor(IntID),FUN=sum))
downsample_se_pe.sh: 95 changes (95 additions, 0 deletions)
@@ -0,0 +1,95 @@
#!/usr/bin/bash

## Usage: bash downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz b.fastq.gz b.downsampled.fastq.gz

num_reads=$1
threads=$2
R1=$3
R1_out=$4
R2=$5
R2_out=$6

seed=12345

## check for paired or single end mode
paired=false
if [ -z "$R1" ] || [ -z "$R1_out" ]; then
    echo -e "\nPlease provide R1 input file and R1 output filename!\n"
    echo "usage: downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz (single end)"
    echo "or"
    echo -e "(paired end) downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz b.fastq.gz b.downsampled.fastq.gz\n"
    exit 1;
elif [ -n "$R2" ] && [ -n "$R2_out" ]; then
    paired=true
elif ( ( [ -z "$R2" ] && [ -n "$R2_out" ] ) || ( [ -n "$R2" ] && [ -z "$R2_out" ] ) ); then
    echo "Please provide for R2 either both input and output file or leave both empty!"
    exit 1;
fi

## check for pigz
tmp=$(which pigz)
if [ -z "$tmp" ]; then
    echo "Pigz needs to be installed!"
    exit 1;
fi

##############################################################################
## Usage: awk -v n=5 -v s=12345 -f downsample.awk <(seq 1 100)
## Reservoir downsampling in AWK!
## Fabian Kilpert, January 2017
read -d '' reservoir_sampling << 'EOF'
BEGIN {
    if (n=="") n = 1;
    if (s=="") srand()
    else srand(s)
}
# function generating numbers between 1 and n
function randint(n) { return int(1 + n * rand()) }
{
    # fill reservoir
    if (NR <= n) R[NR] = $0
    # replace reservoir elements with gradually decreasing probability
    else {
        j = randint(NR);
        if (j <= n) { R[j] = $0 };
    }
}
END {
    for (key in R) print R[key]
}
EOF
##############################################################################


## do the work
if [ "$paired" == true ]; then
    ## paired end
    echo -e "\nDownsampling fastq files to $num_reads reads in paired-end mode..." > /dev/stderr;
    paste <(pigz -p2 -dc ${R1} | sed '/^$/d' | paste -d "\t" - - - - ) <(pigz -p2 -dc ${R2} | sed '/^$/d' | paste -d "\t" - - - - ) \
        | awk -v n=${num_reads} -v s=$seed "$reservoir_sampling" \
        | tee >(cut -f1,2,3,4 | sed 's/\t/\n/g' | pigz -p $(($threads/2>2?$threads/2:2)) -9 > ${R1_out}) >(cut -f5,6,7,8 | sed 's/\t/\n/g' | pigz -p $(($threads/2>2?$threads/2:2)) -9 > ${R2_out}) \
        | cut -f1,5 | tr " " "\t" | awk '{if ($1!=$3) print $0,"Read names in paired-end fastq files are out of sync!"; }'
else
    ## single end
    echo -e "\nDownsampling fastq file to $num_reads reads in single-end mode..." > /dev/stderr
    pigz -p2 -dc ${R1} | sed '/^$/d' | paste -d "\t" - - - - | awk -v n=${num_reads} -v s=${seed} "$reservoir_sampling" | sed 's/\t/\n/g' | pigz -p $threads -9 > ${R1_out}
fi


################ END



#DIFF=$(diff <(pigz -dc ${2} | sed -n '1p;1~4p' | cut -d " " -f1) <(pigz -dc ${3} | sed -n '1p;1~4p' | cut -d " " -f1))
#if [ "$DIFF" != "" ]
#then
#    echo "Error! FASTQ sequence identifiers do NOT match!"
#    exit 1
#else
