improved sensitivity on interval stats
Katarzyna Sikora committed May 9, 2018
1 parent ef7dab9 commit 7839b64
Showing 12 changed files with 205 additions and 11 deletions.
Binary file modified BSmapWGBS.pyc
Binary file modified BSmethXT_WGBS.pyc
BSmetricsWGBS.py: 27 changes (26 additions, 1 deletion)
@@ -96,6 +96,31 @@ def BS_doc(INfile,OUTfileList,refG,auxdir,GATKpath,metDir,my_session,logobject):
             logobject.info('Depth of coverage calculation complete')
     return
 
+
+def BS_downsample_reads(numOut,nthreads,R1in,R1out,R2in,R2out,pipev,outdir,my_session,logobject):
+    read_root=os.path.basename(R1in)
+    cmd='/data/manke/repository/scripts/DNA_methylation/WGBS_pipe/'+ pipev + '/downsample_se_pe.sh ' + str(numOut) + ' ' + str(nthreads) + ' ' + R1in + ' ' + R1out + ' ' + R2in + ' ' + R2out
+    logobject.info(cmd)
+    with open(os.path.join(outdir,"logs","%s.downsample_reads.out.log" % read_root),'w') as stdoutF, open(os.path.join(outdir,"logs","%s.downsample_reads.err.log" % read_root),'w') as stderrF:
+        try:
+            stdout_res, stderr_res = run_job(cmd_str = cmd,
+                                             job_name = 'r_dwnsmpl',
+                                             logger = logobject,
+                                             drmaa_session = my_session,
+                                             run_locally = False,
+                                             working_directory = os.getcwd(),
+                                             job_other_options = '-p bioinfo ')
+            stdoutF.write("".join(stdout_res))
+            stderrF.write("".join(stderr_res))
+
+        # relay all the stdout, stderr, drmaa output to diagnose failures
+        except Exception as err:
+            logobject.error("Read downsampling error: %s" % err)
+            raise
+        else:
+            logobject.info('Read downsampling complete')
+    return
+
 def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
     read_root=os.path.basename(ii1sub)
     CR_cmd='/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh '+ ii1sub + ' ' + oo
@@ -108,7 +133,7 @@ def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo')
+                                             job_other_options = '-p bioinfo --mem-per-cpu=6000')
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
 
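For context, every wrapper in this module submits its shell command through ruffus's DRMAA helper and mirrors the job's stdout/stderr into per-sample log files. A minimal, self-contained sketch of that pattern, assuming ruffus and the drmaa bindings are installed and a DRMAA-capable scheduler is reachable; the command string and partition name are placeholders, not pipeline values:

```python
import os
import logging

import drmaa                                   # DRMAA bindings for the cluster scheduler
from ruffus.drmaa_wrapper import run_job, error_drmaa_job

logger = logging.getLogger("wgbs_pipe")
session = drmaa.Session()
session.initialize()

# placeholder command; the pipeline assembles this from downsample_se_pe.sh
cmd = "downsample_se_pe.sh 5000000 8 in_R1.fastq.gz out_R1.fastq.gz in_R2.fastq.gz out_R2.fastq.gz"

try:
    # submit to the cluster and block until the job finishes;
    # the job's stdout/stderr come back as lists of lines
    stdout_res, stderr_res = run_job(cmd_str=cmd,
                                     job_name="r_dwnsmpl",
                                     logger=logger,
                                     drmaa_session=session,
                                     run_locally=False,
                                     working_directory=os.getcwd(),
                                     job_other_options="-p bioinfo")
except error_drmaa_job as err:
    logger.error("Read downsampling error: %s" % err)
    raise
finally:
    session.exit()
```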
Binary file modified BSmetricsWGBS.pyc
BSmetricsWGBS.py~: 29 changes (27 additions, 2 deletions)
@@ -96,6 +96,31 @@ def BS_doc(INfile,OUTfileList,refG,auxdir,GATKpath,metDir,my_session,logobject):
             logobject.info('Depth of coverage calculation complete')
     return
 
+
+def BS_downsample_reads(numOut,nthreads,R1in,R1out,R2in,R2out,pipev,outdir,my_session,logobject):
+    read_root=os.path.basename(R1in)
+    cmd='/data/manke/repository/scripts/DNA_methylation/WGBS_pipe/'+ pipev + '/downsample_se_pe.sh ' + str(numOut) + ' ' + str(nthreads) + ' ' + R1in + ' ' + R1out + ' ' + R2in + ' ' + R2out
+    logobject.info(cmd)
+    with open(os.path.join(outdir,"logs","%s.downsample_reads.out.log" % read_root),'w') as stdoutF, open(os.path.join(outdir,"logs","%s.downsample_reads.err.log" % read_root),'w') as stderrF:
+        try:
+            stdout_res, stderr_res = run_job(cmd_str = cmd,
+                                             job_name = 'r_dwnsmpl',
+                                             logger = logobject,
+                                             drmaa_session = my_session,
+                                             run_locally = False,
+                                             working_directory = os.getcwd(),
+                                             job_other_options = '-p bioinfo ')
+            stdoutF.write("".join(stdout_res))
+            stderrF.write("".join(stderr_res))
+
+        # relay all the stdout, stderr, drmaa output to diagnose failures
+        except Exception as err:
+            logobject.error("Read downsampling error: %s" % err)
+            raise
+        else:
+            logobject.info('Read downsampling complete')
+    return
+
 def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
     read_root=os.path.basename(ii1sub)
     CR_cmd='/data/manke/repository/scripts/DNA_methylation/DEEP_scripts/conversionRate_KS.sh '+ ii1sub + ' ' + oo
@@ -108,7 +133,7 @@ def BS_conv_rate(ii1sub,oo,metDir,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo')
+                                             job_other_options = '-p bioinfo --mem-per-cpu=30000')
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
 
@@ -132,7 +157,7 @@ def BS_Mbias(INfile,OUTfile,POMpath,refG,metDir,nthreads,my_session,logobject):
                                              drmaa_session = my_session,
                                              run_locally = False,
                                              working_directory = os.getcwd(),
-                                             job_other_options = '-p bioinfo --mem-per-cpu=3000 --nodes 1=1 --mincpus='+str(nthreads))
+                                             job_other_options = '-p bioinfo --mem-per-cpu=3000 --nodes=1=1 --mincpus='+str(nthreads))
 
             stdoutF.write("".join(stdout_res))
             stderrF.write("".join(stderr_res))
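The memory bumps above (6000 MB for the conversion-rate job in the live file, 30000 MB in this backup copy, plus node constraints for the M-bias job) are plain Slurm flags forwarded untouched through `job_other_options`. As a reading aid only, here is what each flag asks the scheduler for, per standard Slurm semantics; the values are illustrative and `--nodes` is written here as `--nodes=1`:

```python
nthreads = 4  # illustration only

# Slurm options are forwarded verbatim by run_job to the scheduler:
job_other_options = (
    "-p bioinfo "                  # submit to the 'bioinfo' partition
    "--mem-per-cpu=3000 "          # request 3000 MB of RAM per allocated CPU
    "--nodes=1 "                   # keep the whole job on a single node
    "--mincpus=" + str(nthreads)   # require at least nthreads CPUs on that node
)
```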
Binary file modified BSprecut.pyc
Binary file modified BSstats_WGBS.pyc
WGBS_pipe_lite.py: 28 changes (24 additions, 4 deletions)
@@ -1,4 +1,4 @@
-__version__="v1.2.0"
+__version__="v1.2.1"
 
 
 import os
@@ -319,6 +319,7 @@ def PCRdup_rm(input_file,output_file):
 #RUN VARIOUS COVERAGE AND QUALITY METRICS#######################################################################
 metout=os.path.join(wdir,'QC_metrics')
 auxdir=os.path.join(wdir,'aux_files')
+reads_downsampled=os.path.join(wdir,'downsampled_reads')
 
 #prepare bed file with random CpGs
 @mkdir(auxdir,os.path.join(auxdir,'logs'))
@@ -369,19 +370,38 @@ def depth_of_cov(input_file,output_files):
 
 #conversion rate: currently from fastq, implement phiX control!
 if args.trimReads :
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(trim_reads,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
+
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(trim_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
         BS_conv_rate(ii1sub,oo,metout,mySession,logger)
 else:
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(IN_files,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(IN_files,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
-        ii1=input_files[0][0]
+        ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
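In both branches the conversion-rate task is rewired to consume the output of the new downsampling task, so the conversion rate is now estimated from a fixed 5M-read subsample rather than the full library. A toy, self-contained sketch of that ruffus chaining, where the file names and task bodies are placeholders and not pipeline code:

```python
from ruffus import transform, suffix, mkdir, pipeline_run

# toy inputs so the pipeline can actually run
for f in ("sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"):
    open(f, "w").close()

@mkdir("downsampled_reads")
@transform(["sampleA_R1.fastq.gz"], suffix("_R1.fastq.gz"),
           ["_R1.fastq.gz", "_R2.fastq.gz"], output_dir="downsampled_reads")
def dsmpl_reads(input_files, output_files):
    # placeholder for downsample_se_pe.sh: emit both downsampled mates
    for oo in output_files:
        open(oo, "w").close()

# chaining on dsmpl_reads (instead of the trimming task) is the whole change:
# conv_rate now receives the downsampled pair as its input
@transform(dsmpl_reads, suffix("_R1.fastq.gz"), ".conv.rate.txt")
def conv_rate(input_files, output_file):
    ii1 = input_files[0]  # R1 of the downsampled pair
    open(output_file, "w").close()

pipeline_run([conv_rate])
```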
WGBS_pipe_lite.py~: 28 changes (24 additions, 4 deletions)
@@ -1,4 +1,4 @@
-__version__="v1.2.0"
+__version__="v1.2.1"
 
 
 import os
@@ -319,6 +319,7 @@ def PCRdup_rm(input_file,output_file):
 #RUN VARIOUS COVERAGE AND QUALITY METRICS#######################################################################
 metout=os.path.join(wdir,'QC_metrics')
 auxdir=os.path.join(wdir,'aux_files')
+reads_downsampled=os.path.join(wdir,'downsampled_reads')
 
 #prepare bed file with random CpGs
 @mkdir(auxdir,os.path.join(auxdir,'logs'))
@@ -369,17 +370,36 @@ else:
 
 #conversion rate: currently from fastq, implement phiX control!
 if args.trimReads :
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(trim_reads,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
+
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(trim_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
         oo=output_file
         from BSmetricsWGBS import BS_conv_rate
         BS_conv_rate(ii1sub,oo,metout,mySession,logger)
 else:
+    @mkdir(reads_downsampled,os.path.join(reads_downsampled,'logs'))
+    @transform(IN_files,suffix('_R1.fastq.gz'),['_R1.fastq.gz','_R2.fastq.gz'],output_dir=reads_downsampled) #down to 5mln reads
+    def dsmpl_reads(input_files,output_files):
+        ii1=input_files[0]
+        ii2=input_files[1]
+        oo1=output_files[0]
+        oo2=output_files[1]
+        from BSmetricsWGBS import BS_downsample_reads
+        BS_downsample_reads('5000000',args.nthreads,ii1,oo1,ii2,oo2,pipev,reads_downsampled,mySession,logger)
     @mkdir(metout,os.path.join(metout,'logs'))
-    @transform(IN_files,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
+    @transform(dsmpl_reads,suffix('_R1.fastq.gz'),'.conv.rate.txt',output_dir=metout) #it's ok to have both reads summarized in 1 file
     def conv_rate(input_files,output_file):
         ii1=input_files[0][0]
         ii1sub=re.sub('_R1.fastq.gz','',ii1)
@@ -470,7 +490,7 @@ if ( args.intList and args.sampleInfo ):
         oo = output_files
         auxList=[os.path.join(auxdir,re.sub('.fa',re.sub('.bed','.CpGlist.bed',os.path.basename(x)),os.path.basename(refG))) for x in args.intList]
         from BSstats_WGBS import int_stats_limma
-        int_stats_limma(ii,args.intList,auxList,args.sampleInfo,intStat_out,Rpath,pipev,mySession,logger)
+        int_stats_limma(ii,args.intList,auxList,args.sampleInfo,intStat_out,Rpath,Rlib,pipev,mySession,logger)
 #####################################################################################################
 
 ####RUN DMR calling ################################################################################
WGBSpipe.interval_stats.limma.R: 4 changes (4 additions, 0 deletions)
@@ -53,6 +53,10 @@ auxmtch<-as.data.frame(findOverlaps(query=auxGR, subject=bedGR))
 
 limdat.LG.inCGI<-auxdat[auxmtch$queryHits,] #for compatibility with previous method; not really necessary as auxdat comes from a pre-intersected bed file
 limdat.LG.inCGI$IntID<-names(ranges(bedGR))[auxmtch$subjectHits] ##needed for grouping per interval
+##
+noNA2<-apply(limdat.LG.inCGI[,!colnames(limdat.LG.inCGI) %in% c("Name","NAf","IntID"),with=FALSE],1,function(X)sum(is.na(X)))
+NAf2<-ifelse(noNA2>1,1,0)
+limdat.LG.inCGI$NAf<-NAf2
 ########################### END UPDATED CODE
 NA.inCGI<-with(limdat.LG.inCGI,ave(NAf,factor(IntID),FUN=sum))
 limdat.LG.inCGI$NA.inCGI<-NA.inCGI
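These added lines carry the sensitivity improvement this commit is named for: instead of inheriting the NA flag computed upstream, each CpG row is re-flagged after the interval intersection, and a row counts against its interval only when more than one sample value is missing. For illustration only, the same filter expressed in Python/pandas on hypothetical toy data; the pipeline itself does this in R on a data.table:

```python
import pandas as pd

# toy beta-value matrix: rows = CpGs, columns = samples, plus grouping metadata
limdat = pd.DataFrame({
    "Name":  ["cg1", "cg2", "cg3", "cg4"],
    "IntID": ["CGI_1", "CGI_1", "CGI_2", "CGI_2"],
    "s1":    [0.8, None, 0.5, None],
    "s2":    [0.7, None, None, 0.6],
})

meta_cols = ["Name", "NAf", "IntID"]
value_cols = [c for c in limdat.columns if c not in meta_cols]

# flag a CpG only once MORE THAN ONE sample value is missing (noNA2 > 1)
no_na = limdat[value_cols].isna().sum(axis=1)
limdat["NAf"] = (no_na > 1).astype(int)

# downstream step: count flagged CpGs per interval, like ave(NAf, IntID, FUN=sum)
na_in_cgi = limdat.groupby("IntID")["NAf"].transform("sum")
print(limdat.assign(NA_inCGI=na_in_cgi))
```

Here only cg2 (both samples missing) is flagged; cg3 and cg4, each missing a single sample, still contribute to their interval.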
WGBSpipe.metilene_stats.limma.R: 5 changes (5 additions, 0 deletions)
@@ -50,6 +50,11 @@ auxmtch<-as.data.frame(findOverlaps(query=auxGR, subject=bedGR))
 
 limdat.LG.inCGI<-auxdat[auxmtch$queryHits,] #for compatibility with previous method; not really necessary as auxdat comes from a pre-intersected bed file
 limdat.LG.inCGI$IntID<-names(ranges(bedGR))[auxmtch$subjectHits] ##needed for grouping per interval
+##
+noNA2<-apply(limdat.LG.inCGI[,!colnames(limdat.LG.inCGI) %in% c("Name","NAf","IntID"),with=FALSE],1,function(X)sum(is.na(X)))
+NAf2<-ifelse(noNA2>1,1,0)
+limdat.LG.inCGI$NAf<-NAf2
+
 ########################### END UPDATED CODE
 
 NA.inCGI<-with(limdat.LG.inCGI,ave(NAf,factor(IntID),FUN=sum))
downsample_se_pe.sh: 95 changes (95 additions, 0 deletions)
@@ -0,0 +1,95 @@
#!/usr/bin/bash

## Usage: bash downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz b.fastq.gz b.downsampled.fastq.gz

num_reads=$1
threads=$2
R1=$3
R1_out=$4
R2=$5
R2_out=$6

seed=12345

## check for paired or single end mode
paired=false
if [ -z "$R1" ] || [ -z "$R1_out" ]; then
    echo -e "\nPlease provide R1 input file and R1 output filename!\n"
    echo "usage: downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz (single end)"
    echo "or"
    echo -e "(paired end) downsample_se_pe.sh 1000 10 a.fastq.gz a.downsampled.fastq.gz b.fastq.gz b.downsampled.fastq.gz\n"
    exit 1;
elif [ -n "$R2" ] && [ -n "$R2_out" ]; then
    paired=true
elif ( ( [ -z "$R2" ] && [ -n "$R2_out" ] ) || ( [ -n "$R2" ] && [ -z "$R2_out" ] ) ); then
    echo "Please provide for R2 either both input and output file or leave both empty!"
    exit 1;
fi

## check for pigz
tmp=$(which pigz)
if [ -z "$tmp" ]; then
    echo "Pigz needs to be installed!"
    exit 1;
fi

##############################################################################
## Usage: awk -v n=5 -v s=12345 -f downsample.awk <(seq 1 100)
## Reservoir downsampling in AWK!
## Fabian Kilpert, January 2017
read -d '' reservoir_sampling << 'EOF'
BEGIN {
    if (n=="") n = 1;
    if (s=="") srand()
    else srand(s)
}
# function generating numbers between 1 and n
function randint(n) { return int(1 + n * rand()) }
{
    # fill reservoir
    if (NR <= n) R[NR] = $0
    # replace reservoir elements with gradually decreasing probability
    else {
        j = randint(NR);
        if (j <= n) { R[j] = $0 };
    }
}
END {
    for (key in R) print R[key]
}
EOF
##############################################################################


## do the work
if [ "$paired" == true ]; then
    ## paired end
    echo -e "\nDownsampling fastq files to $num_reads reads in paired-end mode..." > /dev/stderr;
    paste <(pigz -p2 -dc ${R1} | sed '/^$/d' | paste -d "\t" - - - - ) <(pigz -p2 -dc ${R2} | sed '/^$/d' | paste -d "\t" - - - - ) \
        | awk -v n=${num_reads} -v s=$seed "$reservoir_sampling" \
        | tee >(cut -f1,2,3,4 | sed 's/\t/\n/g' | pigz -p $(($threads/2>2?$threads/2:2)) -9 > ${R1_out}) >(cut -f5,6,7,8 | sed 's/\t/\n/g' | pigz -p $(($threads/2>2?$threads/2:2)) -9 > ${R2_out}) \
        | cut -f1,5 | tr " " "\t" | awk '{if ($1!=$3) print $0,"Read names in paired-end fastq files are out of sync!"; }'
else
    ## single end
    echo -e "\nDownsampling fastq file to $num_reads reads in single-end mode..." > /dev/stderr
    pigz -p2 -dc ${R1} | sed '/^$/d' | paste -d "\t" - - - - | awk -v n=${num_reads} -v s=${seed} "$reservoir_sampling" | sed 's/\t/\n/g' | pigz -p $threads -9 > ${R1_out}
fi


################ END



#DIFF=$(diff <(pigz -dc ${2} | sed -n '1p;1~4p' | cut -d " " -f1) <(pigz -dc ${3} | sed -n '1p;1~4p' | cut -d " " -f1))
#if [ "$DIFF" != "" ]
#then
#    echo "Error! FASTQ sequence identifiers do NOT match!"
#    exit 1
#else
