From a6cfe91b4a5e283ad846cec08c4f6856823ce4d7 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 27 Jun 2024 12:27:59 -0400 Subject: [PATCH 01/28] chore: refactor rule all, move some files to more appropriate locations --- .../scripts/createtable => bin/creattable.py | 0 .../filterMetrics => bin/filterMetrics.py | 8 +- {workflow/scripts => bin}/jobby | 0 {workflow/scripts => bin}/ppqt/LICENSE | 0 {workflow/scripts => bin}/ppqt/README.txt | 0 .../ppqt/peakCallingPipelineForIdr.txt | 0 {workflow/scripts => bin}/ppqt/run_spp.R | 0 .../scripts => bin}/ppqt/run_spp_nodups.R | 0 .../scripts => bin}/ppqt/spp_1.10.1.tar.gz | Bin workflow/Snakefile | 402 +++++++----------- workflow/scripts/grouping.py | 59 +++ 11 files changed, 206 insertions(+), 263 deletions(-) rename workflow/scripts/createtable => bin/creattable.py (100%) rename workflow/scripts/filterMetrics => bin/filterMetrics.py (93%) rename {workflow/scripts => bin}/jobby (100%) rename {workflow/scripts => bin}/ppqt/LICENSE (100%) rename {workflow/scripts => bin}/ppqt/README.txt (100%) rename {workflow/scripts => bin}/ppqt/peakCallingPipelineForIdr.txt (100%) rename {workflow/scripts => bin}/ppqt/run_spp.R (100%) rename {workflow/scripts => bin}/ppqt/run_spp_nodups.R (100%) rename {workflow/scripts => bin}/ppqt/spp_1.10.1.tar.gz (100%) create mode 100644 workflow/scripts/grouping.py diff --git a/workflow/scripts/createtable b/bin/creattable.py similarity index 100% rename from workflow/scripts/createtable rename to bin/creattable.py diff --git a/workflow/scripts/filterMetrics b/bin/filterMetrics.py similarity index 93% rename from workflow/scripts/filterMetrics rename to bin/filterMetrics.py index 1d56ade..6562b71 100755 --- a/workflow/scripts/filterMetrics +++ b/bin/filterMetrics.py @@ -60,9 +60,9 @@ def getmetadata(type): elif type == 'tnreads': metadata = 'NReads' elif type == 'mnreads': - metadata = 'NMappedReads' + metadata = 'NMappedReads' elif type == 'unreads': - metadata = 'NUniqMappedReads' + 
metadata = 'NUniqMappedReads' elif type == 'fragLen': metadata = 'FragmentLength' return metadata @@ -88,11 +88,11 @@ def filteredData(sample, ftype): extenders = [] for ppqt_value in linelist: if int(ppqt_value) > 150: - extenders.append(ppqt_value) + extenders.append(ppqt_value) if len(extenders) > 0: print("{}\t{}\t{}".format(sample, mtypes, extenders[0])) else: - print("{}\t{}\t{}".format(sample, mtypes, linelist[0])) + print("{}\t{}\t{}".format(sample, mtypes, linelist[0])) elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf': mtypes = getmetadata(ftype) for i in range(len(linelist)): diff --git a/workflow/scripts/jobby b/bin/jobby similarity index 100% rename from workflow/scripts/jobby rename to bin/jobby diff --git a/workflow/scripts/ppqt/LICENSE b/bin/ppqt/LICENSE similarity index 100% rename from workflow/scripts/ppqt/LICENSE rename to bin/ppqt/LICENSE diff --git a/workflow/scripts/ppqt/README.txt b/bin/ppqt/README.txt similarity index 100% rename from workflow/scripts/ppqt/README.txt rename to bin/ppqt/README.txt diff --git a/workflow/scripts/ppqt/peakCallingPipelineForIdr.txt b/bin/ppqt/peakCallingPipelineForIdr.txt similarity index 100% rename from workflow/scripts/ppqt/peakCallingPipelineForIdr.txt rename to bin/ppqt/peakCallingPipelineForIdr.txt diff --git a/workflow/scripts/ppqt/run_spp.R b/bin/ppqt/run_spp.R similarity index 100% rename from workflow/scripts/ppqt/run_spp.R rename to bin/ppqt/run_spp.R diff --git a/workflow/scripts/ppqt/run_spp_nodups.R b/bin/ppqt/run_spp_nodups.R similarity index 100% rename from workflow/scripts/ppqt/run_spp_nodups.R rename to bin/ppqt/run_spp_nodups.R diff --git a/workflow/scripts/ppqt/spp_1.10.1.tar.gz b/bin/ppqt/spp_1.10.1.tar.gz similarity index 100% rename from workflow/scripts/ppqt/spp_1.10.1.tar.gz rename to bin/ppqt/spp_1.10.1.tar.gz diff --git a/workflow/Snakefile b/workflow/Snakefile index 777e616..6a92c24 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,272 +15,156 @@ from 
scripts.common import ( references, str_bool ) +from scripts.grouping import group_samples_by_reps, group_output_files -# Timestamp in YYYYMMDD format -today = str(datetime.datetime.today()).split()[0].replace('-', '') - -# Global workflow variables configfile: "config.json" -samples = config['samples'] -workpath = config['project']['workpath'] -tmpdir = config['options']['tmp_dir'] -genome = config['options']['genome'] # Reference genome of a set of samples -assay = config['options']['assay'] -blocks = config['project']['blocks'] - -if None in list(blocks.values()): - blocking = False -else: - blocking = True - -# Check for SE or PE FastQ files: -convert = {1: False, 2: True} # 1 = SE, 2 = PE, -1 = Unknown -try: - paired_end = convert[config['project']['nends']] # True if PE else false -except KeyError: - # Catching case when value is -1 or unknown - sys.exit("Fatal: Raw data could not be classified as single-end or paired-end data!") - -# Analysis options -# Run differential binding pipeline -run_dba = True -if config['options']['contrasts'] == 'None': - run_dba = False +# Global workflow variables +today = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD +samples = config['samples'] +workpath = config['project']['workpath'] +tmpdir = config['options']['tmp_dir'] +genome = config['options']['genome'] +assay = config['options']['assay'] +blocks = config['project']['blocks'] +blocking = False if None in list(blocks.values()) else True +convert = {1: False, 2: True} # 1 = SE, 2 = PE, -1 = Unknown +paired_end = convert[config['project']['nends']] # True if PE else false +run_dba = False config['options']['contrasts'] is None else True +extensions = ["sorted.RPGC", "Q5DD.RPGC"] +chips = config['project']['peaks']['chips'] +contrast = config['project']['contrast'] +UropaCats = ["protTSS", "prot", "protSEC", "genes"] +zipGroup1, zipGroup2, zipToolC, contrasts \ + = zip_contrasts(contrast, PeakTools) +extensionsDict = {"sorted": "bam", "Q5DD":"bam"} 
if paired_end \ + else {"sorted": "bam", "Q5DD_tagAlign": "gz"} +file_exts = list(extensionsDict.keys()) +extensionsFull = ['sorted.bam', 'Q5DD.bam'] if paired_end \ + else ['sorted.bam', 'Q5DD_tagAlign.gz'] + +# Directory end points +trim_dir = "trim" +kraken_dir = "kraken" +bam_dir = join(workpath, "bam") +bw_dir = join(workpath, "bigwig") +qc_dir = join(workpath, "QC") +ppqt_dir = join(bam_dir, "ppqt") +macsN_dir = join(workpath, "macsNarrow") +macsB_dir = join(workpath, "macsBroad") +sicer_dir = join(workpath, "sicer") +peakqc_dir = join(workpath, "PeakQC") +uropa_dir = join(workpath, "UROPA_annotations") +diffbind_dir = join(uropa_dir, "DiffBind") +cfTool_dir = join(workpath, "cfChIPtool") +genrich_dir = join(workpath, "Genrich") +MEME_dir = join(workpath, "MEME") + +# Extended data structures +''' +:param chip2input : map (1:1) from sample id to input +{ + "WT_S1": "Input_S1", + "WT_S2": "Input_S2", + "WT_S3": "Input_S3", + "WT_S4": "Input_S4" + ... +} +''' +chip2input = config['project']['peaks']['inputs'] + +''' +:param groupdata : map (1:M) of group id to sample ids +{ + "G1": ["WT_S1", "WT_S2"], + "G2": ["WT_S3", "WT_S4"] + ... +} +''' +groupdata = config['project']['groups'] + +''' +:param groupdatawinput : +{ + "G1": ["WT_S1", "WT_S2"], + "G2": ["WT_S3", "WT_S4"] + ... +} + +:param groupswreps : + ["G1", "G2", ...] +''' +groupdatawinput, groupswreps = group_samples_by_reps(groupdata, samples, chip2input) +groups = list(groupdatawinput.keys()) +reps = False if len(groupswreps) > 0 else True +uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) +sampleswinput = [ + chip_value for input_id, chip_value in chip2input.items() \ + if chip_value != 'NA' and chip_value != '' +] +inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"] +deepgroups, deepexts = group_output_files(extensions, groups,inputnorm) # Read in resource information, # containing information about # threads, mem, walltimes, etc. 
-# TODO: Add handler for when the -# mode is set to local. with open(join('config', 'cluster.json')) as fh: cluster = json.load(fh) -# Functions -def outputfiles2(extensions, groupslist, inputnorm): - """ - Produces correct output filenames based on group information. - Names will be: - Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf - {groupName}.Q5DD.RPGC.metagene_heatmap.pdf - {groupName}.sorted.RPGC.metagene_heatmap.pdf - Note: Inputnorm will only be included when there are input samples. - """ - dtoolgroups, dtoolext = [], [] - - if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm"]) - dtoolext.extend([extensions[1]]) - - for group in groupslist: - dtoolgroups.extend([group] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - - if len(inputnorm) == 2: - dtoolgroups.extend(["InputNorm.prot"]) - dtoolext.extend([extensions[1]]) - - for group in groupslist: - dtoolgroups.extend([group + ".prot"] * 2) - dtoolext.extend([extensions[1], extensions[0]]) - - return dtoolgroups, dtoolext - -def zip_contrasts(contrast, PeakTools): - """making output file names for differential binding analyses""" - zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], [] - for g1, g2 in contrast: - for PeakTool in PeakTools: - zipGroup1.append(g1) - zipGroup2.append(g2) - zipTool.append(PeakTool) - contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool ) - return(zipGroup1, zipGroup2, zipTool, contrasts) - - -extensions = [ "sorted.RPGC", "Q5DD.RPGC" ] - - -# Getting sample relationships from config -# using ChIP/input nomenclature. 
NOTE: ATAC -# won't have input samples - -########### -chip2input = config['project']['peaks']['inputs'] #{"WT_S1": "Input_S1","WT_S2": "Input_S2","WT_S3": "Input_S3","WT_S4": "Input_S4"} -groupdata = config['project']['groups'] # {"G1": ["WT_S1","WT_S2"],"G2": ["WT_S3","WT_S4"]} - -groupdatawinput = {} -groupswreps = [] -for group, chipsamples in groupdata.items() : - tmp = [ ] - if len(chipsamples) > 1: - groupswreps.append(group) - for chip in chipsamples : - if chip in samples: - tmp.append(chip) - input = chip2input[chip] - if input != 'NA' and input != '': - tmp.append(input) - if len(tmp) != 0: - groupdatawinput[group]=set(tmp) - -groups = list(groupdatawinput.keys()) - -reps="no" -if len(groupswreps) > 0: - reps="yes" -############## - -uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) - -sampleswinput = [] -for input in chip2input: - if chip2input[input] != 'NA' and chip2input[input] != '': - sampleswinput.append(input) - - -if len(sampleswinput) == 0: - inputnorm = [""] -else: - inputnorm = ["",".inputnorm"] - - -deepgroups, deepexts = outputfiles2(extensions, groups,inputnorm) - - - -# Directory names -trim_dir='trim' -kraken_dir='kraken' -bam_dir='bam' -bw_dir='bigwig' -deeptools_dir='deeptools' -extra_fingerprint_dir='deeptools/sorted_fingerprint' -qc_dir="QC" -ppqt_dir="ppqt" -macsN_dir="macsNarrow" -macsB_dir="macsBroad" -sicer_dir="sicer" - -uropa_dir = "UROPA_annotations" -diffbind_dir = "DiffBind" -diffbind_dir_block = "DiffBindBlock" - -if assay == "atac": - PeakTools = ["macsNarrow", "Genrich"] -elif assay == "chip": - PeakTools = ["macsNarrow", "macsBroad", "sicer"] -else: - PeakTools = ["macsNarrow"] - -chips = config['project']['peaks']['chips'] -contrast = config['project']['contrast'] -UropaCats = ["protTSS", "prot", "protSEC", "genes"] -extensions = ["sorted.RPGC", "Q5DD.RPGC"] - -# Setup to run with ChIP samples, -# which could include IgG samples -cfTool_dir="cfChIPtool" -cfTool_subdir2="cfChIPtool/BED/H3K4me3" - 
-zipGroup1, zipGroup2, zipToolC, contrasts = zip_contrasts(contrast, PeakTools) -# Final targets of the pipeline - -if paired_end: - extensionsDict = {"sorted": "bam", "Q5DD":"bam"} - extensionsFull = ['sorted.bam', 'Q5DD.bam'] -else: - extensionsDict= {"sorted": "bam", "Q5DD_tagAlign": "gz"} - extensionsFull = ['sorted.bam', 'Q5DD_tagAlign.gz'] - -if assay == "cfchip": - rule all: - input: - join(workpath,"multiqc_report.html"), - expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())), - expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull), - expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples), - expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips), - expand(join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples), - expand(join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"),name=chips), - join(workpath,"QC","H3K4me3_cfChIP_signature.txt"), - expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]), - expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput), - expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), - PeakTool=PeakTools,name=chips,type=["protTSS"]), - expand(join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools), - expand(join(workpath, uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{type}_allhits.txt"), - PeakTool="DiffBindQC", type="protTSS"), - expand(join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt'),PeakTool=PeakTools), - provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"), - 
provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'), - PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS"]), reps == "yes"), - provided(expand(join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), - PeakTool=PeakTools),reps == "yes" and contrast), - -elif assay in ["atac", "chip"]: - rule all: - input: - join(workpath,"multiqc_report.html"), - provided(expand(join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"),name=samples,ext=list(extensionsDict.keys())), paired_end==True), - expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensionsFull), - expand(join(workpath,qc_dir,"{name}.preseq.dat"), name=samples), - expand(join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),name=chips), - provided(expand(join(workpath,"macsBroad","{name}","{name}_peaks.broadPeak"),name=chips), assay=="chip"), - provided(expand(join(workpath,"sicer","{name}","{name}_broadpeaks.bed"),name=chips), assay=="chip"), - expand(join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples), - expand(join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),name=samples, ext=["sorted", "Q5DD"]), - expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput), - expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), - PeakTool=PeakTools,name=chips,type=["protTSS", "prot", "protSEC", "genes"]), - - provided(expand(join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - zip,group1=zipGroup1,group2=zipGroup2,PeakTool=zipToolC), reps == "yes"), - provided(expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'), - PeakTool=PeakTools,name=chips,type=["protTSS", "prot", "protSEC", "genes"]), reps == "yes"), - 
provided(expand(join(workpath,uropa_dir,diffbind_dir,'{name}_{PeakTool}_uropa_{type}_allhits.txt'), - PeakTool=['DiffbindEdgeR','DiffbindDeseq2'],name=contrasts,type=["protTSS", "prot", "protSEC", "genes"]), reps == "yes"), - - provided(expand(join(workpath,"Genrich","{name}","{name}.narrowPeak"),name=chips), assay=="atac"), - - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"), - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"), - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),name=samples, ext=["sorted", "Q5DD"]), paired_end == True and assay=="chip"), - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), - provided(expand(join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"),name=samples, ext=["sorted", "Q5DD_tagAlign"]), paired_end == False and assay=="chip"), - expand(join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips), - expand(join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips) - - -############################# -# Pipeline hooks for Onstart, -# onsucess, and onerror -include: join("rules", "hooks.smk") - -# QC/alignment rules: trim_pe, -# BWA_PE, picard_dedup, bam2bw, -# inputnorm -include: "rules/trim_align_dedup.smk" - -# QC rules common to all: preseq, NRF, -# rawfastqc, fastqc, fastq_screen, -# kraken_pe, multiqc, insert_size -include: "rules/qc.smk" - -# MACS2_narrow -# if assay=="atac" then run rules sortByRead and genrich -include: "rules/peakcall.smk" - -#FRiP, FRiP_plot, jaccard 
rules -include: "rules/peakcall_qc.smk" - -#UROPA, DiffBind, and manorm rules -include: "rules/dba.smk" - -# cfChIP-specific QC rules: -# cfChIPtool, cfChIPcompile, -# promoterTable1:, and promoterTable2 -if assay == "cfchip": - include: "rules/cfChIP.smk" +rule all: + input: + if assay == "cfchip": + peak_types = ["protTSS"] + join(qc_dir, "H3K4me3_cfChIP_signature.txt"), + expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts) + expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips), + expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types), + expand(join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool="DiffBindQC", _type=peak_types), + expand(join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools), + expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types) + if reps: + expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC) + expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types) + + else assay in ["atac", "chip"]: + peak_types = ["protTSS", "prot", "protSEC", "genes"] + expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips), + expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips) + if paired_end: + expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts) + if assay == "chip": + expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips) + expand(join(sicer_dir, "{name}", 
"{name}_broadpeaks.bed"), name=chips) + if paired_end: + short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] + expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext), + expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext), + expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext) + expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext) + expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext) + expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext) + if assay == "atac": + expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips) + if reps: + expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types), + expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC) + expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, _type=peak_types) + if contrast: + expand(join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools) + join(workpath,"multiqc_report.html"), + expand(join(qc_dir, "{name}.preseq.dat"), name=samples), + expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools), + expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extensionsFull), + expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips), + expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples), + expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]), + expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput), \ No newline at end of file diff --git a/workflow/scripts/grouping.py 
b/workflow/scripts/grouping.py new file mode 100644 index 0000000..6e39613 --- /dev/null +++ b/workflow/scripts/grouping.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# common functions related to sample grouping or group meta-information +def group_samples_by_reps(groupdata, samples, chip2input): + groupdatawinput = {} + groupswreps = [] + for group, chipsamples in groupdata.items() : + tmp = [ ] + if len(chipsamples) > 1: + groupswreps.append(group) + for chip in chipsamples : + if chip in samples: + tmp.append(chip) + input = chip2input[chip] + if input != 'NA' and input != '': + tmp.append(input) + if len(tmp) != 0: + groupdatawinput[group]=set(tmp) + return groupdatawinput, groupswreps + + +def group_output_files(extensions, groupslist, inputnorm): + """ + Produces correct output filenames based on group information. + Names will be: + Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf + {groupName}.Q5DD.RPGC.metagene_heatmap.pdf + {groupName}.sorted.RPGC.metagene_heatmap.pdf + Note: Inputnorm will only be included when there are input samples. 
+ """ + dtoolgroups, dtoolext = [], [] + + if len(inputnorm) == 2: + dtoolgroups.extend(["InputNorm"]) + dtoolext.extend([extensions[1]]) + + for group in groupslist: + dtoolgroups.extend([group] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + + if len(inputnorm) == 2: + dtoolgroups.extend(["InputNorm.prot"]) + dtoolext.extend([extensions[1]]) + + for group in groupslist: + dtoolgroups.extend([group + ".prot"] * 2) + dtoolext.extend([extensions[1], extensions[0]]) + + return dtoolgroups, dtoolext + +def zip_contrasts(contrast, PeakTools): + """making output file names for differential binding analyses""" + zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], [] + for g1, g2 in contrast: + for PeakTool in PeakTools: + zipGroup1.append(g1) + zipGroup2.append(g2) + zipTool.append(PeakTool) + contrasts.append( g1 + "_vs_" + g2 + "-" + PeakTool ) + return(zipGroup1, zipGroup2, zipTool, contrasts) \ No newline at end of file From e89e0bd7691cb6771ffc089b0f63a29faf66ddef Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 5 Jul 2024 17:50:35 -0400 Subject: [PATCH 02/28] chore: refactor script locations, workflow rules, remove uropa rules for time being --- src/run.py | 4 +- workflow/Snakefile | 261 ++++--- workflow/rules/cfChIP.smk | 19 +- workflow/rules/common.smk | 1 - workflow/rules/dba.smk | 180 ++--- workflow/rules/peakcall.smk | 105 ++- workflow/rules/peakcall_qc.smk | 46 -- workflow/rules/qc.smk | 436 ++++++----- workflow/rules/trim_align_dedup.smk | 700 +++++++++--------- workflow/scripts/DiffBind_v2_ChIPseq.Rmd | 260 ------- .../scripts/DiffBind_v2_ChIPseq_block.Rmd | 267 ------- workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd | 204 ----- workflow/scripts/FRiP_plot.R | 112 --- workflow/scripts/atac_nrf.py | 22 - workflow/scripts/bam_filter_by_mapq.py | 40 - workflow/scripts/blocking.py | 28 + workflow/scripts/cfChIP_signatures.R | 97 --- workflow/scripts/common.py | 61 +- workflow/scripts/frip.py | 164 ---- workflow/scripts/grouping.py | 54 +- 
workflow/scripts/jaccard_score.py | 202 ----- workflow/scripts/peakcall.py | 105 +++ workflow/scripts/ppqt_process.py | 27 - workflow/scripts/prep_diffbind.py | 54 -- workflow/scripts/prep_diffbindQC.py | 51 -- workflow/scripts/promoterAnnotation_by_Gene.R | 179 ----- workflow/scripts/significantPathways.R | 127 ---- 27 files changed, 1065 insertions(+), 2741 deletions(-) delete mode 100644 workflow/rules/common.smk delete mode 100644 workflow/rules/peakcall_qc.smk delete mode 100644 workflow/scripts/DiffBind_v2_ChIPseq.Rmd delete mode 100644 workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd delete mode 100644 workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd delete mode 100644 workflow/scripts/FRiP_plot.R delete mode 100644 workflow/scripts/atac_nrf.py delete mode 100644 workflow/scripts/bam_filter_by_mapq.py create mode 100644 workflow/scripts/blocking.py delete mode 100755 workflow/scripts/cfChIP_signatures.R delete mode 100644 workflow/scripts/frip.py delete mode 100644 workflow/scripts/jaccard_score.py create mode 100644 workflow/scripts/peakcall.py delete mode 100644 workflow/scripts/ppqt_process.py delete mode 100644 workflow/scripts/prep_diffbind.py delete mode 100644 workflow/scripts/prep_diffbindQC.py delete mode 100755 workflow/scripts/promoterAnnotation_by_Gene.R delete mode 100755 workflow/scripts/significantPathways.R diff --git a/src/run.py b/src/run.py index b327cc8..34fc423 100644 --- a/src/run.py +++ b/src/run.py @@ -207,6 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path): # Add other runtime info for debugging config['project']['version'] = __version__ config['project']['workpath'] = os.path.abspath(sub_args.output) + config['project']['binpath'] = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'bin')) git_hash = git_commit_hash(repo_path) config['project']['git_commit_hash'] = git_hash # Add latest git commit hash config['project']['pipeline_path'] = repo_path # Add path to installation @@ -221,7 +222,8 @@ def setup(sub_args, 
ifiles, repo_path, output_path): v = str(v) config['options'][opt] = v - + # initiate a few workflow vars + config['options']['peak_type_base'] = ["protTSS"] return config diff --git a/workflow/Snakefile b/workflow/Snakefile index 6a92c24..026c56d 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,21 +1,13 @@ # Python standard library +import datetime +import json from os.path import join from os import listdir -import os, sys, re, datetime -import json - -# 3rd party imports from pypi -from snakemake.workflow import workflow as wf_api -from snakemake.utils import R # Local imports -from scripts.common import ( - allocated, - provided, - references, - str_bool -) -from scripts.grouping import group_samples_by_reps, group_output_files +from scripts.common import provided, get_file_components +from scripts.grouping import group_samples_by_reps, \ + group_output_files, zip_contrasts, get_peaktools configfile: "config.json" @@ -23,29 +15,30 @@ configfile: "config.json" today = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD samples = config['samples'] workpath = config['project']['workpath'] -tmpdir = config['options']['tmp_dir'] -genome = config['options']['genome'] assay = config['options']['assay'] -blocks = config['project']['blocks'] -blocking = False if None in list(blocks.values()) else True -convert = {1: False, 2: True} # 1 = SE, 2 = PE, -1 = Unknown -paired_end = convert[config['project']['nends']] # True if PE else false -run_dba = False config['options']['contrasts'] is None else True -extensions = ["sorted.RPGC", "Q5DD.RPGC"] +paired_end = False if config['project']['nends'] == 1 else True chips = config['project']['peaks']['chips'] contrast = config['project']['contrast'] -UropaCats = ["protTSS", "prot", "protSEC", "genes"] +chip2input = config['project']['peaks']['inputs'] +groupdata = config['project']['groups'] +peak_types = config['options']['peak_type_base'] +rule_all_ins = [] +groupdatawinput, groupswreps = 
group_samples_by_reps(groupdata, samples, chip2input) +PeakTools = get_peaktools(assay) zipGroup1, zipGroup2, zipToolC, contrasts \ = zip_contrasts(contrast, PeakTools) -extensionsDict = {"sorted": "bam", "Q5DD":"bam"} if paired_end \ - else {"sorted": "bam", "Q5DD_tagAlign": "gz"} -file_exts = list(extensionsDict.keys()) -extensionsFull = ['sorted.bam', 'Q5DD.bam'] if paired_end \ - else ['sorted.bam', 'Q5DD_tagAlign.gz'] +file_stems, extRPGC, extaln = get_file_components(paired_end) +groups = list(groupdatawinput.keys()) +reps = False if len(groupswreps) > 0 else True +uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) +sampleswinput = [ + chip_value for input_id, chip_value in chip2input.items() \ + if chip_value != 'NA' and chip_value != '' +] +inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"] +deepgroups, deepexts = group_output_files(extRPGC, groups, inputnorm) # Directory end points -trim_dir = "trim" -kraken_dir = "kraken" bam_dir = join(workpath, "bam") bw_dir = join(workpath, "bigwig") qc_dir = join(workpath, "QC") @@ -55,116 +48,120 @@ macsB_dir = join(workpath, "macsBroad") sicer_dir = join(workpath, "sicer") peakqc_dir = join(workpath, "PeakQC") uropa_dir = join(workpath, "UROPA_annotations") -diffbind_dir = join(uropa_dir, "DiffBind") +diffbind_dir = join(workpath, "DiffBind") cfTool_dir = join(workpath, "cfChIPtool") genrich_dir = join(workpath, "Genrich") MEME_dir = join(workpath, "MEME") -# Extended data structures -''' -:param chip2input : map (1:1) from sample id to input -{ - "WT_S1": "Input_S1", - "WT_S2": "Input_S2", - "WT_S3": "Input_S3", - "WT_S4": "Input_S4" - ... -} -''' -chip2input = config['project']['peaks']['inputs'] - -''' -:param groupdata : map (1:M) of group id to sample ids -{ - "G1": ["WT_S1", "WT_S2"], - "G2": ["WT_S3", "WT_S4"] - ... -} -''' -groupdata = config['project']['groups'] - -''' -:param groupdatawinput : -{ - "G1": ["WT_S1", "WT_S2"], - "G2": ["WT_S3", "WT_S4"] - ... 
-} - -:param groupswreps : - ["G1", "G2", ...] -''' -groupdatawinput, groupswreps = group_samples_by_reps(groupdata, samples, chip2input) -groups = list(groupdatawinput.keys()) -reps = False if len(groupswreps) > 0 else True -uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) -sampleswinput = [ - chip_value for input_id, chip_value in chip2input.items() \ - if chip_value != 'NA' and chip_value != '' -] -inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"] -deepgroups, deepexts = group_output_files(extensions, groups,inputnorm) - -# Read in resource information, -# containing information about -# threads, mem, walltimes, etc. +# Read in resource information with open(join('config', 'cluster.json')) as fh: cluster = json.load(fh) +if assay == "cfchip": + rule_all_ins.append(join( + qc_dir, "H3K4me3_cfChIP_signature.txt" + )) + rule_all_ins.extend(expand( + join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), + name=samples, + stem=file_stems + )) + rule_all_ins.extend(expand( + join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), + name=chips + )) + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, + name=chips, + _type=peak_types + )) + rule_all_ins.extend(expand( + join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool="DiffBindQC", + _type=peak_types + )) + rule_all_ins.extend(expand( + join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools + )) + # rule_all_ins.extend(expand( + # join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + # PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + # name=contrasts, + # _type=peak_types + # )) + if reps: + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + 
)) + rule_all_ins.extend(expand( + join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + name=contrasts, _type=peak_types + )) +elif assay in ["atac", "chip"]: + peak_types.extend(["prot", "protSEC", "genes"]) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) + if paired_end: + rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) + if assay == "chip": + rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) + rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) + if paired_end: + short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) + if assay == "atac": + rule_all_ins.extend(expand( + join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips + )) + if reps: + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types + )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, 
PeakTool=zipToolC + )) + # rule_all_ins.extend(expand( + # join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + # PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], + # name=contrasts, + # _type=peak_types + # )) + if contrast: + rule_all_ins.extend(expand( + join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools + )) +rule_all_ins.append(join(workpath,"multiqc_report.html")) +rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) +rule_all_ins.extend( + expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) +) +rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln)) +rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips)) +rule_all_ins.extend( + expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) +) +rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"])) +rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput)) + rule all: input: - if assay == "cfchip": - peak_types = ["protTSS"] - join(qc_dir, "H3K4me3_cfChIP_signature.txt"), - expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts) - expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips), - expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types), - expand(join(uropa_dir, "QC", "AllSamples-macsNarrow_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool="DiffBindQC", _type=peak_types), - expand(join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools), - expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], 
name=contrasts, _type=peak_types) - if reps: - expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC) - expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), - PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=peak_types) + rule_all_ins - else assay in ["atac", "chip"]: - peak_types = ["protTSS", "prot", "protSEC", "genes"] - expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips), - expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips) - if paired_end: - expand(join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), name=samples, ext=file_exts) - if assay == "chip": - expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips) - expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips) - if paired_end: - short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] - expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext), - expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext), - expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext) - expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext) - expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext) - expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext) - if assay == "atac": - expand(join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips) - if reps: - expand(join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types), - expand(join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC) - expand(join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), - 
PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, _type=peak_types) - if contrast: - expand(join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools) - join(workpath,"multiqc_report.html"), - expand(join(qc_dir, "{name}.preseq.dat"), name=samples), - expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools), - expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extensionsFull), - expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips), - expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples), - expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"]), - expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput), \ No newline at end of file +# Include child rules +include: join("rules", "hooks.smk") +include: join("rules", "trim_align_dedup.smk") +include: join("rules", "qc.smk") +include: join("rules", "peakcall.smk") +include: join("rules", "dba.smk") +include: join("rules", "cfChIP.smk") \ No newline at end of file diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index fa521d6..672a05b 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -1,7 +1,16 @@ -# cell-free DNA ChIP-seq rules: -# - picard_dedup -# - cfChIPtool -# - cfChIPcompile +# cell-free ChIP-seq +# ~~~~ +# rules: picard_dedup, cfChIPtool, cfChIPcompile + + +# ~~ workflow configuration +workpath = config['project']['workpath'] +genome = config['options']['genome'] +blocks = config['project']['blocks'] +groupdata = config['project']['groups'] + + +# ~~ directories rule cfChIPtool: @@ -34,7 +43,7 @@ rule cfChIPtool: rule cfChIPcompile: input: - expand(join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"),name=chip) + expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips) 
output: txt=join(workpath,"QC","H3K4me3_cfChIP_signature.txt"), pdf=join(workpath,"QC","H3K4me3_cfChIP_signature.pdf") diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk deleted file mode 100644 index a41b6b5..0000000 --- a/workflow/rules/common.smk +++ /dev/null @@ -1 +0,0 @@ -from scripts.common import abstract_location diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 036bab1..69aecc4 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -1,72 +1,46 @@ -# TODO: This Snakefile needs to be completely refactored. -# Python standard library +# Differential binding analysis rules +# ~~~~ from os.path import join import os +from scripts.common import allocated, mk_dir_if_not_exist +from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction +from scripts.blocking import test_for_block -# Local imports -from scripts.common import ( - allocated -) - -def outputIDR(groupswreps, groupdata, chip2input, tools): - """ - Produces the correct output files for IDR. All supposed replicates - should be directly compared when possible using IDR. IDR malfunctions - with bed files and GEM so it will not run with either of those. - Because there is no q-value calculated for SICER when there is no - input file, those samples are also ignored. 
- """ - IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = [], [], [], [] - for group in groupswreps: - nsamples = len(groupdata[group]) - for i in range(nsamples): - ctrlTF = chip2input[groupdata[group][i]] != "" - for j in range(i+1,nsamples): - if ctrlTF == (chip2input[groupdata[group][j]] != ""): - if ctrlTF == False: - tooltmp = [ tool for tool in tools if tool != "sicer" ] - else: - tooltmp = tools - IDRgroup.extend([group] * len(tooltmp)) - IDRsample1.extend([groupdata[group][i]] * len(tooltmp)) - IDRsample2.extend([groupdata[group][j]] * len(tooltmp)) - IDRpeaktool.extend(tooltmp) - return( IDRgroup, IDRsample1, IDRsample2, IDRpeaktool ) +# ~~ workflow configuration +workpath = config['project']['workpath'] +genome = config['options']['genome'] +blocks = config['project']['blocks'] +groupdata = config['project']['groups'] -def zip_peak_files(chips, PeakTools, PeakExtensions): - """Making input file names for FRiP""" - zipSample, zipTool, zipExt = [], [], [] - for chip in chips: - for PeakTool in PeakTools: - zipSample.append(chip) - zipTool.append(PeakTool) - zipExt.append(PeakExtensions[PeakTool]) - return(zipSample, zipTool, zipExt) +# ~~ directories +bin_path = join(workpath, "workflow", "bin") +diffbind_dir_block = join(workpath, "DiffBindBlock") +diffbind_dir2 = join(workpath, "DiffBind_block") +diffbind_dir = join(workpath, "DiffBind") +bam_dir = join(workpath, "bam") +qc_dir = join(workpath, "PeakQC") +idr_dir = join(workpath, "IDR") +memechip_dir = join(workpath, "MEME") +homer_dir = join(workpath, "HOMER_motifs") +uropa_dir = join(workpath, "UROPA_annotations") +manorm_dir = join(workpath, "MANorm") +downstream_dir = join(workpath, "Downstream") +otherDirs = [qc_dir, homer_dir, uropa_dir] +cfTool_dir = join(workpath, "cfChIPtool") +cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") -def calc_effective_genome_fraction(effectivesize, genomefile): - """ - calculate the effective genome fraction by calculating the - actual genome size from a .genome-like 
file and then dividing - the effective genome size by that number - """ - lines=list(map(lambda x:x.strip().split("\t"),open(genomefile).readlines())) - genomelen=0 - for chrom,l in lines: - if not "_" in chrom and chrom!="chrX" and chrom!="chrM" and chrom!="chrY": - genomelen+=int(l) - return(str(float(effectivesize)/ genomelen)) +# ~~ workflow switches +blocking = False if None in list(blocks.values()) else True +if reps == "yes": otherDirs.append(diffbind_dir) +mk_dir_if_not_exist(PeakTools + otherDirs) -# PREPARING TO DEAL WITH A VARIED SET OF PEAKCALL TOOLS -gem_dir = "gem" -macsB_dir = "macsBroad" -sicer_dir = "sicer" +# ~~ peak calling configuration and outputs PeakToolsNG = [ tool for tool in PeakTools if tool != "gem" ] - PeakExtensions = { 'macsNarrow': '_peaks.narrowPeak', 'macsBroad': '_peaks.broadPeak', @@ -106,71 +80,29 @@ RankColIDR = { 'macsBroad': 'q.value', 'sicer': 'q.value' } - - IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = outputIDR(groupswreps, groupdata, chip2input, PeakToolsNG) - zipSample, zipTool, zipExt = zip_peak_files(chips, PeakTools, PeakExtensions) - - -# CREATING DIRECTORIES -bam_dir='bam' -qc_dir='PeakQC' -idr_dir = 'IDR' -memechip_dir = "MEME" -homer_dir = "HOMER_motifs" -manorm_dir = "MANorm" -downstream_dir = "Downstream" - -otherDirs = [qc_dir, homer_dir, uropa_dir] -if reps == "yes": - # otherDirs.append(idr_dir) - otherDirs.append(diffbind_dir) - -for d in PeakTools + otherDirs: - if not os.path.exists(join(workpath,d)): - os.mkdir(join(workpath,d)) - - -# Blocking code -diffbind_dir2 = "DiffBind_block" -blocks=config['project']['blocks'] - -def test_for_block(contrast, blocks): - """ only want to run blocking on contrasts where all - individuals are on both sides of the contrast """ - contrastBlock = [ ] - for con in contrast: - group1 = con[0] - group2 = con[1] - block1 = [ blocks[sample] for sample in groupdata[group1] ] - block2 = [ blocks[sample] for sample in groupdata[group2] ] - if len(block1) == len(block2): - if 
len(set(block1).intersection(block2)) == len(block1): - contrastBlock.append(con) - return contrastBlock - - -contrastBlock = test_for_block(contrast,blocks) +contrastBlock = test_for_block(groupdata, contrast, blocks) zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools) +# ~~ rules rule diffbind: input: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: - html = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - Deseq2 = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), - EdgeR = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), - EdgeR_txt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), - Deseq2_txt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), - EdgeR_ftxt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), - Deseq2_ftxt = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), - html_block = provided(join(workpath,diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) + html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), + EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), + EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), 
+ Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), + EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), + Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), + html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) params: - rname="diffbind", - rscript = join(workpath,"workflow","scripts","DiffBind_v2_ChIPseq.Rmd"), - outdir = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}"), + rname = "diffbind", + rscript = join(workpath, "workflow", "scripts","DiffBind_v2_ChIPseq.Rmd"), + outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), contrast = "{group1}_vs_{group2}", csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"), pythonscript = join(workpath,"workflow","scripts","prep_diffbind.py"), @@ -212,15 +144,15 @@ if assay == "cfchip": input: lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] output: - txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), - bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), - bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), + txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), + bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), + bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), params: rname="uropa", uropaver = config['tools']['UROPAVER'], - fldr = join(workpath, uropa_dir, '{PeakTool1}'), - json = join(workpath, uropa_dir, 
'{PeakTool1}','{name}.{PeakTool2}.{type}.json'), - outroot = join(workpath, uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), + fldr = join(uropa_dir, '{PeakTool1}'), + json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'), + outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), gtf = config['references'][genome]['GTFFILE'], threads = 4, shell: """ @@ -244,15 +176,15 @@ else: input: lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] output: - txt=join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), - bed1=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), - bed2=temp(join(workpath, uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), + txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), + bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), + bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), params: rname="uropa", uropaver = config['tools']['UROPAVER'], - fldr = join(workpath, uropa_dir, '{PeakTool1}'), - json = join(workpath, uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'), - outroot = join(workpath, uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), + fldr = join(uropa_dir, '{PeakTool1}'), + json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'), + outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), gtf = config['references'][genome]['GTFFILE'], threads = 4, shell: """ diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index 1decc59..466d639 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -1,35 +1,26 @@ -# Helper functions -def get_input_bam(wildcards): - """ - Returns a ChIP samples input BAM file, - see chip2input for ChIP, Input pairs. 
- """ - input_sample = chip2input[wildcards.name] - if input_sample: - # Runs in a ChIP, input mode - return join(workpath, bam_dir, "{0}.Q5DD.bam".format(input_sample)) - else: - # Runs in ChIP-only mode - return [] +# Peak calling rules +# ~~~~ +# Peak-calling rules including: sortByRead, genrich, +# MACS2_narrow, MACS2_broad, SICER, MEME +from os.path import join +from scripts.peakcall import get_control_input, getMacTXT, getMacChip + + +# ~~ workflow configuration +workpath = config['project']['workpath'] +genome = config['options']['genome'] +paired_end = False if config['project']['nends'] == 1 else True +chip2input = config['project']['peaks']['inputs'] + +# Directory end points +bam_dir = join(workpath, "bam") +ppqt_dir = join(bam_dir, "ppqt") +genrich_dir = join(workpath, "Genrich") +macsN_dir = join(workpath, "macsNarrow") +macsB_dir = join(workpath, "macsBroad") +sicer_dir = join(workpath, "sicer") -def get_control_input(wildcards): - if paired_end and chip2input[wildcards.name] != "": - i = [ - join(workpath, bam_dir, "{0}.Q5DD.bam".format(chip2input[wildcards.name])) - ] - return i - elif paired_end and chip2input[wildcards.name] == "": - i = [] - return i - elif not paired_end and chip2input[wildcards.name] != "": - i = [ - join(workpath, bam_dir, "{0}.Q5DD_tagAlign.gz".format(chip2input[wildcards.name])) - ] - return i - else: - i = [] - return i rule sortByRead: """ @@ -41,9 +32,9 @@ rule sortByRead: Bam file sorted by read name (extension: sortByRead.bam) """ input: - join(workpath,bam_dir,"{name}.sorted.bam") + join(bam_dir, "{name}.sorted.bam") output: - temp(join(workpath,bam_dir,"{name}.sortedByRead.bam")) + temp(join(bam_dir, "{name}.sortedByRead.bam")) params: rname="sortByRead", samtools=config['tools']['SAMTOOLSVER'], @@ -69,9 +60,9 @@ rule genrich: summit -log(q-value), and summit position. 
""" input: - join(workpath,bam_dir,"{name}.sortedByRead.bam") + join(bam_dir, "{name}.sortedByRead.bam") output: - join(workpath,"Genrich","{name}","{name}.narrowPeak") + join(genrich_dir, "{name}", "{name}.narrowPeak") params: rname="genrich", genrich_ver=config['tools']['GENRICHVER'] @@ -92,13 +83,11 @@ rule genrich: # INDIVIDUAL RULES rule MACS2_narrow: input: - chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ - if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), - txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \ - if paired_end else join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt"), - c_option = get_control_input + chip = lambda w: getMacChip(bam_dir, w.name, paired_end), + txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: - join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"), + join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), params: rname='MACS2_narrow', gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], @@ -112,7 +101,7 @@ rule MACS2_narrow: -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ -n {wildcards.name} \\ - --outdir {workpath}/{macsN_dir}/{wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ -q 0.01 \\ --keep-dup="all" \\ -f "BAMPE" @@ -122,7 +111,7 @@ rule MACS2_narrow: -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ -n {wildcards.name} \\ - --outdir {workpath}/{macsN_dir}/{wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ -q 0.01 \\ --keep-dup="all" \\ --nomodel \\ @@ -132,13 +121,11 @@ rule MACS2_narrow: rule MACS2_broad: input: - chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ - if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), - txt = lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD.ppqt.txt") \ - if paired_end else join(workpath,bam_dir, ppqt_dir, 
w.name+".Q5DD_tagAlign.ppqt.txt"), - c_option = get_control_input + chip = lambda w: getMacChip(bam_dir, w.name, paired_end), + txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: - join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"), + join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), params: rname='MACS2_broad', gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], @@ -152,7 +139,7 @@ rule MACS2_broad: -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ -n {wildcards.name} \\ - --outdir {workpath}/{macsB_dir}/{wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ --broad \\ --broad-cutoff 0.01 \\ --keep-dup="all" \\ @@ -163,7 +150,7 @@ rule MACS2_broad: -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ -n {wildcards.name} \\ - --outdir {workpath}/{macsB_dir}/{wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ --broad \\ --broad-cutoff 0.01 \\ --keep-dup="all" \\ @@ -174,20 +161,18 @@ rule MACS2_broad: rule SICER: input: - chip = lambda w: join(workpath,bam_dir, w.name+".Q5DD.bam") \ - if paired_end else join(workpath,bam_dir, w.name+".Q5DD_tagAlign.gz"), - fragLen =lambda w: join(workpath,bam_dir, ppqt_dir, w.name+".Q5DD_tagAlign.ppqt.txt") if \ - not paired_end else join(workpath,"QC", w.name+".Q5DD.insert_size_metrics.txt"), - c_option = get_control_input + chip = lambda w: getSicerChips(bam_dir, w.name, paired_end), + fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: - bed = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"), + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), params: rname='SICER', sicerver=config['tools']['SICERVER'], bedtoolsver=config['tools']['BEDTOOLSVER'], genomever = config['options']['genome'], name="{name}", - 
sicer_dir=join(workpath,sicer_dir,"{name}"), + sicer_dir=join(sicer_dir,"{name}"), tmpdir=tmpdir, paired_end = paired_end, frac=config['references'][genome]['FRAC'], @@ -298,15 +283,15 @@ rule MEME: input: bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) output: - meme_out = join(workpath, "MEME", "{PeakTool}", "{name}_meme", "meme-chip.html"), - ame_out = join(workpath, "MEME", "{PeakTool}", "{name}_ame", "ame.html") + meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), + ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html") params: rname='MEME', ref_fa=config['references'][genome]['GENOME'], meme_vertebrates_db=config['references'][genome]['MEME_VERTEBRATES_DB'], meme_euk_db=config['references'][genome]['MEME_EUKARYOTE_DB'], meme_genome_db=config['references'][genome]['MEME_GENOME_DB'], - oc=join(workpath, "MEME", "{PeakTool}", "{name}"), + oc=join(MEME_dir, "{PeakTool}", "{name}"), tmpdir=tmpdir, outfa="{name}.fa", ntasks=int(28) diff --git a/workflow/rules/peakcall_qc.smk b/workflow/rules/peakcall_qc.smk deleted file mode 100644 index 5b41f4b..0000000 --- a/workflow/rules/peakcall_qc.smk +++ /dev/null @@ -1,46 +0,0 @@ -rule FRiP: - input: - bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], - bam = join(workpath,bam_dir,"{name}.Q5DD.bam"), - output: - join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), - params: - rname="frip", - outroot = lambda w: join(workpath,"PeakQC",w.PeakTool), - script=join(workpath,"workflow","scripts","frip.py"), - genome = config['references'][genome]['REFLEN'], - tmpdir = tmpdir, - container: config['images']['python'] - shell: """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - python {params.script} \\ - -p {input.bed} \\ - -b {input.bam} \\ - -g {params.genome} \\ - -o {params.outroot} - """ - -rule jaccard: - input: - lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], - output: - join(workpath,qc_dir,'{PeakTool}_jaccard.txt'), - params: - rname="jaccard", - outroot = lambda w: join(workpath,qc_dir,w.PeakTool), - script=join(workpath,"workflow","scripts","jaccard_score.py"), - genome = config['references'][genome]['REFLEN'] - envmodules: - config['tools']['BEDTOOLSVER'] - shell: """ - python {params.script} \\ - -i "{input}" \\ - -o "{params.outroot}" \\ - -g {params.genome} - """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 7ea370b..894c0f4 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,11 +1,25 @@ -# Common quality-control rules -# Includes the following: -# - preseq -# - NRF -# - rawfastqc -# - fastqc -# - fastq_screen -# - multiQC +# Quality control rules +# ~~~~ +# Common quality-control rules: preseq, NRF, rawfastqc, +# fastqc, fastq_screen, multiQC +from os.path import join +from scripts.common import get_bam_ext, get_fqscreen_outputs + + +# ~~ workflow configuration +workpath = config['project']['workpath'] +genome = config['options']['genome'] +paired_end = False if config['project']['nends'] == 1 else True +samples = config['samples'] +ends = [1] if not paired_end else [1, 2] + +# ~~ directories +qc_dir = join(workpath, "QC") +kraken_dir = join(workpath, 'kraken') +deeptools_dir = join(workpath, 'deeptools') +extra_fingerprint_dir = join(deeptools_dir, 'sorted_fingerprint') + + rule preseq: """ Quality step to estimate library complexity. 
Low library complexity may indicate @@ -17,19 +31,20 @@ rule preseq: Logfile containing library complexity information """ input: - bam = join(workpath,bam_dir,"{name}.sorted.bam"), + bam = join(bam_dir, "{name}.sorted.bam"), output: - ccurve = join(workpath,qc_dir,"{name}.ccurve"), + ccurve = join(qc_dir, "{name}.ccurve"), params: - rname = "preseq", - preseqver=config['tools']['PRESEQVER'], - shell: """ - module load {params.preseqver}; - preseq c_curve \\ - -B \\ - -o {output.ccurve} \\ - {input.bam} - """ + rname = "preseq", + preseqver = config['tools']['PRESEQVER'], + shell: + """ + module load {params.preseqver}; + preseq c_curve \\ + -B \\ + -o {output.ccurve} \\ + {input.bam} + """ rule NRF: @@ -46,34 +61,35 @@ rule NRF: PBC1 = one_pair/distinct_reads, and PBC2 = one_pair/two_pair. """ input: - bam=join(workpath,bam_dir,"{name}.sorted.bam"), + bam = join(bam_dir, "{name}.sorted.bam"), output: - preseq=join(workpath,qc_dir,"{name}.preseq.dat"), - preseqlog=join(workpath,qc_dir,"{name}.preseq.log"), - nrf=temp(join(workpath,qc_dir,"{name}.nrf")), + preseq = join(qc_dir, "{name}.preseq.dat"), + preseqlog = join(qc_dir, "{name}.preseq.log"), + nrf = temp(join(qc_dir, "{name}.nrf")), params: - rname='NRF', - samtoolsver=config['tools']['SAMTOOLSVER'], - rver=config['tools']['RVER'], - preseqver=config['tools']['PRESEQVER'], - nrfscript=join(workpath,"workflow","scripts","atac_nrf.py "), + rname = 'NRF', + samtoolsver = config['tools']['SAMTOOLSVER'], + rver = config['tools']['RVER'], + preseqver = config['tools']['PRESEQVER'], + nrfscript = join(workpath, "workflow", "scripts", "atac_nrf.py"), threads: 16 - shell: """ - module load {params.preseqver}; - preseq lc_extrap \\ - -P \\ - -B \\ - -D \\ - -o {output.preseq} \\ - {input.bam} \\ - -seed 12345 \\ - -v \\ - -l 100000000000 \\ - 2> {output.preseqlog} - python {params.nrfscript} \\ - {output.preseqlog} \\ - > {output.nrf} - """ + shell: + """ + module load {params.preseqver}; + preseq lc_extrap \\ + -P \\ + 
-B \\ + -D \\ + -o {output.preseq} \\ + {input.bam} \\ + -seed 12345 \\ + -v \\ + -l 100000000000 \\ + 2> {output.preseqlog} + python {params.nrfscript} \\ + {output.preseqlog} \\ + > {output.nrf} + """ rule rawfastqc: @@ -87,44 +103,34 @@ rule rawfastqc: FastQC report and zip file containing data quality information """ input: - expand(join(workpath,"{name}.R1.fastq.gz"), name=samples) if \ - not paired_end else \ - expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2]) + expand(join(workpath, "{name}.R{rn}.fastq.gz"), name=samples, rn=list(map(str, ends))) output: - expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(qc_dir, "rawfastQC", "{name}.R{rn}_fastqc.html"), name=samples, rn=ends), params: - rname='rawfastqc', - outdir=join(workpath,"rawfastQC"), - tmpdir=tmpdir, + rname = 'rawfastqc', + outdir = join(qc_dir, "rawfastQC"), + tmpdir = tmpdir, envmodules: config['tools']['FASTQCVER'] threads: int(allocated("threads", "rawfastqc", cluster)) - shell: """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + shell: + """ + # fastqc storage on lscratch b/c nfs bug + if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - # Running fastqc with local - # disk or a tmpdir, fastqc - # has been observed to lock - # up gpfs filesystems, adding - # this on request by HPC staff - fastqc \\ - {input} \\ - -t {threads} \\ - -o "${{tmp}}" - - # Copy output files from tmpdir - # to output directory - find "${{tmp}}" \\ - -type f \\ - \\( -name '*.html' -o -name '*.zip' \\) \\ - -exec cp {{}} {params.outdir} \\; - """ + fastqc \\ + {input} \\ + -t {threads} \\ + -o "${{tmp}}" + + find "${{tmp}}" \\ + -type f \\ + \\( -name '*.html' -o -name '*.zip' \\) \\ + -exec cp {{}} {params.outdir} \\; + """ rule fastqc: @@ -138,15 +144,13 @@ rule fastqc: Trimmed FastQC reports and zip file containing data quality information """ input: - expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \ - not paired_end else \ - expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2]) + expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) output: - expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples), + expand(join(qc_dir, 'fastQC', "{name}.R{rn}.trim_fastqc.html"), name=samples, rn=ends), params: - rname='fastqc', - outdir=join(workpath,"fastQC"), - tmpdir=tmpdir, + rname = 'fastqc', + outdir = join(qc_dir, "fastQC"), + tmpdir = tmpdir, envmodules: config['tools']['FASTQCVER'] threads: @@ -189,51 +193,42 @@ rule fastq_screen: FastQ Screen report and logfiles """ input: - join(workpath,trim_dir,"{name}.R1.trim.fastq.gz") if not paired_end else \ - expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"),name=samples,rn=[1,2]) + expand(join(trim_dir, "{name}.R{rn}.trim.fastq.gz"), name=samples, rn=ends) output: - join(workpath,"FQscreen","{name}.R1.trim_screen.txt") if not paired_end else \ - expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), - 
join(workpath,"FQscreen","{name}.R1.trim_screen.png") if not paired_end else \ - expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), - join(workpath,"FQscreen2","{name}.R1.trim_screen.txt") if not paired_end else \ - expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), - join(workpath,"FQscreen2","{name}.R1.trim_screen.png") if not paired_end else \ - expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), + get_fqscreen_outputs(paired_end, samples, qc_dir) params: - rname = 'fqscreen', - outdir = join(workpath,"FQscreen"), - outdir2 = join(workpath,"FQscreen2"), - # Exposed Parameters: modify resources/fastq_screen{_2}.conf - # to change defaults locations to bowtie2 indices - fastq_screen = config['bin']['FASTQ_SCREEN'], - fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], - fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], + rname = 'fqscreen', + outdir = join(qc_dir, "FQscreen"), + outdir2 = join(qc_dir, "FQscreen2"), + fastq_screen = config['bin']['FASTQ_SCREEN'], + fastq_screen_config1 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P1'], + fastq_screen_config2 = config['shared_resources']['FASTQ_SCREEN_CONFIG_P2'], envmodules: config['tools']['BOWTIE2VER'], config['tools']['PERLVER'], threads: int(allocated("threads", "fastq_screen", cluster)) - shell: """ - # First pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config1} \\ - --outdir {params.outdir} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - # Second pass of contamination screening - {params.fastq_screen} \\ - --conf {params.fastq_screen_config2} \\ - --outdir {params.outdir2} \\ - --threads {threads} \\ - --subset 1000000 \\ - --aligner bowtie2 \\ - --force \\ - {input} - """ + shell: + """ + # First pass of contamination screening + 
{params.fastq_screen} \\ + --conf {params.fastq_screen_config1} \\ + --outdir {params.outdir} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + # Second pass of contamination screening + {params.fastq_screen} \\ + --conf {params.fastq_screen_config2} \\ + --outdir {params.outdir2} \\ + --threads {threads} \\ + --subset 1000000 \\ + --aligner bowtie2 \\ + --force \\ + {input} + """ rule kraken: """ @@ -247,52 +242,54 @@ rule kraken: Kraken logfile and interative krona report """ input: - fq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), - fq2=provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) + fq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), + fq2 = provided(join(trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) output: - krakenout = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.out.txt"), - krakentaxa = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.taxa.txt"), - kronahtml = join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.krona.html"), + krakenout = join(kraken_dir, "{name}.trim.kraken_bacteria.out.txt"), + krakentaxa = join(kraken_dir, "{name}.trim.kraken_bacteria.taxa.txt"), + kronahtml = join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), params: - rname='kraken', - outdir=join(workpath,kraken_dir), - bacdb=config['shared_resources']['KRAKENBACDB'], - tmpdir=tmpdir, - paired_end = paired_end - threads: int(allocated("threads", "kraken_pe", cluster)), + rname = 'kraken', + outdir = kraken_dir, + bacdb = config['shared_resources']['KRAKENBACDB'], + tmpdir = tmpdir, + paired_end = paired_end + threads: + int(allocated("threads", "kraken_pe", cluster)), envmodules: config['tools']['KRAKENVER'], config['tools']['KRONATOOLSVER'], - shell: """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - # Copy kraken2 db to /lscratch or temp - # location to reduce filesystem strain - cp -rv {params.bacdb} ${{tmp}}/; - kdb_base=$(basename {params.bacdb}) - if [ '{params.paired_end}' == True ]; then - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - --paired {input.fq1} {input.fq2} - else - kraken2 --db ${{tmp}}/${{kdb_base}} \\ - --threads {threads} --report {output.krakentaxa} \\ - --output {output.krakenout} \\ - --gzip-compressed \\ - {input.fq1} - fi - - # Generate Krona Report - cut -f2,3 {output.krakenout} | \\ - ktImportTaxonomy - -o {output.kronahtml} - """ + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + # Copy kraken2 db to /lscratch or temp + # location to reduce filesystem strain + cp -rv {params.bacdb} ${{tmp}}/; + kdb_base=$(basename {params.bacdb}) + if [ '{params.paired_end}' == True ]; then + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + --paired {input.fq1} {input.fq2} + else + kraken2 --db ${{tmp}}/${{kdb_base}} \\ + --threads {threads} --report {output.krakentaxa} \\ + --output {output.krakenout} \\ + --gzip-compressed \\ + {input.fq1} + fi + + # Generate Krona Report + cut -f2,3 {output.krakenout} | \\ + ktImportTaxonomy - -o {output.kronahtml} + """ rule multiqc: """ @@ -306,25 +303,22 @@ rule multiqc: Interactive MulitQC report and a QC metadata table """ input: - expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), - 
expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), - expand(join(workpath,kraken_dir,"{name}.trim.kraken_bacteria.krona.html"),name=samples), - expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples), - expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples), - # join(workpath,qc_dir,"QCTable.txt"), - expand(join(workpath,"rawfastQC","{name}.R1_fastqc.html"),name=samples), - expand(join(workpath,"fastQC","{name}.R1.trim_fastqc.html"),name=samples), - # expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), - join(workpath,deeptools_dir,"spearman_heatmap.Q5DD_mqc.png") + expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples), + expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples), + expand(join(qc_dir, "{name}.ccurve"), name=samples), + expand(join(qc_dir, "rawfastQC", "{name}.R1_fastqc.html"), name=samples), + expand(join(qc_dir, "fastQC", "{name}.R1.trim_fastqc.html"), name=samples), + expand(join(kraken_dir, "{name}.trim.kraken_bacteria.krona.html"), name=samples), + expand(join(bam_dir, "{name}.Q5DD.bam.flagstat"), name=samples), + expand(join(bam_dir, "{name}.Q5.bam.flagstat"), name=samples), + join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") output: - join(workpath,"multiqc_report.html") + join(workpath, "multiqc_report.html") params: - rname="multiqc", - multiqc=config['tools']['MULTIQCVER'], - qcconfig=join(workpath, config['shared_resources']['MULTIQC_CONFIG']), - excludedir=join(workpath,extra_fingerprint_dir), - dir=workpath + rname = "multiqc", + multiqc = config['tools']['MULTIQCVER'], + qcconfig = join(workpath, config['shared_resources']['MULTIQC_CONFIG']), + excludedir = join(workpath, extra_fingerprint_dir), shell: """ module load {params.multiqc} multiqc \\ @@ -333,9 +327,10 @@ rule multiqc: --interactive \\ -e cutadapt \\ --ignore {params.excludedir} 
\\ - -d {params.dir} + -d """ + workpath + """ """ + rule insert_size: """ Quality step calculates number of reads per insert size. @@ -347,31 +342,32 @@ rule insert_size: Number of reads per insert size and their histogram """ input: - bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensionsDict[w.ext]) + bam = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end)) output: - txt= join(workpath,qc_dir,"{name}.{ext}.insert_size_metrics.txt"), - pdf= join(workpath,qc_dir,"{name}.{ext}.insert_size_histogram.pdf"), + txt = join(qc_dir, "{name}.{ext}.insert_size_metrics.txt"), + pdf = join(qc_dir, "{name}.{ext}.insert_size_histogram.pdf"), params: - rname="insert_size", - picardver=config['tools']['PICARDVER'], - rver=config['tools']['RVER'], - javaram='16g', - shell: """ - module load {params.picardver} {params.rver}; - java -Xmx{params.javaram} -jar ${{PICARDJARPATH}}/picard.jar CollectInsertSizeMetrics \\ - -I {input.bam} \\ - -O {output.txt} \\ - -H {output.pdf} - """ + rname = "insert_size", + picardver = config['tools']['PICARDVER'], + rver = config['tools']['RVER'], + javaram = '16g', + shell: + """ + module load {params.picardver} {params.rver}; + java -Xmx{params.javaram} -jar ${{PICARDJARPATH}}/picard.jar CollectInsertSizeMetrics \\ + -I {input.bam} \\ + -O {output.txt} \\ + -H {output.pdf} + """ rule deeptools_QC: input: [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs output: - heatmap=join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.pdf"), - pca=join(workpath,deeptools_dir,"pca.Q5DD.pdf"), - npz=temp(join(workpath,deeptools_dir,"Q5DD.npz")), - png=join(workpath,deeptools_dir,"spearman_heatmap.Q5DD_mqc.png") + heatmap=join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), + pca=join(deeptools_dir, "pca.Q5DD.pdf"), + npz=temp(join(deeptools_dir, "Q5DD.npz")), + png=join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") params: rname="deeptools_QC", 
deeptoolsver=config['tools']['DEEPTOOLSVER'], @@ -384,3 +380,49 @@ rule deeptools_QC: plotPCA -in {output.npz} -o {output.pca} """ +rule FRiP: + input: + bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], + bam = join(bam_dir, "{name}.Q5DD.bam"), + output: + join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), + params: + rname="frip", + outroot = lambda w: join(workpath,"PeakQC",w.PeakTool), + script=join(workpath,"workflow","scripts","frip.py"), + genome = config['references'][genome]['REFLEN'], + tmpdir = tmpdir, + container: config['images']['python'] + shell: """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + python {params.script} \\ + -p {input.bed} \\ + -b {input.bam} \\ + -g {params.genome} \\ + -o {params.outroot} + """ + +rule jaccard: + input: + lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], + output: + join(qc_dir, '{PeakTool}_jaccard.txt'), + params: + rname="jaccard", + outroot = lambda w: join(qc_dir, w.PeakTool), + script=join(workpath,"workflow","scripts","jaccard_score.py"), + genome = config['references'][genome]['REFLEN'] + envmodules: + config['tools']['BEDTOOLSVER'] + shell: """ + python {params.script} \\ + -i "{input}" \\ + -o "{params.outroot}" \\ + -g {params.genome} + """ diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 078701b..0f1b458 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -1,63 +1,28 @@ -# Common paired-end rules: -# - trim_pe -# - kraken_pe -# - BWA_PE -# - insert_size - -def dedup_out7(input, assay, paired_end): - if assay == "cfchip": - i = [ - - input+".Q5DD_tagAlign" - ] - return i - elif paired_end == False and 
assay == "chip": - i = [ - input+".Q5DD_tagAlign.gz" - ] - return i - else: - i = [] - return i +# Trimming, alignment, and redundancy reduction rules +# ~~~~ +# Common paired-end rules: trim_pe, kraken_pe, BWA_PE, insert_size +import snakemake +from os.path import join +from scripts.common import allocated, get_bam_ext +from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input +from scripts.blocking import ctrl_test +# ~~ workflow configuration +workpath = config['project']['workpath'] +genome = config['options']['genome'] +paired_end = False if config['project']['nends'] == 1 else True +ends = [1] if not paired_end else [1, 2] +chip2input = config['project']['peaks']['inputs'] -def get_ppqt_input(wildcards): - if paired_end: - i = [ - join(workpath, bam_dir, ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, wildcards.ext)) - ] - return i - else: - if wildcards.ext == "Q5DD": - i = [ - join(workpath, bam_dir, ppqt_dir, "{0}.Q5DD_tagAlign.ppqt.txt".format(wildcards.name)) - ] - return i - elif wildcards.ext == "sorted": - i = [ - join(workpath, bam_dir, ppqt_dir, "{0}.sorted.ppqt.txt".format(wildcards.name)) - ] - return i +# ~~ directories +trim_dir = join(workpath, 'trim') +tmpdir = config['options']['tmp_dir'] +bam_dir = join(workpath, "bam") +bw_dir = join(workpath, "bigwig") +qc_dir = join(workpath, "QC") +ppqt_dir = join(bam_dir, "ppqt") -def get_bam_input(wildcards): - if paired_end: - i = [ - join(workpath, bam_dir, "{0}.{1}.bam".format(wildcards.name, wildcards.ext)) - ] - return i - else: - if wildcards.ext == "Q5DD": - i = [ - join(workpath, bam_dir, "{0}.Q5DD.bam".format(wildcards.name)) - ] - return i - elif wildcards.ext == "sorted": - i = [ - join(workpath, bam_dir, "{0}.sorted.bam".format(wildcards.name)) - ] - return i - rule trim: """ Data-processing step to remove adapter sequences and perform quality trimming @@ -76,119 +41,119 @@ rule trim: Trimmed and blacklist-sequences-free FastQ files """ input: - 
file1=join(workpath,"{name}.R1.fastq.gz"), - file2=provided(join(workpath,"{name}.R2.fastq.gz"),paired_end) + file1 = join(workpath, "{name}.R1.fastq.gz"), + file2 = provided(join(workpath,"{name}.R2.fastq.gz"), paired_end) output: - outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")), - outfq2=provided(temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")),paired_end) + outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), + outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) params: - rname="trim", - cutadaptver=config['tools']['CUTADAPTVER'], - workpath=config['project']['workpath'], - fastawithadaptersetd=join(workpath, config['shared_resources']['ADAPTERS_FASTA']), - blacklistbwaindex=config['references'][genome]['BLACKLISTBWAINDEX'], - picardver=config['tools']['PICARDVER'], - bwaver=config['tools']['BWAVER'], - samtoolsver=config['tools']['SAMTOOLSVER'], - minlen=35, - leadingquality=10, - trailingquality=10, - javaram="64g", - sample="{name}", - tmpdir=tmpdir, - paired_end = paired_end - threads: 16 - shell: """ - module load {params.cutadaptver}; - module load {params.bwaver}; - module load {params.samtoolsver}; - module load {params.picardver}; - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + rname = "trim", + cutadaptver = config['tools']['CUTADAPTVER'], + workpath = config['project']['workpath'], + fastawithadaptersetd = join(workpath, config['shared_resources']['ADAPTERS_FASTA']), + blacklistbwaindex = config['references'][genome]['BLACKLISTBWAINDEX'], + picardver = config['tools']['PICARDVER'], + bwaver = config['tools']['BWAVER'], + samtoolsver = config['tools']['SAMTOOLSVER'], + minlen = 35, + leadingquality = 10, + trailingquality = 10, + javaram = "64g", + sample = "{name}", + tmpdir = tmpdir, + paired_end = paired_end + threads: + 16 + shell: + """ + module load {params.cutadaptver}; + module load {params.bwaver}; + module load {params.samtoolsver}; + module load {params.picardver}; + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - if [ '{params.paired_end}' == True ];then - cutadapt \\ - --pair-filter=any \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen}:{params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -B file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ - {input.file1} {input.file2} - - if [ "{params.blacklistbwaindex}" != "" ]; - then bwa mem -t {threads} \\ - {params.blacklistbwaindex} \\ - ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ - | samtools view -@{threads} \\ - -f4 \\ - -b \\ - -o ${{tmp}}/{params.sample}.bam; - rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; - rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz; - - java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ - -VALIDATION_STRINGENCY SILENT \\ - -INPUT ${{tmp}}/{params.sample}.bam \\ - 
-FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\ - -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\ - -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq + if [ '{params.paired_end}' == True ]; then + cutadapt \\ + --pair-filter=any \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen}:{params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -B file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + -p ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ + {input.file1} {input.file2} - rm ${{tmp}}/{params.sample}.bam; - - pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; - pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq; - fi - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; - mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; - else - cutadapt \\ - --nextseq-trim=2 \\ - --trim-n \\ - -n 5 \\ - -O 5 \\ - -q {params.leadingquality},{params.trailingquality} \\ - -m {params.minlen} \\ - -b file:{params.fastawithadaptersetd} \\ - -j {threads} \\ - -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - {input.file1} - - if [ "{params.blacklistbwaindex}" != "" ]; - then bwa mem -t {threads} \\ - {params.blacklistbwaindex} \\ - ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ - | samtools view -@{threads} \\ - -f4 \\ - -b \\ - -o ${{tmp}}/{params.sample}.bam; - rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; - - java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ - -VALIDATION_STRINGENCY SILENT \\ - -INPUT ${{tmp}}/{params.sample}.bam \\ - -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq + if [ "{params.blacklistbwaindex}" != "" ]; + then bwa mem -t {threads} \\ + {params.blacklistbwaindex} \\ + ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + ${{tmp}}/{params.sample}.R2.trim.fastq.gz \\ + | samtools view -@{threads} \\ + -f4 \\ + -b \\ + -o ${{tmp}}/{params.sample}.bam; + rm 
${{tmp}}/{params.sample}.R1.trim.fastq.gz; + rm ${{tmp}}/{params.sample}.R2.trim.fastq.gz; - rm ${{tmp}}/{params.sample}.bam; - - pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; + java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ + -VALIDATION_STRINGENCY SILENT \\ + -INPUT ${{tmp}}/{params.sample}.bam \\ + -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq \\ + -SECOND_END_FASTQ ${{tmp}}/{params.sample}.R2.trim.fastq \\ + -UNPAIRED_FASTQ ${{tmp}}/{params.sample}.unpaired.noBL.fastq + + rm ${{tmp}}/{params.sample}.bam; + + pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; + pigz -p {threads} ${{tmp}}/{params.sample}.R2.trim.fastq; + fi + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; + mv ${{tmp}}/{params.sample}.R2.trim.fastq.gz {output.outfq2}; + else + cutadapt \\ + --nextseq-trim=2 \\ + --trim-n \\ + -n 5 \\ + -O 5 \\ + -q {params.leadingquality},{params.trailingquality} \\ + -m {params.minlen} \\ + -b file:{params.fastawithadaptersetd} \\ + -j {threads} \\ + -o ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + {input.file1} + + if [ "{params.blacklistbwaindex}" != "" ]; + then bwa mem -t {threads} \\ + {params.blacklistbwaindex} \\ + ${{tmp}}/{params.sample}.R1.trim.fastq.gz \\ + | samtools view -@{threads} \\ + -f4 \\ + -b \\ + -o ${{tmp}}/{params.sample}.bam; + rm ${{tmp}}/{params.sample}.R1.trim.fastq.gz; + + java -Xmx{params.javaram} -jar $PICARDJARPATH/picard.jar SamToFastq \\ + -VALIDATION_STRINGENCY SILENT \\ + -INPUT ${{tmp}}/{params.sample}.bam \\ + -FASTQ ${{tmp}}/{params.sample}.R1.trim.fastq + + rm ${{tmp}}/{params.sample}.bam; + + pigz -p {threads} ${{tmp}}/{params.sample}.R1.trim.fastq; + fi + mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; fi - mv ${{tmp}}/{params.sample}.R1.trim.fastq.gz {output.outfq1}; - fi """ + rule BWA: """ Data processing rule to align trimmed and blacklist-sequences-free reads @@ -205,56 +170,54 @@ rule BWA: Bam file that has reads aligned and filted by mapQ a 
value: Q5.bam """ input: - infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), - infq2 = provided(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), paired_end) + infq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), + infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz"), params: - d=join(workpath,bam_dir), - rname='bwa', - reference=config['references'][genome]['BWA'], - bwaver=config['tools']['BWAVER'], - samtoolsver=config['tools']['SAMTOOLSVER'], - script=join(workpath,"workflow","scripts","bam_filter_by_mapq.py"), - pythonver=config['tools']['PYTHONVER'], - paired_end = paired_end + d = join(bam_dir), + rname = 'bwa', + reference = config['references'][genome]['BWA'], + bwaver = config['tools']['BWAVER'], + samtoolsver = config['tools']['SAMTOOLSVER'], + script = join(workpath, "bin", "bam_filter_by_mapq.py"), + pythonver = config['tools']['PYTHONVER'], output: - outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), - outbam2=temp(join(join(workpath,bam_dir,"{name}.Q5.bam"))), - flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), - idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), - flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), - idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"), + outbam1 = join(bam_dir, "{name}.sorted.bam"), + outbam2 = temp(join(bam_dir, "{name}.Q5.bam")), + flagstat1 = join(bam_dir, "{name}.sorted.bam.flagstat"), + idxstat1 = join(bam_dir, "{name}.sorted.bam.idxstat"), + flagstat2 = join(bam_dir, "{name}.Q5.bam.flagstat"), + idxstat2 = join(bam_dir, "{name}.Q5.bam.idxstat"), threads: 32 - shell: """ - module load {params.bwaver}; - module load {params.samtoolsver}; - module load {params.pythonver}; - if [ '{params.paired_end}' == True ];then - bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ - | samtools sort -@{threads} -o {output.outbam1} - - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > 
{output.idxstat1} - #samtools view -b -q 6 {output.outbam1} -o {output.outbam2} - - python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} - else - bwa mem -t {threads} {params.reference} {input.infq1} \\ - | samtools sort -@{threads} -o {output.outbam1} - - samtools index {output.outbam1} - samtools flagstat {output.outbam1} > {output.flagstat1} - samtools idxstats {output.outbam1} > {output.idxstat1} - samtools view -b -q 6 {output.outbam1} -o {output.outbam2} - - samtools index {output.outbam2} - samtools flagstat {output.outbam2} > {output.flagstat2} - samtools idxstats {output.outbam2} > {output.idxstat2} - fi - """ + shell: + """ + module load {params.bwaver}; + module load {params.samtoolsver}; + module load {params.pythonver}; + if [ '""" + str(paired_end) + """' == True ];then + bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} \\ + | samtools sort -@{threads} -o {output.outbam1} + + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + + python {params.script} -i {output.outbam1} -o {output.outbam2} -q 6 + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} + else + bwa mem -t {threads} {params.reference} {input.infq1} \\ + | samtools sort -@{threads} -o {output.outbam1} + samtools index {output.outbam1} + samtools flagstat {output.outbam1} > {output.flagstat1} + samtools idxstats {output.outbam1} > {output.idxstat1} + samtools view -b -q 6 {output.outbam1} -o {output.outbam2} + + samtools index {output.outbam2} + samtools flagstat {output.outbam2} > {output.flagstat2} + samtools idxstats {output.outbam2} > {output.idxstat2} + fi + """ rule dedup: """ @@ -273,125 +236,124 @@ rule dedup: 
Deduplicated Q5DD.bam for all assays, plus Q5DD.tagAlign if cfchip assay """ - input: - bam2=join(workpath,bam_dir,"{name}.Q5.bam") + input: + bam2 = join(bam_dir,"{name}.Q5.bam") output: - out5=join(workpath,bam_dir,"{name}.Q5DD.bam"), - out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), - out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), - out6=provided(join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), paired_end), - out7=dedup_out7(join(workpath,bam_dir,"{name}"), assay, paired_end) + out5 = join(workpath,bam_dir,"{name}.Q5DD.bam"), + out5f = join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + out5i = join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), + out6 = provided(join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), paired_end), + out7 = dedup_out7(join(workpath,bam_dir,"{name}"), assay, paired_end) params: - rname='dedup', - picardver=config['tools']['PICARDVER'], - samtoolsver=config['tools']['SAMTOOLSVER'], - bedtoolsver=config['tools']['BEDTOOLSVER'], - macsver=config['tools']['MACSVER'], - gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], - folder=join(workpath,bam_dir), - genomefile=config['references'][genome]['REFLEN'], - rver=config['tools']['RVER'], - javaram='16g', - tmpdir=tmpdir, - tmpBam="{name}.Q5DD.withXY.bam", - rscript=join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"), - paired_end = paired_end - shell: """ - module load {params.samtoolsver}; - module load {params.picardver}; - module load {params.bedtoolsver}; - module load {params.macsver}; - module load {params.rver}; - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT - - if [ "{assay}" == "cfchip" ];then - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ - -O {params.tmpBam} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; - samtools index {params.tmpBam}; - samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; - Rscript {params.rscript} {params.tmpBam} {output.out7}; - rm {params.tmpBam} {params.tmpBam}.bai; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; - elif [ '{params.paired_end}' == False ];then - macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; - awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; - awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; - bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; - bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5}; - gzip ${{tmp}}/TmpTagAlign3; - mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f} - samtools idxstats {output.out5} > {output.out5i} - else - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ - -O {output.out5} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; - fi - """ + rname = 'dedup', + picardver = config['tools']['PICARDVER'], + 
samtoolsver = config['tools']['SAMTOOLSVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + macsver = config['tools']['MACSVER'], + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + folder = join(workpath,bam_dir), + genomefile = config['references'][genome]['REFLEN'], + rver = config['tools']['RVER'], + javaram = '16g', + tmpdir = tmpdir, + tmpBam = "{name}.Q5DD.withXY.bam", + rscript = join(config['references'][genome]['cfChIP_TOOLS_SRC'], "bam2fragment.R"), + shell: + """ + module load {params.samtoolsver}; + module load {params.picardver}; + module load {params.bedtoolsver}; + module load {params.macsver}; + module load {params.rver}; + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + if [ "{assay}" == "cfchip" ];then + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input.bam2} \\ + -O {params.tmpBam} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true \\ + -METRICS_FILE {output.out6}; + samtools index {params.tmpBam}; + samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; + Rscript {params.rscript} {params.tmpBam} {output.out7}; + rm {params.tmpBam} {params.tmpBam}.bai; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; + elif [ '""" + str(paired_end) + """' == False ];then + macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; + awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; + awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; + bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; + bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o 
{output.out5}; + gzip ${{tmp}}/TmpTagAlign3; + mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f} + samtools idxstats {output.out5} > {output.out5i} + else + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input.bam2} \\ + -O {output.out5} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true \\ + -METRICS_FILE {output.out6}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; + fi + """ rule ppqt: input: - bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensionsDict[w.ext]) + bam = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end)) output: - ppqt= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt"), - pdf= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.pdf"), - txt= join(workpath,bam_dir,ppqt_dir,"{name}.{ext}.ppqt.txt"), + ppqt = join(ppqt_dir, "{name}.{ext}.ppqt"), + pdf = join(ppqt_dir, "{name}.{ext}.pdf"), + txt = join(ppqt_dir, "{name}.{ext}.ppqt.txt"), params: - rname="ppqt", - samtoolsver=config['tools']['SAMTOOLSVER'], - rver=config['tools']['RVER'], - scriptPy=join(workpath,"workflow","scripts","ppqt_process.py"), - inputSample=(lambda w: w.name in uniq_inputs), - tmpdir=tmpdir, - paired_end = paired_end, - file_name = "{name}" + rname = "ppqt", + samtoolsver = config['tools']['SAMTOOLSVER'], + rver = config['tools']['RVER'], + scriptPy = join(workpath, "bin", "ppqt_process.py"), + tmpdir = tmpdir, + file_name = "{name}" container: config['images']['ppqt'] - shell: """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + shell: + """ + if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - if [ '{params.paired_end}' == True ]; then - samtools view -b \\ - -f 66 \\ - -o ${{tmp}}/bam1.f66.bam {input.bam}; - samtools index ${{tmp}}/bam1.f66.bam; - run_spp.R -c=${{tmp}}/bam1.f66.bam \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=${{tmp}} -rf - else - if [[ '{input.bam}' == *.gz ]]; then - ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; - run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf + if [ '""" + str(paired_end) + """' == True ]; then + samtools view -b \\ + -f 66 \\ + -o ${{tmp}}/bam1.f66.bam {input.bam}; + samtools index ${{tmp}}/bam1.f66.bam; + run_spp.R -c=${{tmp}}/bam1.f66.bam \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=${{tmp}} -rf else - run_spp.R -c={input.bam} \ - -savp={output.pdf} -out={output.ppqt} \ - -tmpdir=/lscratch/$SLURM_JOBID -rf + if [[ '{input.bam}' == *.gz ]]; then + ln -s {input.bam} ${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz; + run_spp.R -c=${{tmp}}/{params.file_name}.Q5DD.tagAlign.gz \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + else + run_spp.R -c={input.bam} \ + -savp={output.pdf} -out={output.ppqt} \ + -tmpdir=/lscratch/$SLURM_JOBID -rf + fi fi - fi - python {params.scriptPy} -i {output.ppqt} -o {output.txt} - """ + python {params.scriptPy} -i {output.ppqt} -o {output.txt} + """ rule bam2bw: """ @@ -405,42 +367,46 @@ rule bam2bw: an associated score, RPGC """ input: - bam = get_bam_input, - ppqt = get_ppqt_input + bam = lambda w: get_bam_input(bam_dir, w, paired_end), + ppqt = lambda w: get_ppqt_input(ppqt_dir, w, paired_end), output: - outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"), + outbw = join(bw_dir, "{name}.{ext}.RPGC.bw"), params: - rname="bam2bw", - effectivegenomesize=config['references'][genome]['EFFECTIVEGENOMESIZE'], 
- paired_end = paired_end, - tmpdir=tmpdir, - name = "{name}" - threads: int(allocated("threads", "bam2bw", cluster)), - envmodules: config['tools']['DEEPTOOLSVER'], - shell: """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + rname = "bam2bw", + name = "{name}", + effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + paired_end = paired_end, + tmpdir = tmpdir, + threads: + int(allocated("threads", "bam2bw", cluster)), + envmodules: + config['tools']['DEEPTOOLSVER'], + shell: + """ + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + bam_cov_option={input.ppqt} + if [ '{params.paired_end}' == False ]; then + ppqt_len=$(awk '{{print $1}}' {input.ppqt}) + bam_cov_option="-e ${{ppqt_len}}" + else + bam_cov_option=--centerReads + fi + echo "printing out value of bam-cov-option $bam_cov_option" + + bamCoverage \\ + --bam {input.bam} \\ + -o {output.outbw} \\ + --binSize 25 \\ + --smoothLength 75 \\ + --numberOfProcessors {threads} \\ + --normalizeUsing RPGC \\ + --effectiveGenomeSize {params.effectivegenomesize} \\ + ${{bam_cov_option}}; + """ - bam_cov_option={input.ppqt} - if [ '{params.paired_end}' == False ]; then - ppqt_len=$(awk '{{print $1}}' {input.ppqt}) - bam_cov_option="-e ${{ppqt_len}}" - else - bam_cov_option=--centerReads - fi - echo "printing out value of bam-cov-option $bam_cov_option" - - bamCoverage \\ - --bam {input.bam} \\ - -o {output.outbw} \\ - --binSize 25 \\ - --smoothLength 75 \\ - --numberOfProcessors {threads} \\ - --normalizeUsing RPGC \\ - --effectiveGenomeSize {params.effectivegenomesize} \\ - ${{bam_cov_option}}; - """ rule inputnorm: """ @@ -453,22 +419,26 @@ rule inputnorm: bigWig file of treatmment sample normalizes with its input control """ input: - chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"), - ctrl = lambda w : 
join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw") + bws = lambda w: ctrl_test(chip2input, w.name, bw_dir) output: - join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw") + join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") params: - rname="inputnorm", - threads: int(allocated("threads", "inputnorm", cluster)), - envmodules: config['tools']['DEEPTOOLSVER'], - shell: """ - bigwigCompare \\ - --binSize 25 \\ - --outFileName {output} \\ - --outFileFormat 'bigwig' \\ - --bigwig1 {input.chip} \\ - --bigwig2 {input.ctrl} \\ - --operation 'subtract' \\ - --skipNonCoveredRegions \\ - -p {threads} - """ + rname = "inputnorm", + bigwig_declare = lambda w, input: f"--bigwig1 {input.bws[0]}" if len(input.bws) == 1 \ + else f"--bigwig1 {input.bws[0]} --bigwig2 {input.bws[1]}" + threads: + int(allocated("threads", "inputnorm", cluster)), + envmodules: + config['tools']['DEEPTOOLSVER'], + shell: + """ + echo {input} + bigwigCompare \\ + --binSize 25 \\ + --outFileName {output} \\ + --outFileFormat 'bigwig' \\ + {params.bigwig_declare} \\ + --operation 'subtract' \\ + --skipNonCoveredRegions \\ + -p {threads} + """ diff --git a/workflow/scripts/DiffBind_v2_ChIPseq.Rmd b/workflow/scripts/DiffBind_v2_ChIPseq.Rmd deleted file mode 100644 index 031799d..0000000 --- a/workflow/scripts/DiffBind_v2_ChIPseq.Rmd +++ /dev/null @@ -1,260 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") - -csvfile <- params$csvfile -contrasts <- params$contrasts -peakcaller <- params$peakcaller -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -```{r setup, 
echo=FALSE, warning=FALSE,message=FALSE} -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples,consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. 
-```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") -consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) -consensus2$name <- paste0("Peak",1:length(consensus2)) -#rtracklayer::export(consensus2,outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are -not changing between the two conditions. 
EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. - -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. 
- -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") -outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.bed") -DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) -try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) -write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.txt") -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.bed") -DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) -try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) -write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F) -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - 
i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
diff --git a/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd b/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd deleted file mode 100644 index 2a508b5..0000000 --- a/workflow/scripts/DiffBind_v2_ChIPseq_block.Rmd +++ /dev/null @@ -1,267 +0,0 @@ ---- -title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" - dir: "/path/to/DiffBindBlock/directory" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") - -csvfile <- params$csvfile -contrasts <- params$contrasts -peakcaller <- params$peakcaller -``` - -**Groups being compared:** - *`r contrasts`* -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -```{r setup, echo=FALSE, warning=FALSE,message=FALSE} -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) -knitr::opts_knit$set(root.dir=params$dir) -suppressMessages(library(DT)) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -``` - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) -consensus <- dba.peakset(samples,consensus=DBA_CONDITION) -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping peaks. 
-If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) -} else { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) - try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. -```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250) -} else { - DBdataCounts <- dba.count(samples) -} -print(DBdataCounts) -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") -consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) -consensus2$name <- paste0("Peak",1:length(consensus2)) -#rtracklayer::export(consensus2,outfile2) -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples 
-```{r heatmap2} -try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) -``` - -# Set Up Contrast -Contrast is Group1 - Group2. -```{r contrast} -DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, - block=DBA_TREATMENT) -print(DBdatacontrast) -``` - -# Differential Analysis -This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most -projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are -not changing between the two conditions. EdgeR also assumes that there are equal numbers -of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR -is especially useful when this assumption is true or when there are large differences in -library size across samples. All concentrations are on log2 scale. 
- -```{r analyze} -DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) -DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) -``` - -```{r report} -DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) -DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK) -``` - -## PCA {.tabset .tabset-fade} -Variance of differential peaks only - -### DeSeq2 {-} -```{r PCA3, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r PCA4, fig.height=5,fig.width=5} -try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## MA plot {.tabset .tabset-fade} -"Log concentration" means average concentration across all samples. -Each dot is a consensus peak. - -### DeSeq2 {-} -```{r MA_D} -try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r MA_E} -try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## Volcano plot {.tabset .tabset-fade} -Each dot is a consensus peak. 
- -### DeSeq2 {-} -```{r Volcano1} -try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) -``` - -### EdgeR {-} -```{r Volcano2} -try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) -``` - -## Heatmap: Differential {.tabset .tabset-fade} -1000 most significant differential peaks (Deseq2 or EdgeR normalized) - -### DeSeq2 {-} -```{r heatmap4D} -try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2_BLOCK, - correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -### EdgeR {-} -```{r heatmap4E} -try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER_BLOCK, - correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) -``` - -## Top 500 differentially bound peaks {.tabset .tabset-fade} -### DeSeq2 {-} -```{r Deseq2Report} -outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.txt") -outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.bed") -DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) -try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) -write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) -D2i <- length(DBReportDeseq2) -if (D2i == 0) { - i=1 -} else if (D2i > 500) { - i=500 -} else { - i=D2i -} -try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) -``` - -### EdgeR {-} -```{r EdgeRReport} -outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.txt") -outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.bed") -DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) -try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) -write.table(DBReportEdgeR, outfile, quote=F, sep="\t", 
row.names=F) -Ei <- length(DBReportEdgeR) -if (Ei == 0) { - i=1 -} else if (Ei > 500) { - i=500 -} else { - i=Ei -} -try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) - -report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK, - th=100,bNormalized=T,bFlip=FALSE,precision=0) -outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_block_fullList.txt") -write.table(report2, outfile3, quote=F, sep="\t", row.names=F) -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
\ No newline at end of file diff --git a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd b/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd deleted file mode 100644 index d058cec..0000000 --- a/workflow/scripts/DiffBind_v2_cfChIP_QC.Rmd +++ /dev/null @@ -1,204 +0,0 @@ ---- -title: "DiffBind: cfChIP-seq QC" -output: - html_document: - toc: true - toc_float: - collapsed: false - number_sections: true - toc_depth: 3 - fig_width: 7 - fig_height: 6 -params: - csvfile: samplesheet.csv - contrasts: "group1_vs_group2" - peakcaller: "macs" ---- - - - -```{r, include=FALSE, warning=FALSE, message=FALSE} -## grab args -dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") - -csvfile <- params$csvfile -contrasts <- params$contrasts -peakcaller <- params$peakcaller -``` - -**Peak sources:** - *`r peakcaller`* -**Report generated:** - *`r dateandtime`* - -```{r setup, echo=FALSE, warning=FALSE,message=FALSE} -knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) -suppressMessages(library(DiffBind)) -suppressMessages(library(parallel)) -suppressMessages(library(dplyr)) -suppressMessages(library(tidyr)) -suppressMessages(library(umap)) -suppressMessages(library(ggplot2)) -suppressMessages(library(ggrepel)) -``` - -# Peak Data -Read in sample sheet information and peak information -```{r samples} -samples <- dba(sampleSheet=csvfile) - -# if samples have Condition values -if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { - consensus <- dba.peakset(samples,consensus=DBA_CONDITION, minOverlap = min(table(samples$samples$Condition))) -} -print(samples) -``` - -## Correlation heatmap: Only peaks -Pearson correlation of peak positions: all samples versus all samples -```{r heatmap1} -try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Only peaks -Variance of peak positions -```{r PCA1, fig.height=5,fig.width=5} -try(dba.plotPCA(samples),silent=TRUE) -``` - -## Overlapping peak counts -Number of overlapping 
peaks. -If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where -the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different -from the consensus peak set used for differential analyses. -```{r Venn, fig_height=4} -if (nrow(samples$samples) < 5) { - dba.plotVenn(samples,1:nrow(samples$samples)) -} else { - if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { - dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") - } else { - print("Consensus peaks were not called") - } -} -``` - -# Consensus peaks and counts -Consensus peaks are peaks found in at least two samples, independent of condition. -FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. -```{r peaksORsummits} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { - summits <- FALSE - print ("Broad peak calling tool.") - print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} -``` - -```{r DBcount} - -if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { - minOv <- min(table(samples$samples$Condition)) -} else { - minOv <- floor(ncol(samples$class)/3) -} - -print(paste0("The minimum number of overlaps is: ", minOv)) - -if (summits == TRUE) { - DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv) -} else { - DBdataCounts <- dba.count(samples, minOverlap = minOv) -} -print(DBdataCounts) - -``` - -## Correlation heatmap: Peaks and reads -Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples -```{r heatmap2} 
-try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) -``` - -## Heatmap: Average signal across each peak -1000 most variable consensus peaks (library-size normalized counts) -```{r heatmap3} -try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) -``` - -## PCA: Peaks and reads -Variation of library-size normalized counts of consensus peaks -```{r PCA2, fig.height=5,fig.width=5} -try(dba.plotPCA(DBdataCounts),silent=TRUE) -``` - -```{r TMM} -vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID) -consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized counts - as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% - dplyr::select(1:4, Peaks, samples$samples$SampleID) - -outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv") -write.csv(consensus2, outfile1, row.names = F) - -outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed") -write.table(consensus2[,c("seqnames","start","end","Peaks")], - outfile2, quote=F, sep="\t", row.names=F, col.names=F) - -counts_TMM_ALL <- consensus2 -rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks -counts_TMM_ALL$Peaks <- NULL - -counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>% - t() %>% log10() %>% as.data.frame(.) 
-##UMAP coordinates -set.seed(123) -if (nrow(samples$samples) < 16) { - umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1) -} else { - umap_coord <- umap(counts_TMM_ALL) -} -umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2")) - -outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv") -write.csv(umap_coord, outfile, row.names = F) -``` - -## UMAP: peaks and reads -```{r UMAP_plot} -p <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2, label = samples$samples$SampleID))+ ##With labels - geom_point(aes(color=samples$samples$Condition), size = 3) + - theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) + - theme(plot.title = element_text(hjust = 0.5)) + - labs(color = "Phenotypes") + theme(text=element_text(size=15))+ - geom_text_repel(point.size = NA, size = 2.5) -q <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2)) + ##No labels - geom_point(aes(color=samples$samples$Condition), size = 3) + - theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) + - theme(plot.title = element_text(hjust = 0.5)) + - labs(color ="Phenotypes") + theme(text=element_text(size=15)) - ##geom_text_repel(point.size = NA, size = 2.5) -p - -if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { -q -} -``` - -## R tool version information -```{r Info} -sessionInfo() -``` - -
diff --git a/workflow/scripts/FRiP_plot.R b/workflow/scripts/FRiP_plot.R deleted file mode 100644 index 8a81f49..0000000 --- a/workflow/scripts/FRiP_plot.R +++ /dev/null @@ -1,112 +0,0 @@ -## FRIP_plot.R -## Created by Tovah Markowitz -## June 19, 2020 -## Updated: Jan 19, 2022 -## Updated: Novemeber 4, 2022 - -args <- commandArgs(trailingOnly = TRUE) -folder <- args[1] - -library(ggplot2) -library(rjson) - -merge_files <- function(folder) { - files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt", - full.names=T) - allList <- lapply(files,read.table,header=T) - allData <- do.call(rbind.data.frame, allList) - write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F, - row.names=F, sep="\t") - return(allData) -} - -plot_barplots <- function(inData, groupName, folder) { - p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample)) - p <- p + geom_bar(position="dodge",stat = "identity") + - facet_wrap(.~bedtool) + - theme_bw() + - theme(axis.text.x=element_text(angle = -15, hjust = 0)) + - labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)", - fill ="peak file") - pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf")) - print(p) - dev.off() -} - -plot_scatterplots <- function(inData, groupName, folder) { - p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool)) - p <- p + geom_point(size=2.5) + - facet_wrap(.~bamsample) + - theme_bw() + - scale_x_continuous(trans = "log10") + - labs(title=groupName, x="Number of Bases in Peaks (M)", - y="Fraction of Reads in Peaks (FRiP)", - shape="peak file", color="peak calling tool") - q <- p + annotation_logticks(sides="b") - pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf")) - tryCatch(print(q), error = function(e) {print(p)}) - dev.off() -} - -plot_barplots_self <- function(inData2, folder) { - p <- ggplot(inData2,aes(x=bamsample, y=FRiP, fill=groupInfo)) - p <- p + geom_bar(position="dodge",stat = "identity") + - 
facet_wrap(.~bedtool) + - theme_bw() + - theme(axis.text.x=element_text(angle = -15, hjust = 0)) + - labs(title="All Samples",x="bam file", y ="Fraction of Reads in Peaks (FRiP)", - fill ="Group") - pdf(paste0(folder, "/PeakQC/FRiP_barplot.pdf")) - print(p) - dev.off() -} - -plot_scatterplots_self <- function(inData2, folder) { - p <- ggplot(inData2,aes(x=n_basesM, y=FRiP, shape=bedtool, color=groupInfo)) - p <- p + geom_point(size=2.5) + - theme_bw() + - scale_x_continuous(trans = "log10") + - annotation_logticks(sides="b") + - labs(title="All samples", x="Number of Bases in Peaks (M)", - y="Fraction of Reads in Peaks (FRiP)", - shape="peak file", color="peak calling tool") - pdf(paste0(folder, "/PeakQC/FRiP_scatterplot.pdf")) - print(p) - dev.off() -} - -process_json <- function(injson) { -# to get the identities of the groups and the list of samples (ChIP and input) -# associated with it - json <- fromJSON(file = injson) - groupsInfo <- json$project$groups - inputs <- as.data.frame(json$project$peaks$inputs) - for (i in 1:length(groupsInfo)) { - tmp <- unique(unlist(inputs[names(inputs) %in% groupsInfo[[i]]])) - if (length(tmp) > 1) { - groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp)) - } else if (tmp != "" ) { - groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp)) - } - } - return(groupsInfo) -} - -allData <- merge_files(folder) -groupList <- process_json(paste0(folder,"/config.json")) - -for (i in 1:length(groupList)) { - group <- groupList[[i]] - groupName <- names(groupList)[i] - inData <- allData[which((allData$bedsample %in% group) & - (allData$bamsample %in% group)),] - plot_barplots(inData, groupName, folder) - plot_scatterplots(inData, groupName, folder) -} - -selfData <- allData[which(allData$bedsample == allData$bamsample),] -groupInfo <- reshape2::melt(groupList) -names(groupInfo) <- c("bamsample","groupInfo") -selfData2 <- merge(selfData,groupInfo) -plot_barplots_self(selfData2, folder) -plot_scatterplots_self(selfData2, folder) diff --git 
a/workflow/scripts/atac_nrf.py b/workflow/scripts/atac_nrf.py deleted file mode 100644 index edf21aa..0000000 --- a/workflow/scripts/atac_nrf.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import print_function -import sys - -preseq_log=sys.argv[1] - -with open(preseq_log, 'r') as fp: - for line in fp: - if line.startswith('TOTAL READS'): - tot_reads = float(line.strip().split("= ")[1]) - elif line.startswith('DISTINCT READS'): - distinct_reads = float(line.strip().split('= ')[1]) - elif line.startswith('1\t'): - one_pair = float(line.strip().split()[1]) - elif line.startswith('2\t'): - two_pair = float(line.strip().split()[1]) - -NRF = distinct_reads/tot_reads -PBC1 = one_pair/distinct_reads -PBC2 = one_pair/two_pair - -print("%.3f\t%.3f\t%.3f"%(NRF,PBC1,PBC2)) - diff --git a/workflow/scripts/bam_filter_by_mapq.py b/workflow/scripts/bam_filter_by_mapq.py deleted file mode 100644 index 12037cd..0000000 --- a/workflow/scripts/bam_filter_by_mapq.py +++ /dev/null @@ -1,40 +0,0 @@ -import pysam,sys -import argparse - -parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values') -parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File') -parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File') -parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... 
default 6',default=6) -args = parser.parse_args() - -samfile = pysam.AlignmentFile(args.inBam, "rb") -mapq=dict() -for read in samfile.fetch(): - if read.is_unmapped: - continue - if read.is_supplementary: - continue - if read.is_secondary: - continue - if read.is_duplicate: - continue - if read.is_proper_pair: - if read.mapping_quality < args.mapQ and read.query_name in mapq: - del mapq[read.query_name] - if read.mapping_quality >= args.mapQ and not read.query_name in mapq: - mapq[read.query_name]=1 -samfile.close() - -samfile = pysam.AlignmentFile(args.inBam, "rb") -pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile) -for read in samfile.fetch(): - if read.query_name in mapq: - if read.is_supplementary: - continue - if read.is_secondary: - continue - if read.is_duplicate: - continue - pairedreads.write(read) -samfile.close() -pairedreads.close() diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py new file mode 100644 index 0000000..9f3febb --- /dev/null +++ b/workflow/scripts/blocking.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +from os.path import join + + +# ~~~ Common helper functions for blocking or controls + + +def test_for_block(groupdata, contrast, blocks): + """ only want to run blocking on contrasts where all + individuals are on both sides of the contrast """ + contrastBlock = [ ] + for con in contrast: + group1 = con[0] + group2 = con[1] + block1 = [ blocks[sample] for sample in groupdata[group1] ] + block2 = [ blocks[sample] for sample in groupdata[group2] ] + if len(block1) == len(block2): + if len(set(block1).intersection(block2)) == len(block1): + contrastBlock.append(con) + return contrastBlock + + +def ctrl_test(ctrl_dict, input_name, in_dir): + sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") + if input_name in ctrl_dict: + norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") + return [sample, norm] + return [sample] \ No newline at end of file diff --git a/workflow/scripts/cfChIP_signatures.R 
b/workflow/scripts/cfChIP_signatures.R deleted file mode 100755 index 778b75a..0000000 --- a/workflow/scripts/cfChIP_signatures.R +++ /dev/null @@ -1,97 +0,0 @@ -#################### -# -# Name: cfChIP_signatures.R -# Created by: Tovah Markowitz, PhD -# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) -# Research Technologies Branch/DIR/NIAID -# -# Created: August 9, 2022 -# -#################### -# -# Purpose: To take the individual cfChIP signature tables, combine them, -# and create the preferred output plot -# -# Functions: mergeSignatures and plotSignatures -# -# Requires: ggplot2 and ggprism (for plotting) -# -# Details: mergeSignatures will take a folder of signatures and combine them -# into one long table. plotSignatures can directly take the the output -# of mergeSignatures, but you can also load the data into R and filter -# to only include a subset of samples before running the function. Also, -# add a column called "Condition" either to the input txt file or the R -# object to group columns in the plot using that additional information. -# -# Function1: mergeSignatures(folder, outFile) -# folder: [required] the path to the folder containing the individual signature -# files direct from the cfChIP tool -# outFile: [required] the name of the output txt file to save the data -# -# Function2: plotSignatures(inTXT, outPDF) -# inTXT: [required] either the name of the file from mergeSignatures or an -# an R object containing the data. 
Column names must match that of the -# output of mergeSignatures, but column order doesn't matter -# inPDF: [required] the name of the output pdf file to create -# -# Example usage: -# source("cfChIP_signatures.R") -# mergeSignatures("cfChIPtool/Output/H3K4me3/Signatures/", out.txt) -# plotSignatures(out.txt, out.pdf) -# plotSignatures(signatureDataFrame, out.pdf) -# -#################### - -mergeSignatures <- function(folder, outFile) { - files <- list.files(folder,full.names = T) - sigList <- lapply(files, read.csv) - samples <- gsub(".csv","",grep("csv",unlist(strsplit(files,"/")),value=T)) - for (i in 1:length(samples)) { - sigList[[i]] <- data.frame(sigList[[i]],Sample=samples[i]) - } - sigData <- do.call("rbind",sigList) - write.table(sigData, outFile, quote=F, sep="\t", row.names=F) -} - -plotSignatures <- function(inTXT, outPDF) { - library(ggplot2) - library(ggprism) - - if (mode(inTXT) == "character") { # if using a file name - sigData <- read.delim(inTXT) - } else { # if starting with an object in R - sigData <- as.data.frame(inTXT) - } - sigData$NormalizedCounts[which(sigData$NormalizedCounts > 3)] <- 3 - sigData$NormalizedCounts[which(sigData$NormalizedCounts < 0.15)] <- 0.15 - sigData$qValue[which(sigData$qValue > 300)] <- 300 - sigData$qValue[which(sigData$qValue < 5)] <- NA - names(sigData)[1] <- "cellType" - - cellTypes <- data.frame(cellType=c("Neutrophils","Monocytes","Megakaryocyte", - "Erythroblast","T-Cells","B-Cells","NK", - "Vasculary","Adipose","Skin","Sk. 
Muscle", - "Brain","Heart","Lung","Breast","Digestive", - "Pancreas"), - class=c(rep("Blood",7),rep("Global",4),rep("Other",6)) ) - - sigData2 <- merge(sigData,cellTypes) - sigData2$cellType <- factor(sigData2$cellType,levels=rev(cellTypes$cellType)) - - pdf(outPDF) - p <- ggplot(data=sigData2,aes(x=Sample,y=cellType,color=NormalizedCounts,size=qValue)) - p <- p + geom_point() + - scale_size(limits=c(5,300),breaks=c(5,50,100,150,200,250,300), - labels=paste0("e-",c(5,50,100,150,200,250,300))) + - scale_color_viridis_c(direction = -1, option="A") + - theme_prism() + - theme(axis.title.y = element_blank(), axis.title.x = element_blank(), - axis.text.x = element_text(angle = 45,vjust=0.9,hjust=1)) - if (sum(names(sigData) == "Condition") == 1) { - p <- p + facet_grid(rows=vars(class),cols=vars(Condition),scales="free",space="free") - } else { - p <- p + facet_grid(rows=vars(class),scales="free",space="free") - } - print(p) - dev.off() -} \ No newline at end of file diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 0e51470..6feba1c 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -1,4 +1,11 @@ -# Common helper functions shared across the entire workflow + +#!/usr/bin/env python3 +# ~~~ Common helper functions shared across the entire workflow +import os +from os.path import join +from snakemake.io import expand + + def provided(samplelist, condition): """ Determines if optional rules should run. 
If an empty list is provided to rule all, @@ -201,4 +208,54 @@ def joint_option(prefix, valueslist): for v in valueslist: s += "{} {} ".format(prefix, v) s = s.rstrip() - return s \ No newline at end of file + return s + + +def mk_dir_if_not_exist(dirs): + if isinstance(dirs, str): + dirs = [dirs] + assert isinstance(dirs, list), 'Supplied directories should be in a list' + for _dir in dirs: + if not os.path.exists(_dir): + os.mkdir(_dir, mode=0o775) + return True + + +def get_file_components(pair_ended): + alnexts = [] + if pair_ended: + alnexts.extend(['sorted.bam', 'Q5DD.bam']) + else: + alnexts.extend(['sorted.bam', 'Q5DD_tagAlign.gz']) + stems = list(map(lambda x: x.split('.')[0], alnexts)) + rpgc_exts = list(map(lambda x: x.split('.')[0] + '.RPGC', alnexts)) + return stems, rpgc_exts, alnexts + + +def get_bam_ext(ext, pair_ended): + if pair_ended: + if ext.lower() == 'sorted': + return "bam" + elif ext == 'Q5DD': + return "bam" + else: + if ext.lower() == 'sorted': + return "bam" + elif ext == "Q5DD_tagAlign": + return "gz" + raise ValueError(f'Unknown file component. Pair ended: {str(pair_ended)}. 
Ext: {str(ext)}') + + +def get_fqscreen_outputs(paired_end, samples, qc_dir): + outs = [] + if paired_end: + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.txt"), name=samples, rn=[1, 2])), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R{rn}.trim_screen.png"), name=samples, rn=[1, 2])), + else: + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.txt"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)), + outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)), + return outs \ No newline at end of file diff --git a/workflow/scripts/frip.py b/workflow/scripts/frip.py deleted file mode 100644 index 113eb62..0000000 --- a/workflow/scripts/frip.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 - -""" -Name: frip.py -Created by: Tovah Markowitz -Date: 06/18/20 - -Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs -Currently only works with python/3.5 -""" - -########################################## -# Modules -import argparse -from argparse import RawTextHelpFormatter -from pybedtools import BedTool -import pysam -import pandas as pd - -########################################## -# Functions - -def split_infiles(infiles): - """ - breaks the infile string with space-delimited file names and - creates a list - """ - infileList = infiles.strip("\'").strip('\"').split(" ") - if len(infileList) == 1: - infileList = infileList[0].split(";") - return(infileList) - -def count_reads_in_bed(bam, bedfile, genomefile): - """ - some of this comes directly from the pybedtools site; read in - bed (or 
bed-like) file, sort it, and then count the number of - reads within the regions - """ - bedinfo = BedTool(bedfile) - bedinfo.sort(g=genomefile) - return ( - BedTool(bam).intersect( bedinfo, bed=True, stream=True, ) - ).count() - -def count_reads_in_bam(bam): - """ count the number of reads in a given bam file """ - return( pysam.AlignmentFile(bam).mapped ) - -def calculate_frip(nreads, noverlaps): - """ calculate FRiP score from nreads and noverlaps """ - return( float(noverlaps) / nreads ) - -def measure_bedfile_coverage(bedfile, genomefile): - """ calculate the number of bases covered by a given bed file """ - bedinfo = BedTool(bedfile) - return( bedinfo.sort(g=genomefile).total_coverage() ) - -def clip_bamfile_name(bamfile): - """ - clip bam file name for table/plotting purposes; assumes file - naming system matches that of Pipeliner - """ - sample = bamfile.split("/")[-1].split(".")[0] - condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1]) - return( sample, condition ) - -def clip_bedfile_name(bedfile,filetype): - """ - clip bed file name for table/plotting purposes; assumes file - naming system matches that of Pipeliner - """ - if filetype == "": - toolused = bedfile.split("/")[-3] - sample = bedfile.split("/")[-2] - else: - toolused = filetype - sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") - return( toolused, sample ) - -def process_files(bamfile, bedfiles, genome, filetypes): - """ - this is the main function to take in list of input files and - put out an array containing key file name information, read - counts, and FRiP scores - """ - bedfileL = bedfiles - filetypesL = filetypes - out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", - "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]] - nreads = count_reads_in_bam(bamfile) - (bamsample, condition) = clip_bamfile_name(bamfile) - for i in range(len(bedfileL)): - bed = bedfileL[i] - if len(filetypesL) > 1: - filetype = filetypesL[i] - else: - filetype 
= filetypesL[0] - (bedtool, bedsample) = clip_bedfile_name(bed,filetype) - noverlaps = count_reads_in_bed(bamfile, bed, genome) - frip = calculate_frip(nreads, noverlaps) - nbases = measure_bedfile_coverage(bed, genome) / 1000000 - out.append( [bedtool, bedsample, bamsample, condition, - nreads, noverlaps, frip, nbases] ) - out2 = pd.DataFrame(out[1:], columns=out[0]) - return(out2) - -def create_outfile_name(bamfile, outroot): - """ uses outroot to create the output file name """ - (bamsample, condition) = clip_bamfile_name(bamfile) - outtable = bamsample + "." + condition + "." + "FRiP_table.txt" - if outroot != "": - outtable = outroot + "." + outtable - return(outtable) - -def write_table(out2, outtable): - out2.to_csv(outtable,sep='\t',index=False) - - -############################################### -# Main - -def main(): - desc=""" -This function takes a space-delimited or semi-colon delimited list -of bed-like files (extensions must be recognizable by bedtools) -and a single bam file. It will then calculate the FRiP score for -all possible combinations of files and save the information in a -txt file. It will also calculate the number of bases covered by -each bed-like file. Note: this function assumes that the file -naming system of the input files matches that of Pipeliner. - """ - - parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter) - parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \ -(or bed-like files).') - parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.') - parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \ -size of every chromosome.') - parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. 
Default:""') - parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \ -or semicolon-delimited list of input file sources/types. Only needed when \ -source of bed file is not built into the script. Default: ""') - - args = parser.parse_args() - bedfiles = args.p - bamfile = args.b - genomefile = args.g - outroot = args.o - filetypes = args.t - - out2 = process_files(bamfile, bedfiles, genomefile, filetypes) - outtable = create_outfile_name(bamfile, outroot) - write_table(out2, outtable) - -if __name__ == '__main__': - main() - -############################################### -# example cases - -#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak" -#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam" -#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes" -#out2 = pd.read_csv("FRIP_test.txt",sep="\t") diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index 6e39613..b8214cb 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +# ~~~ Common helper functions for grouping of outputs +from os.path import join + # common functions related to sample grouping or group meta-information def group_samples_by_reps(groupdata, samples, chip2input): groupdatawinput = {} @@ -47,8 +50,11 @@ def group_output_files(extensions, groupslist, inputnorm): return dtoolgroups, dtoolext + def zip_contrasts(contrast, PeakTools): - """making output file names for differential binding analyses""" + """ + making output file names for differential binding analyses + """ zipGroup1, zipGroup2, zipTool, contrasts = [], [], [], [] for g1, g2 in contrast: for PeakTool in PeakTools: @@ -56,4 +62,48 @@ def zip_contrasts(contrast, PeakTools): zipGroup2.append(g2) zipTool.append(PeakTool) contrasts.append( g1 + "_vs_" + 
g2 + "-" + PeakTool ) - return(zipGroup1, zipGroup2, zipTool, contrasts) \ No newline at end of file + return(zipGroup1, zipGroup2, zipTool, contrasts) + + +def get_peaktools(assay_type): + tools = ["macsNarrow"] + if assay_type == "atac": + tools.append("Genrich") + elif assay_type == "chip": + tools.extend(["macsBroad", "sicer"]) + return tools + + +def dedup_out7(input, assay, paired_end): + dd = [] + if assay == "cfchip": + dd.append(input + ".Q5DD_tagAlign") + elif paired_end == False and assay == "chip": + dd.append(input + ".Q5DD_tagAlign.gz") + return dd + + +def get_ppqt_input(ppqt_dir, wildcards, paired_end): + ppqt = [] + if paired_end: + ppqt.append(join(ppqt_dir, "{0}.{1}.ppqt.txt".format(wildcards.name, wildcards.ext))) + else: + if wildcards.ext == "Q5DD": + ppqt.append(join(ppqt_dir, "{0}.Q5DD_tagAlign.ppqt.txt".format(wildcards.name))) + elif wildcards.ext == "sorted": + ppqt.append(join(ppqt_dir, "{0}.sorted.ppqt.txt".format(wildcards.name))) + else: + raise ValueError(f'Unknown alignment file extension, name: {wildcards.name}, ext: {wildcards.ext}.') + return ppqt + + +def get_bam_input(bam_dir, wildcards, paired_end): + bams = [] + if paired_end: + bams.append(join(bam_dir, "{0}.{1}.bam".format(wildcards.name, wildcards.ext))) + else: + if wildcards.ext == "Q5DD": + bams.append(join(bam_dir, "{0}.Q5DD.bam".format(wildcards.name))) + elif wildcards.ext == "sorted": + bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name))) + return bams \ No newline at end of file diff --git a/workflow/scripts/jaccard_score.py b/workflow/scripts/jaccard_score.py deleted file mode 100644 index 378ee69..0000000 --- a/workflow/scripts/jaccard_score.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python3 - -""" -Name: jaccard_score.py -Created by: Tovah Markowitz -Date: 1/23/19 -Updated: 8/5/19 to compare multiple tools and create plots - -Purpose: To do all pairwise comparisons of bed/peak files given. 
Uses bedtools -to calculate a jaccard score for every comparison. All data is saved in a -single tab-delimited file. -""" - -########################################## -# Modules -import optparse -from pybedtools import BedTool -import pandas as pd -from sklearn.decomposition import PCA as sklearnPCA -import matplotlib as mpl -mpl.use('Agg') -import matplotlib.pyplot as plt -import seaborn as sns - -########################################## -# Functions - -def split_infiles(infiles): - """ breaks the infile string with space-delimited file names and creates a list. - also works for infile types - """ - infileList = infiles.strip("\'").strip('\"').split(" ") - if len(infileList) == 1: - infileList = infileList[0].split(";") - return(infileList) - -def loop_jaccard(infileList, genomefile, filetypeList): - """ Uses two loops to do all possible pairwise comparisons of files - in a list. Returns a writeable output and a pandas object - """ - nfiles = len(infileList) - (colnames, snames) = get_colnames(infileList, filetypeList) - out = [[1.000] * nfiles for i in range(nfiles)] - outTable = [] - for z in range(nfiles): - fileA = infileList[z] - print("fileA is: " + fileA) - for y in range(z+1,nfiles): - fileB = infileList[y] - (data, keylist) = run_jaccard(fileA, fileB, genomefile) - out[z][y] = data[3] - out[y][z] = data[3] - if filetypeList != [""]: - keylist.insert(1, "toolA") - keylist.insert(3, "toolB") - data.insert(1, filetypeList[z]) - data.insert(3, filetypeList[y]) - if len(outTable) == 0: - outTable.append( "\t".join(keylist) ) - outTable.append( "\t".join(data) ) - out2 = pd.DataFrame(out, columns=colnames, index=colnames,dtype="float") - return(outTable, out2, snames) - -def run_jaccard(fileA, fileB, genomefile): - """ Running bedtools. Reads in two bedtools approved file types, sorts the files, - and calculates a jaccard score. 
- """ - a = BedTool(fileA) - a = a.sort(g=genomefile) - b = BedTool(fileB) - b = b.sort(g=genomefile) - j = a.jaccard(b,g=genomefile) - j["fileA"] = fileA.split("/")[-1] - j["fileB"] = fileB.split("/")[-1] - keylist = list(j.keys()) - keylist.sort() - data = [ str(j[key]) for key in keylist ] - return(data, keylist) - -def get_colnames(infileList, filetypeList): - snames = [ i.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") for i in infileList ] - if filetypeList == [""]: - colnames = snames - else: - colnames = [ snames[i] + "_" + filetypeList[i] for i in range(len(snames)) ] - return(colnames, snames) - -def create_outfile_names(outroot): - """ uses outroot to create the output file names """ - outTableFile = "jaccard.txt" - outPCAFile = "jaccard_PCA.pdf" - outHeatmapFile = "jaccard_heatmap.pdf" - if outroot != "": - if outroot[-1] == "/": - outTableFile= outroot + outTableFile - outPCAFile = outroot + outPCAFile - outHeatmapFile = outroot + outHeatmapFile - else: - outTableFile= outroot + "_" + outTableFile - outPCAFile = outroot + "." + outPCAFile - outHeatmapFile = outroot + "." 
+ outHeatmapFile - return(outTableFile, outPCAFile, outHeatmapFile) - -def pca_plot(out, filetypeList, snames, outPCAFile): - """ creates a 2D PCA plot comparing the files based upon jaccard scores - """ - sklearn_pca = sklearnPCA(n_components=2) - Y_sklearn = sklearn_pca.fit_transform(out) - PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"]) - PCAdata.insert(0,"sample name",snames) - fig, ax =plt.subplots() - snames_pal = sns.hls_palette(len(set(snames)),s=.8) - sns.set_palette(snames_pal) - if filetypeList != [""]: - PCAdata.insert(1,"tool",filetypeList) - ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100) - else: - ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",data=PCAdata,s=100) - ax.axhline(y=0, color='grey', linewidth=1,linestyle="--") - ax.axvline(x=0, color='grey', linewidth=1,linestyle="--") - ax.set(xlabel= "PC1 (" + str(round(100*sklearn_pca.explained_variance_[0],2)) + "%)", - ylabel= "PC2 (" + str(round(100*sklearn_pca.explained_variance_[1],2)) + "%)") - plt.legend(bbox_to_anchor=(1.05, 1), loc=2) - #plt.show() - plt.savefig(outPCAFile, bbox_inches='tight') - plt.close("all") - -def plot_heatmap(out, outHeatmapFile, snames, filetypeList): - snames_pal = sns.hls_palette(len(set(snames)),s=.8) - snames_lut = dict(zip(set(snames), snames_pal)) - snames_cols = pd.Series(snames,index=out.index).map(snames_lut) - if filetypeList != [""]: - tool_pal = sns.cubehelix_palette(len(set(filetypeList))) - tool_lut = dict(zip(set(filetypeList), tool_pal)) - tool_cols = pd.Series(filetypeList,index=out.index).map(tool_lut) - g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False, - row_colors=[snames_cols,tool_cols]) - for label in set(snames): - g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label], - label=label, linewidth=0) - for label in set(filetypeList): - g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label], - label=label, linewidth=0) - g.ax_col_dendrogram.legend(loc="center", ncol=3, - bbox_to_anchor=(0.4, 
0.8)) - else: - g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False, - row_colors=snames_cols) - for label in set(snames): - g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label], - label=label, linewidth=0) - g.ax_col_dendrogram.legend(loc="center", ncol=3, - bbox_to_anchor=(0.5, 0.8)) - #plt.show() - plt.savefig(outHeatmapFile, bbox_inches='tight') - plt.close("all") - -def write_out(out, outFile): - f = open(outFile, 'w') - f.write( "\n".join(out) ) - f.close() - -########################################## -# Main - -def main(): - desc=""" - This function takes a space-delimited list of files (bed, bedgraph, gff, gtf, etc.) - and calculates all possible pairwise jaccard scores. From bedtools: 'Jaccard is the - length of the intersection over the union. Values range from 0 (no intersection) to - 1 (self intersection)'. The columns of the output file are: fileA, fileB, - intersection, jaccard, n_intersections, and union-intersection. - """ - - parser = optparse.OptionParser(description=desc) - - parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \ -input files for analysis.') - parser.add_option('-t', dest='filetypes', default='', help='A space- or semicolon-delimited list \ -of input file sources/types.') - parser.add_option('-o', dest='outroot', default='', help='The root name of the output files \ -where all the jaccard score information will be saved.') - parser.add_option('-g', dest='genomefile', default='', help='The name of the .genome file.') - - (options,args) = parser.parse_args() - infiles = options.infiles - filetypes = options.filetypes - outroot = options.outroot - genomefile = options.genomefile - - infileList = split_infiles(infiles) - filetypeList = split_infiles(filetypes) - (outTable, out, snames) = loop_jaccard(infileList, genomefile, filetypeList) - (outTableFile, outPCAFile, outHeatmapFile) = create_outfile_names(outroot) - write_out(outTable, outTableFile) - pca_plot(out, filetypeList, 
snames, outPCAFile) - plot_heatmap(out, outHeatmapFile, snames, filetypeList) - -if __name__ == '__main__': - main() - - diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py new file mode 100644 index 0000000..e3b07e1 --- /dev/null +++ b/workflow/scripts/peakcall.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +from os.path import join + +def get_input_bam(input_sample, bam_dir): + """ + Returns a ChIP samples input BAM file, + see chip2input for ChIP, Input pairs. + """ + if input_sample: + # Runs in a ChIP, input mode + return join(bam_dir, "{0}.Q5DD.bam".format(input_sample)) + # Runs in ChIP-only mode + return [] + + +def get_control_input(ext, paired_end, bam_dir): + i = [] + if paired_end and ext != "": + i = [join(bam_dir, "{0}.Q5DD.bam".format(ext))] + elif not paired_end and ext != "": + i = [join(bam_dir, "{0}.Q5DD_tagAlign.gz".format(ext))] + return i + + +def outputIDR(groupswreps, groupdata, chip2input, tools): + """ + Produces the correct output files for IDR. All supposed replicates + should be directly compared when possible using IDR. IDR malfunctions + with bed files and GEM so it will not run with either of those. + Because there is no q-value calculated for SICER when there is no + input file, those samples are also ignored. 
+ """ + IDRgroup, IDRsample1, IDRsample2, IDRpeaktool = [], [], [], [] + for group in groupswreps: + nsamples = len(groupdata[group]) + for i in range(nsamples): + ctrlTF = chip2input[groupdata[group][i]] != "" + for j in range(i+1,nsamples): + if ctrlTF == (chip2input[groupdata[group][j]] != ""): + if ctrlTF == False: + tooltmp = [ tool for tool in tools if tool != "sicer" ] + else: + tooltmp = tools + IDRgroup.extend([group] * len(tooltmp)) + IDRsample1.extend([groupdata[group][i]] * len(tooltmp)) + IDRsample2.extend([groupdata[group][j]] * len(tooltmp)) + IDRpeaktool.extend(tooltmp) + return( IDRgroup, IDRsample1, IDRsample2, IDRpeaktool ) + + +def zip_peak_files(chips, PeakTools, PeakExtensions): + """Making input file names for FRiP""" + zipSample, zipTool, zipExt = [], [], [] + for chip in chips: + for PeakTool in PeakTools: + zipSample.append(chip) + zipTool.append(PeakTool) + zipExt.append(PeakExtensions[PeakTool]) + return(zipSample, zipTool, zipExt) + + +def calc_effective_genome_fraction(effectivesize, genomefile): + """ + calculate the effective genome fraction by calculating the + actual genome size from a .genome-like file and then dividing + the effective genome size by that number + """ + lines=list(map(lambda x:x.strip().split("\t"),open(genomefile).readlines())) + genomelen=0 + for chrom,l in lines: + if not "_" in chrom and chrom!="chrX" and chrom!="chrM" and chrom!="chrY": + genomelen+=int(l) + return(str(float(effectivesize)/ genomelen)) + + +def getMacChip(bam_dir, name, paired_end): + if paired_end: + chip = join(bam_dir, name + ".Q5DD.bam") + else: + chip = join(bam_dir, name + ".Q5DD_tagAlign.gz") + return chip + + +def getMacTXT(ppqt_dir, name, paired_end): + if paired_end: + txt = join(ppqt_dir, name + ".Q5DD.ppqt.txt") + else: + txt = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") + return txt + + +def getSicerChips(bam_dir, name, paired_end): + if paired_end: + chip = join(bam_dir, name + ".Q5DD.bam") + else: + chip = join(bam_dir, 
name + ".Q5DD_tagAlign.gz") + return chip + + +def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end): + if paired_end: + fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt") + else: + fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") + return fragLen \ No newline at end of file diff --git a/workflow/scripts/ppqt_process.py b/workflow/scripts/ppqt_process.py deleted file mode 100644 index 9a2c9b1..0000000 --- a/workflow/scripts/ppqt_process.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python3 - -#Purpose: To grab the estimated fragment length from the ppqt output and a small txt with that information. For input files, adding an extra value of 200bp as an alternative. -import argparse -parser = argparse.ArgumentParser(description='Script to extract the the estimated fragment length from the ppqt output.') -parser.add_argument('-i', required=True,help='Name of the ppqt txt file') -parser.add_argument('-o', required=True,help='Name of the output file') -args = parser.parse_args() - -output = args.o -inppqt = args.i - -o=open(output,'w') - -file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines())) - - -ppqt_values = file[0][2].split(",") -extenders = [] -for ppqt_value in ppqt_values: - if int(ppqt_value) > 150: - extenders.append(ppqt_value) -if len(extenders) > 0: - o.write(extenders[0]) -else: - o.write("200") -o.close() \ No newline at end of file diff --git a/workflow/scripts/prep_diffbind.py b/workflow/scripts/prep_diffbind.py deleted file mode 100644 index 4ba7d64..0000000 --- a/workflow/scripts/prep_diffbind.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import json -import argparse - -parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') -parser.add_argument('--g1',dest='group1',required=True,help='Name of the first group') -parser.add_argument('--g2',dest='group2',required=True,help='Name of the second group') 
-parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') -parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') -parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') -parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') -parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') -parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') - -args = parser.parse_args() - -with open("config.json","r") as read_file: - config=json.load(read_file) - -chip2input = config['project']['peaks']['inputs'] -groupdata = config['project']['groups'] -blocks = config['project']['blocks'] - -if None in list(blocks.values()): - samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] -else: - samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] - - -for condition in args.group1, args.group2: - for chip in groupdata[condition]: - replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" - controlID = chip2input[chip] - if controlID != "": - bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" - else: - bamControl = "" - peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension - if None in list(blocks.values()): - samplesheet.append(",".join([chip, condition, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - else: - block = blocks[chip] - samplesheet.append(",".join([chip, 
condition, block, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - - -f = open(args.csvfile, 'w') -f.write ("\n".join(samplesheet)) -f.close() diff --git a/workflow/scripts/prep_diffbindQC.py b/workflow/scripts/prep_diffbindQC.py deleted file mode 100644 index 550b5f9..0000000 --- a/workflow/scripts/prep_diffbindQC.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 - -import json -import argparse - -parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') -parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') -parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') -parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') -parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') -parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') -parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') - -args = parser.parse_args() - -with open("config.json","r") as read_file: - config=json.load(read_file) - -chip2input = config['project']['peaks']['inputs'] -groupdata = config['project']['groups'] - -tmpIDs = [x for xs in groupdata.values() for x in xs] -Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)] - -samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] - -count = 1 -for chip in chip2input.keys(): - if set(Ncounts) == {1}: # if all samples only in one group - for key in groupdata.keys(): - if chip in groupdata[key]: - condition = key - replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - else: - condition = "" - replicate = 
str(count) - count = count +1 - bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" - controlID = chip2input[chip] - if controlID != "": - bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" - else: - bamControl = "" - peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension - samplesheet.append(",".join([chip, condition, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - -f = open(args.csvfile, 'w') -f.write ("\n".join(samplesheet)) -f.close() diff --git a/workflow/scripts/promoterAnnotation_by_Gene.R b/workflow/scripts/promoterAnnotation_by_Gene.R deleted file mode 100755 index 846cfc0..0000000 --- a/workflow/scripts/promoterAnnotation_by_Gene.R +++ /dev/null @@ -1,179 +0,0 @@ -#################### -# -# Name: promoterAnnotationByGene.R -# Created by: Tovah Markowitz, PhD -# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) -# Research Technologies Branch/DIR/NIAID -# -# Created: August 8, 2022 -# Updated: October 26, 2022 to work with uropa 4.0.2 -# Updated: November 3, 2022 to fit with pipeline -# -#################### -# -# Purpose: To take UROPA allhits output files using "TSSprot" conditions and -# create a table of which genes have annotations overlapping their -# promoters and how many times. Output format: dataframe -# -# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the -# TSS. Allhits files were chosen to capture information from "peaks" -# overlappingmultiple promoters. Finalhits files can also be processed -# with this pipeline. This script can handle multiple allhits files as -# long as there are equal numbers of sampleNames to go with them. Also, -# giving a matching DiffBind txt file will allow the allhits file to be -# filtered to only include the significant differential peaks or to -# split the data by the direction of log fold-change. 
-# -# Requires: GenomicRanges, tidyr -# -# Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) -# -# Variables: -# allhitsFiles: [required] a vector of allhits files to process -# sampleNames: [required] a vector of short names for each allhits file -# to use as column headers -# diffbindFiles: [optional] a vector of diffbind files to use to filter each -# allhits file -# direction: [optional] when filtering using diffbindFiles, define how to -# filter using log fold change. "Both" is default -# when not defined by user. -# Options: "both", "pos", "neg", "separate" -# -# Example usage: -# source("promoterAnnotation_by_Gene.R") -# out1 <- promoterAnnotationByGene(allhitsA.txt, "A") -# out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both") -# out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), -# sampleNames=c("A","B"), -# diffbindFiles=c(diffbindA.txt,diffbindB.txt), -# direction="pos") -# out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), -# sampleNames=c("Deseq2","EdgeR"), -# diffbindFiles=c(Deseq2.txt,EdgeR.txt), -# direction="separate") -# -#################### - - -allhits2promoter <- function(allhitsFile) { - # cleaning up the allhits file to only keep information about peaks - # overlapping promoters - inData <- read.delim(allhitsFile) - tmp <- which(inData$name == "query_1") - if (length(tmp) == 0) { - print (paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters.")) - } else { - promoterData <- inData[tmp,] - promoterData <- promoterData[,c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")] - return(promoterData) - } -} - -filterPromoter <- function(Diffbind, promoterData, sampleName) { - # used by DiffbindFilterPromoter - promoterData2 <- GenomicRanges::makeGRangesFromDataFrame(promoterData, seqnames.field="peak_chr", - start.field="peak_start", end.field="peak_end", - starts.in.df.are.0based=F) - Diffbind2 <- 
GenomicRanges::makeGRangesFromDataFrame(Diffbind) - ov <- GenomicRanges::countOverlaps(promoterData2,Diffbind2,type = "equal",maxgap=1) - promoterData3 <- promoterData[which(ov != 0),] - promoterData3$sample_id <- sampleName - return(promoterData3) -} - -DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction) { - # filters the promoter data based upon whether it matches a different peak and what direction the fold-change is - # direction can be: "both", "pos", "neg", "separate". If direction is NA, use "both". - Diffbind <- read.delim(DiffbindFile) - Diffbind <- Diffbind[which(Diffbind$FDR < 0.05),] - if ((direction == "both") | is.na(direction)) { - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) - } else if (direction == "pos") { - sampleName <- paste0(sampleName, "_pos") - Diffbind <- Diffbind[which(Diffbind$Fold > 0),] - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) - } else if (direction == "neg") { - sampleName <- paste0(sampleName, "_neg") - Diffbind <- Diffbind[which(Diffbind$Fold < 0),] - promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) - } else { - sampleNameP <- paste0(sampleName, "_pos") - DiffbindP <- Diffbind[which(Diffbind$Fold > 0),] - promoterDataP <- filterPromoter(DiffbindP, promoterData, sampleNameP) - sampleNameN <- paste0(sampleName, "_neg") - DiffbindN <- Diffbind[which(Diffbind$Fold < 0),] - promoterDataN <- filterPromoter(DiffbindN, promoterData, sampleNameN) - promoterData2 <- rbind(promoterDataP, promoterDataN) - } -return(promoterData2) -} - -createPromoterTable <- function(promoterData) { - # making final output table - PromoterTable <- data.frame( table(promoterData[,c("gene_id", "sample_id")] ) ) - PromoterTable2 <- merge( unique(promoterData[,c("gene_id", "gene_name")] ), PromoterTable) - PromoterTable3 <- tidyr::pivot_wider(PromoterTable2, names_from="sample_id", values_from="Freq") - return(PromoterTable3) -} - -promoterAnnotationByGene <- 
function(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) { - # the main function - if ( length(allhitsFiles) != length(sampleNames) ) { - print("Number of allhits files and sample names don't match.") - } else { - if ( (length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1) ) { - print("Number of allhits files and diffbind files don't match.") - } else { - if ( length(allhitsFiles) == 1 ) { - promoterData <- allhits2promoter(allhitsFiles) - if (is.na(diffbindFiles)) { - promoterData$sample_id <- sampleNames - } else { - promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction) - } - } else { - for ( a in 1:length(allhitsFiles) ) { - print(a) - tmpA <- allhits2promoter(allhitsFiles[a]) - if (sum(is.na(diffbindFiles)) ==1) { - tmpA$sample_id <- sampleNames[a] - } else { - tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction) - } - if (a == 1) { - promoterData <- tmpA - } else { - promoterData <- rbind(promoterData, tmpA) - } - } - } - } - promoterTable <- createPromoterTable(promoterData) - return(promoterTable) - } -} - -peakcallVersion <- function(inFolder,outFile) { -# currently only works for macs outputs -# inFolder here is the folder where the uropa output files are located - filesA <- list.files(path=inFolder,pattern="allhits.txt") - samples <- matrix(unlist(strsplit(filesA,"_macs")),ncol=2,byrow=T)[,1] - filesA <- list.files(path=inFolder,pattern="allhits.txt",full.names = T) - promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesA, sampleNames=samples) - write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) -} - -diffbindVersion <- function(inFolder,outFile) { -# currently designed for macs peaks, analyzed by deseq2 -# analyzing both positive and negative together for now -# inFolder here is the root working directory for the project - uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind") - diffbindFolder <- paste0(inFolder, 
"/DiffBind") - filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt") - samples <- matrix(unlist(strsplit(filesU,"-macs")),ncol=2,byrow=T)[,1] - filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt",full.names=T) - filesD <- list.files(path=diffbindFolder, pattern="Deseq2.txt",full.names=T,recursive=T) - promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesU, - sampleNames=samples, diffbindFiles=filesD, direction="both") - write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) -} diff --git a/workflow/scripts/significantPathways.R b/workflow/scripts/significantPathways.R deleted file mode 100755 index dd9af36..0000000 --- a/workflow/scripts/significantPathways.R +++ /dev/null @@ -1,127 +0,0 @@ -#################### -# -# Name: significantPathways.R -# Created by: Tovah Markowitz, PhD -# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) -# Research Technologies Branch/DIR/NIAID -# -# Created: August 9, 2022 -# Updated: October 28, 2022 to make reactomePA optional -# Updated: November 4, 2022 to accept a txt file or a gtf for the background genes -# also to accept a promoter annotation table and analyze every column -# -#################### -# -# Purpose: To take a list of genes and find the significant KEGG or Reactome -# pathways using overenrichment analysis. See details for specialized functionality. -# -# Requires: clusterProfiler, ReactomePA, enrichplot, org.Hs.eg.db, rtracklayer, ggplot2, and ggprism -# -# Details: Takes input gene lists as Ensembl gene IDs or gene symbols, converts to -# Entrez gene IDs, and runs ORA against KEGG or Reactome database. Requires -# a background gene list as cfChIP currently ignores chrs X, Y, and M. -# Outputs a dataframe of significant pathways, a pdf of the top most -# significant pathways, or a pdf of just the pathways of interest (if significant). 
-# -# Function: significantPathways(Genes, bkgGeneTXT, database="KEGG", PDFfile=NA, pathwayVector=NA) -# -# Variables: -# Genes: [Required] a vector of the genes to be analyzed through ORA -# bkgGeneFILE: [Required] a txt file containing a column of Ensembl IDs listing -# the appropriate background gene set or the gtf file used for the uropa -# annotations -# For example: hg19.ensembl.prot_coding.with_annotations.txt -# database: [Optional] whether to compare to the KEGG or Reactome database -# default: KEGG -# PDFfile: [Optional] name of the PDF file to create, if empty no PDF will be made -# pathwayVector: [Optional] a vector of pathways (descriptions or IDs) to plot in the pdf. -# If PDFfile is empty, it is ignored. If this is empty and PDFfile is not, -# pdf plot will be of the top 30 most significant pathways instead. -# -# Example usage: -# source("significantPathways.R") -# out <- significantPathways(Genes= c("GeneA","GeneB"), -# bkgGeneFILE= "hg19.ensembl.prot_coding.with_annotations.txt", -# database="KEGG", PDFfile="a.pdf", -# pathwayVector=c("pathwayA", "pathwayB")) -# -#################### - -library(clusterProfiler) -library(enrichplot) -library(ggplot2) -library(ggprism) - -makeBarplotTop <- function(inData, titleName, PDFfile) { - inDataCount <- sum(inData@result$p.adjust < 0.1) - if (inDataCount > 30) { inDataCount = 30 } - if (inDataCount > 0) { - pdf(PDFfile) - print(barplot(inData, showCategory = inDataCount, - label_format=70, title=titleName, x="GeneRatio") + - theme_prism(base_size =8) + theme(legend.title = element_text()) ) - } - dev.off() -} - -makeBarPlotSelect <- function(inData, titleName, PDFfile, categories) { - pdf(PDFfile) - print(barplot(inData, showCategory = categories, - label_format=70, title=titleName, x="GeneRatio") + - theme_prism(base_size = 8) + theme(legend.title = element_text()) ) - dev.off() - } - -processGenes <- function(geneIDs) { - if (grepl("^ENSG", geneIDs[1])) { - ensIDs <- gsub("\\.[0-9]+", "", geneIDs, perl=T) 
- entrezIDs <- bitr(ensIDs, from= "ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db") - } else { - entrezIDs <- bitr(ensIDs, from= "SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db") - } - entrezIDs <- entrezIDs$ENTREZID - return(entrezIDs) -} - -significantPathways <- function(Genes, bkgGeneFILE, database="KEGG", PDFfile=NA, pathwayVector=NA) { - sigGenes <- processGenes(Genes) - if (grepl("gtf",bkgGeneFILE)) { - bkgGenesData <- rtracklayer::import(bkgGeneFILE) - bkgGenes <- unique(bkgGenesData$gene_id) - } else { - bkgGenesData <- read.delim(bkgGeneFILE) - bkgGenes <- bkgGenesData[,grep("^ENSG", bkgGenesData[1,])] - } - backgroundGenes <- processGenes(bkgGenes) - if (database == "KEGG") { - pathwaySig <- enrichKEGG(sigGenes, organism= "hsa", keyType="kegg", universe=backgroundGenes, use_internal_data=TRUE) - pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),] - } else { - library(ReactomePA) - pathwaySig <- enrichPathway(sigGenes, readable=T, universe=backgroundGenes) - pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),] - } - if (!is.na(PDFfile)) { - if(length(pathwayVector) != 1) { - if (length(grep("HSA", pathwayVector, ignore.case=T)) != 0) { - pathwayVector <- pathwayData$Description[which(pathwayData$ID %in% pathwayVector)] - } - makeBarPlotSelect(inData=pathwaySig, titleName=database, PDFfile=PDFfile, categories=pathwayVector) - } else { - makeBarplotTop(inData=pathwaySig, titleName=database, PDFfile=PDFfile) - } - } - return(pathwayData) -} - -promoterAnnotationWrapper <- function(promoterFile, bkgGeneFILE, database="KEGG") { - promoterData <- read.delim(promoterFile) - outFolder <- dirname(promoterFile) - for (i in 3:ncol(promoterData)) { - colName <- names(promoterData)[i] - Genes <- promoterData$gene_id[which(promoterData[,i] > 0)] - outData <- significantPathways(Genes, bkgGeneFILE, database) - outFileName <- paste0(outFolder,"/",colName,"_",database,".txt") - write.table(outData, outFileName, quote=F, 
row.names=F, sep="\t") - } -} From 55ca7198c46ac641670d791bf8656fe362e8c151 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 8 Jul 2024 18:24:35 -0400 Subject: [PATCH 03/28] fix: correct diffbindedger output paths --- src/run.py | 6 +- workflow/Snakefile | 90 ++++++------- workflow/rules/dba.smk | 266 ++++++++++++++++++++----------------- workflow/scripts/common.py | 12 +- 4 files changed, 202 insertions(+), 172 deletions(-) diff --git a/src/run.py b/src/run.py index 34fc423..16be94e 100644 --- a/src/run.py +++ b/src/run.py @@ -19,7 +19,7 @@ from . import version as __version__ -def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'config']): +def init(repo_path, output_path, links=[], required=['workflow', 'bin', 'resources', 'config']): """Initialize the output directory. If user provides a output directory path that already exists on the filesystem as a file (small chance of happening but possible), a OSError is raised. If the @@ -207,7 +207,7 @@ def setup(sub_args, ifiles, repo_path, output_path): # Add other runtime info for debugging config['project']['version'] = __version__ config['project']['workpath'] = os.path.abspath(sub_args.output) - config['project']['binpath'] = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'bin')) + config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], '..', 'bin')) git_hash = git_commit_hash(repo_path) config['project']['git_commit_hash'] = git_hash # Add latest git commit hash config['project']['pipeline_path'] = repo_path # Add path to installation @@ -610,6 +610,8 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna dryrun_output = subprocess.check_output([ 'snakemake', '-npr', '-s', str(snakefile), + '--verbose', + '--debug-dag', '--use-singularity', '--rerun-incomplete', '--cores', str(256), diff --git a/workflow/Snakefile b/workflow/Snakefile index 026c56d..6d1bce7 100644 --- a/workflow/Snakefile +++ 
b/workflow/Snakefile @@ -85,62 +85,60 @@ if assay == "cfchip": join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools )) - # rule_all_ins.extend(expand( - # join(diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - # PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], - # name=contrasts, - # _type=peak_types - # )) + + if reps: rule_all_ins.extend(expand( join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) rule_all_ins.extend(expand( - join(diffbind_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], - name=contrasts, _type=peak_types + join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + name=contrasts, _type=["protTSS"] )) -elif assay in ["atac", "chip"]: - peak_types.extend(["prot", "protSEC", "genes"]) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) - if paired_end: - rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) - if assay == "chip": - rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) - rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) + + elif assay in ["atac", "chip"]: + peak_types.extend(["prot", "protSEC", "genes"]) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) if paired_end: - short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] - 
rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) - if assay == "atac": - rule_all_ins.extend(expand( - join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips - )) - if reps: - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types - )) - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - # rule_all_ins.extend(expand( - # join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), - # PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], - # name=contrasts, - # _type=peak_types - # )) - if contrast: + rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) + if assay == "chip": + rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) + rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) + if paired_end: + short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) + 
rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) + elif assay == "atac": rule_all_ins.extend(expand( - join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools + join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips )) + if reps: + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types + )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) + rule_all_ins.extend(expand( + join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], + name=contrasts, + _type=peak_types + )) + if contrast: + rule_all_ins.extend(expand( + join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools + )) + rule_all_ins.append(join(workpath,"multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) rule_all_ins.extend( diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 69aecc4..d767f57 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -1,18 +1,22 @@ # Differential binding analysis rules # ~~~~ -from os.path import join import os -from scripts.common import allocated, mk_dir_if_not_exist +import json +from os.path import join +from scripts.common import allocated, mk_dir_if_not_exist, test_combine from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction from scripts.blocking import test_for_block # ~~ workflow configuration workpath = 
config['project']['workpath'] +bin_path = config['project']['binpath'] genome = config['options']['genome'] blocks = config['project']['blocks'] groupdata = config['project']['groups'] - +contrast = config['project']['contrast'] +uropaver = config['tools']['UROPAVER'] +gtf = config['references'][genome]['GTFFILE'] # ~~ directories bin_path = join(workpath, "workflow", "bin") @@ -31,8 +35,6 @@ otherDirs = [qc_dir, homer_dir, uropa_dir] cfTool_dir = join(workpath, "cfChIPtool") cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") - - # ~~ workflow switches blocking = False if None in list(blocks.values()) else True if reps == "yes": otherDirs.append(diffbind_dir) @@ -86,136 +88,154 @@ contrastBlock = test_for_block(groupdata, contrast, blocks) zipGroup1B, zipGroup2B, zipToolCB, contrastsB = zip_contrasts(contrastBlock, PeakTools) # ~~ rules - rule diffbind: input: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: - html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), - EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), - EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), - Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), - EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), - Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), - html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) + 
html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), + EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), + EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), + Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), + EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), + Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), + html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) params: - rname = "diffbind", - rscript = join(workpath, "workflow", "scripts","DiffBind_v2_ChIPseq.Rmd"), - outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), - contrast = "{group1}_vs_{group2}", - csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"), - pythonscript = join(workpath,"workflow","scripts","prep_diffbind.py"), - PeakExtension= lambda w: PeakExtensions[w.PeakTool], - peakcaller= lambda w: FileTypesDiffBind[w.PeakTool], - group1="{group1}", - group2="{group2}", - PeakTool="{PeakTool}", - blocking=blocking, - blocking_rscript = join(workpath,"workflow","scripts","DiffBind_v2_ChIPseq_block.Rmd"), - outdir_block= join(workpath,diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}"), - Deseq2_block = provided(join(workpath, diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed"), blocking), - EdgeR_block = 
provided(join(workpath, diffbind_dir_block,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed"), blocking), + # variables and wildcards used in the shell directive + rname = "diffbind", + group1 = "{group1}", + group2 = "{group2}", + this_peaktool = "{PeakTool}", + this_contrast = "{group1}_vs_{group2}", + this_peakextension = lambda w: PeakExtensions[w.PeakTool], + peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], + # scripts in the bin directory used in the shell directive + rscript = join(bin_path, "DiffBind_v2_ChIPseq.Rmd"), + pythonscript = join(bin_path, "prep_diffbind.py"), + blocking_rscript = join(bin_path, "DiffBind_v2_ChIPseq_block.Rmd"), + # output base directories or full file locations + outdir = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}"), + csvfile = join( + diffbind_dir, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv" + ), + Deseq2_block = provided(join( + diffbind_dir_block, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_block.bed" + ), blocking), + EdgeR_block = provided(join( + diffbind_dir_block, + "{group1}_vs_{group2}-{PeakTool}", + "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_block.bed" + ), blocking), + outdir_block = join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}"), container: config['images']['cfchip'] - shell: """ - python {params.pythonscript} --g1 {params.group1} --g2 {params.group2} --wp {workpath} \ - --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \ - --pc {params.peakcaller} --csv {params.csvfile} - cp {params.rscript} {params.outdir} - cd {params.outdir} - Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))' - if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi - if [ ! 
-f {output.EdgeR} ]; then touch {output.EdgeR}; fi - - if [ '{params.blocking}' == True ]; then - echo "DiffBind with Blocking" - Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}", dir= "{params.outdir_block}"))' - if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi - if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi - fi - """ - - -if assay == "cfchip": - rule UROPA: - input: - lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] - output: - txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), - bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), - bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), - params: - rname="uropa", - uropaver = config['tools']['UROPAVER'], - fldr = join(uropa_dir, '{PeakTool1}'), - json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'), - outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), - gtf = config['references'][genome]['GTFFILE'], - threads = 4, - shell: """ - module load {params.uropaver}; - # Dynamically creates UROPA config file - if [ ! 
-e {params.fldr} ]; then mkdir {params.fldr}; fi - echo '{{"queries":[ ' > {params.json} - if [ '{wildcards.type}' == 'protTSS' ]; then - echo ' {{ "feature":"gene","distance":3000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":10000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }}],' >> {params.json} - fi - echo '"show_attributes":["gene_id", "gene_name","gene_type"],' >> {params.json} - echo '"priority":"Yes",' >> {params.json} - echo '"gtf":"{params.gtf}",' >> {params.json} - echo '"bed": "{input}" }}' >> {params.json} - uropa -i {params.json} -p {params.outroot} -t {params.threads} -s + shell: """ -else: - rule UROPA: - input: - lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] - output: - txt=join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), - bed1=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), - bed2=temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), - params: - rname="uropa", - uropaver = config['tools']['UROPAVER'], - fldr = join(uropa_dir, '{PeakTool1}'), - json = join(uropa_dir, '{PeakTool1}','{name}.{PeakTool2}.{type}.json'), - outroot = join(uropa_dir, '{PeakTool1}','{name}_{PeakTool2}_uropa_{type}'), - gtf = config['references'][genome]['GTFFILE'], - threads = 4, - shell: """ - module load {params.uropaver}; - # Dynamically creates UROPA config file - if [ ! 
-e {params.fldr} ]; then mkdir {params.fldr}; fi - echo '{{"queries":[ ' > {params.json} - if [ '{wildcards.type}' == 'prot' ]; then - echo ' {{ "feature":"gene","distance":5000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }}],' >> {params.json} - elif [ '{wildcards.type}' == 'genes' ]; then - echo ' {{ "feature":"gene","distance":5000 }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000 }}],' >> {params.json} - elif [ '{wildcards.type}' == 'protSEC' ]; then - echo ' {{ "feature":"gene","distance":[3000,1000],"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":3000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"end" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"center" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding" }}],' >> {params.json} - elif [ '{wildcards.type}' == 'protTSS' ]; then - echo ' {{ "feature":"gene","distance":[3000,1000],"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":10000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }},' >> {params.json} - echo ' {{ "feature":"gene","distance":100000,"filter.attribute":"gene_type","attribute.value":"protein_coding","feature.anchor":"start" }}],' >> {params.json} + python {params.pythonscript} --g1 {params.group1} --g2 {params.group2} --wp {workpath} \ + --pt {params.this_peaktool} --pe {params.this_peakextension} --bd {bam_dir} \ + --pc {params.peakcaller} --csv 
{params.csvfile} + cp {params.rscript} {params.outdir} + cd {params.outdir} + Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", + params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.this_peaktool}"))' + if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi + if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi + if [ '{params.blocking}' == True ]; then + echo "DiffBind with Blocking" + Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", + params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))' + if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi + if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi fi - echo '"show_attributes":["gene_id", "gene_name","gene_type"],' >> {params.json} - echo '"priority":"Yes",' >> {params.json} - echo '"gtf":"{params.gtf}",' >> {params.json} - echo '"bed": "{input}" }}' >> {params.json} - uropa -i {params.json} -p {params.outroot} -t {params.threads} -s """ + +rule UROPA: + input: + lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ], + output: + txt = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), + bed1 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), + bed2 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), + params: + rname = "uropa", + fldr = join(uropa_dir, '{PeakTool1}'), + json = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'), + outroot = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}'), + threads: 4, + run: + # Dynamically creates UROPA config file + shell(f"module load {uropaver}") + if not os.path.exists("{params.fldr}"): + os.mkdir("{params.fldr}", mode=0o775) + + json_construct = dict() 
+ json_construct['queries'] = [] + json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"] + json_construct["priority"] = "Yes" + json_construct['gtf'] = gtf + json_construct['bed'] = "{input}" + + base_query = { + "feature": "gene", + "filter.attribute": "gene_type", + "attribute.value": "protein_coding", + "feature.anchor": "start" + } + + if assay == 'cfchip': + if '{type}' == 'protTSS': + for _d in (3000, 10000, 100000): + this_q = base_query.copy() + this_q['distance'] = _d + json_construct['queries'].append(this_q) + else: + if '{type}' == 'prot': + for _d in (5000, 100000): + this_q = base_query.copy() + del this_q["feature.anchor"] + this_q['distance'] = _d + json_construct['queries'].append(this_q) + elif '{type}' == 'genes': + this_query = {} + this_query['feature'] = 'gene' + for _d in (5000, 100000): + this_q = base_query.copy() + del this_q["feature.anchor"] + del this_q["filter.attribute"] + del this_q["attribute.value"] + this_q['distance'] = _d + json_construct['queries'].append(this_q) + elif '{type}' == 'protSEC': + # distance, feature.anchor + query_values = ( + ([3000, 1000], "start"), + (3000, "end"), + (100000, "center"), + (100000, None) + ) + for _distance, feature_anchor in query_values: + this_q = base_query.copy() + del this_q["feature.anchor"] + if feature_anchor: + this_q["feature.anchor"] = feature_anchor + this_q['distance'] = _d + json_construct['queries'].append(this_q) + elif '{type}' == 'protTSS': + for _d in ([3000, 1000], 10000, 100000): + this_q = base_query.copy() + this_q['distance'] = _d + json_construct['queries'].append(this_q) + with open('{params.json}', 'w') as jo: + json.dump(json_construct, jo, indent=4) + shell("uropa -i {params.json} -p {params.outroot} -t {threads} -s") + + rule manorm: input: bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 6feba1c..26f44b6 100644 --- 
a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -258,4 +258,14 @@ def get_fqscreen_outputs(paired_end, samples, qc_dir): outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)), outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)), outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)), - return outs \ No newline at end of file + return outs + + +def test_combine(one, two): + try: + three = one + two + except: + print(one) + print(two) + exit() + return three \ No newline at end of file From 21c4aea38e20ab96f11f5b741ad19fe4d002fc5d Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 8 Jul 2024 18:57:18 -0400 Subject: [PATCH 04/28] chore: add back bin scripts, correct pathing for bin scripts, formatting --- bin/DiffBind_v2_ChIPseq.Rmd | 260 +++++++++++++++++++++++++++++ bin/DiffBind_v2_ChIPseq_block.Rmd | 267 ++++++++++++++++++++++++++++++ bin/DiffBind_v2_cfChIP_QC.Rmd | 204 +++++++++++++++++++++++ bin/FRiP_plot.R | 112 +++++++++++++ bin/atac_nrf.py | 22 +++ bin/bam_filter_by_mapq.py | 40 +++++ bin/cfChIP_signatures.R | 97 +++++++++++ bin/frip.py | 164 ++++++++++++++++++ bin/jaccard_score.py | 202 ++++++++++++++++++++++ bin/ppqt_process.py | 27 +++ bin/prep_diffbind.py | 54 ++++++ bin/prep_diffbindQC.py | 51 ++++++ bin/promoterAnnotation_by_Gene.R | 179 ++++++++++++++++++++ bin/significantPathways.R | 127 ++++++++++++++ workflow/Snakefile | 1 + workflow/rules/cfChIP.smk | 153 +++++++++-------- workflow/rules/qc.smk | 64 +++---- workflow/scripts/common.py | 12 +- 18 files changed, 1925 insertions(+), 111 deletions(-) create mode 100644 bin/DiffBind_v2_ChIPseq.Rmd create mode 100644 bin/DiffBind_v2_ChIPseq_block.Rmd create mode 100644 bin/DiffBind_v2_cfChIP_QC.Rmd create mode 100644 bin/FRiP_plot.R create mode 100644 bin/atac_nrf.py create mode 100644 bin/bam_filter_by_mapq.py create mode 100755 bin/cfChIP_signatures.R create mode 100644 
bin/frip.py create mode 100644 bin/jaccard_score.py create mode 100644 bin/ppqt_process.py create mode 100644 bin/prep_diffbind.py create mode 100644 bin/prep_diffbindQC.py create mode 100755 bin/promoterAnnotation_by_Gene.R create mode 100755 bin/significantPathways.R diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd new file mode 100644 index 0000000..031799d --- /dev/null +++ b/bin/DiffBind_v2_ChIPseq.Rmd @@ -0,0 +1,260 @@ +--- +title: "DiffBind: ChIP-seq pipeline" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macs" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples,consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) +``` + +## Overlapping peak counts +Number of overlapping peaks. 
+If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples,1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") + try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) + try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r peaksORsummits} +if ( grepl("narrow",samples$samples$Peaks[1]) ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if ( grepl("broad",samples$samples$Peaks[1]) ) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +if (summits == TRUE) { + DBdataCounts <- dba.count(samples, summits=250) +} else { + DBdataCounts <- dba.count(samples) +} +print(DBdataCounts) +outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") +consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) +consensus2$name <- paste0("Peak",1:length(consensus2)) +#rtracklayer::export(consensus2,outfile2) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples 
+```{r heatmap2} +try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION) +print(DBdatacontrast) +``` + +# Differential Analysis +This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most +projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are +not changing between the two conditions. EdgeR also assumes that there are equal numbers +of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR +is especially useful when this assumption is true or when there are large differences in +library size across samples. All concentrations are on log2 scale. 
+ +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +``` + +```{r report} +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) +``` + +## PCA {.tabset .tabset-fade} +Variance of differential peaks only + +### DeSeq2 {-} +```{r PCA3, fig.height=5,fig.width=5} +try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE) +``` + +### EdgeR {-} +```{r PCA4, fig.height=5,fig.width=5} +try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE) +``` + +## MA plot {.tabset .tabset-fade} +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. + +### DeSeq2 {-} +```{r MA_D} +try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) +``` + +### EdgeR {-} +```{r MA_E} +try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) +``` + +## Volcano plot {.tabset .tabset-fade} +Each dot is a consensus peak. 
+ +### DeSeq2 {-} +```{r Volcano1} +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) +``` + +### EdgeR {-} +```{r Volcano2} +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) +``` + +## Heatmap: Differential {.tabset .tabset-fade} +1000 most significant differential peaks (Deseq2 or EdgeR normalized) + +### DeSeq2 {-} +```{r heatmap4D} +try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +``` + +### EdgeR {-} +```{r heatmap4E} +try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER,correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +``` + +## Top 500 differentially bound peaks {.tabset .tabset-fade} +### DeSeq2 {-} +```{r Deseq2Report} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.txt") +outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2.bed") +DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) +try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) +write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) +D2i <- length(DBReportDeseq2) +if (D2i == 0) { + i=1 +} else if (D2i > 500) { + i=500 +} else { + i=D2i +} +try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) + +report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) +outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_fullList.txt") +write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +``` + +### EdgeR {-} +```{r EdgeRReport} +outfile <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.txt") +outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR.bed") +DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) +try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) +write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F) +Ei <- length(DBReportEdgeR) +if (Ei == 0) { + 
i=1 +} else if (Ei > 500) { + i=500 +} else { + i=Ei +} +try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) + +report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER,th=100,bNormalized=T,bFlip=FALSE,precision=0,bCalled=T) +outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_fullList.txt") +write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +``` + +## R tool version information +```{r Info} +sessionInfo() +``` + +
diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd new file mode 100644 index 0000000..2a508b5 --- /dev/null +++ b/bin/DiffBind_v2_ChIPseq_block.Rmd @@ -0,0 +1,267 @@ +--- +title: "DiffBind: ChIP-seq pipeline, paired/blocked analysis" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macs" + dir: "/path/to/DiffBindBlock/directory" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +``` + +**Groups being compared:** + *`r contrasts`* +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +knitr::opts_knit$set(root.dir=params$dir) +suppressMessages(library(DT)) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples,consensus=DBA_CONDITION) +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) +``` + +## Overlapping peak counts +Number of overlapping peaks. +If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. 
This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples,1:nrow(samples$samples)) +} else { + dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") + try(dba.plotVenn(samples,samples$masks[[3]],main="Binding Site Overlaps: samples in Group1"),silent=TRUE) + try(dba.plotVenn(samples,samples$masks[[4]],main="Binding Site Overlaps: samples in Group2"),silent=TRUE) +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r peaksORsummits} +if ( grepl("narrow",samples$samples$Peaks[1]) ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if ( grepl("broad",samples$samples$Peaks[1]) ) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} +if (summits == TRUE) { + DBdataCounts <- dba.count(samples, summits=250) +} else { + DBdataCounts <- dba.count(samples) +} +print(DBdataCounts) +outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_consensusPeaks.bed") +consensus2 <- dba.peakset(DBdataCounts,bRetrieve=T) +consensus2$name <- paste0("Peak",1:length(consensus2)) +#rtracklayer::export(consensus2,outfile2) +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} +try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks 
(library-size normalized counts) +```{r heatmap3} +try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) +``` + +# Set Up Contrast +Contrast is Group1 - Group2. +```{r contrast} +DBdatacontrast <- dba.contrast(DBdataCounts, minMembers=2, categories = DBA_CONDITION, + block=DBA_TREATMENT) +print(DBdatacontrast) +``` + +# Differential Analysis +This report shows the differential analysis with two tools: Deseq2 and EdgeR. For most +projects, Deseq2 is the optimal tool. Both tools assume that the majority of peaks are +not changing between the two conditions. EdgeR also assumes that there are equal numbers +of peaks on each side of the contrast, so it normalizes the data more than Deseq2. EdgeR +is especially useful when this assumption is true or when there are large differences in +library size across samples. All concentrations are on log2 scale. + +```{r analyze} +DBAnalysisDeseq2 <- dba.analyze(DBdatacontrast, method = DBA_DESEQ2) +DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) +``` + +```{r report} +DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK) +DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK) +``` + +## PCA {.tabset .tabset-fade} +Variance of differential peaks only + +### DeSeq2 {-} +```{r PCA3, fig.height=5,fig.width=5} +try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2_BLOCK),silent=TRUE) +``` + +### EdgeR {-} +```{r PCA4, fig.height=5,fig.width=5} +try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER_BLOCK),silent=TRUE) +``` + +## MA plot {.tabset .tabset-fade} +"Log concentration" means average concentration across all samples. +Each dot is a consensus peak. 
+ +### DeSeq2 {-} +```{r MA_D} +try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) +``` + +### EdgeR {-} +```{r MA_E} +try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) +``` + +## Volcano plot {.tabset .tabset-fade} +Each dot is a consensus peak. + +### DeSeq2 {-} +```{r Volcano1} +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2_BLOCK),silent=TRUE) +``` + +### EdgeR {-} +```{r Volcano2} +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER_BLOCK),silent=TRUE) +``` + +## Heatmap: Differential {.tabset .tabset-fade} +1000 most significant differential peaks (Deseq2 or EdgeR normalized) + +### DeSeq2 {-} +```{r heatmap4D} +try(dba.plotHeatmap(DBAnalysisDeseq2,contrast=1,method = DBA_DESEQ2_BLOCK, + correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +``` + +### EdgeR {-} +```{r heatmap4E} +try(dba.plotHeatmap(DBAnalysisEdgeR,contrast=1,method = DBA_EDGER_BLOCK, + correlations=FALSE,margin=20,cexRow=1,cexCol=1),silent=TRUE) +``` + +## Top 500 differentially bound peaks {.tabset .tabset-fade} +### DeSeq2 {-} +```{r Deseq2Report} +outfile <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.txt") +outfile2 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block.bed") +DBReportDeseq2$name <- paste0("Peak",1:length(DBReportDeseq2)) +try(rtracklayer::export(DBReportDeseq2, outfile2),silent=TRUE) +write.table(DBReportDeseq2, outfile, quote=F, sep="\t", row.names=F) +D2i <- length(DBReportDeseq2) +if (D2i == 0) { + i=1 +} else if (D2i > 500) { + i=500 +} else { + i=D2i +} +try(DT::datatable(data.frame(DBReportDeseq2)[1:i,], rownames=F),silent=TRUE) + +report2 <- dba.report(DBAnalysisDeseq2,method = DBA_DESEQ2_BLOCK, + th=100,bNormalized=T,bFlip=FALSE,precision=0) +outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_Deseq2_block_fullList.txt") +write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +``` + +### EdgeR {-} +```{r EdgeRReport} +outfile <- paste0(contrasts, "-", 
peakcaller,"_Diffbind_EdgeR_block.txt") +outfile2 <- paste0(contrasts, "-", peakcaller,"_Diffbind_EdgeR_block.bed") +DBReportEdgeR$name <- paste0("Peak",1:length(DBReportEdgeR)) +try(rtracklayer::export(DBReportEdgeR, outfile2),silent=TRUE) +write.table(DBReportEdgeR, outfile, quote=F, sep="\t", row.names=F) +Ei <- length(DBReportEdgeR) +if (Ei == 0) { + i=1 +} else if (Ei > 500) { + i=500 +} else { + i=Ei +} +try(DT::datatable(data.frame(DBReportEdgeR)[1:i,], rownames=F),silent=TRUE) + +report2 <- dba.report(DBAnalysisEdgeR,method = DBA_EDGER_BLOCK, + th=100,bNormalized=T,bFlip=FALSE,precision=0) +outfile3 <- paste0(contrasts, "-", peakcaller, "_Diffbind_EdgeR_block_fullList.txt") +write.table(report2, outfile3, quote=F, sep="\t", row.names=F) +``` + +## R tool version information +```{r Info} +sessionInfo() +``` + +
\ No newline at end of file diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd new file mode 100644 index 0000000..d058cec --- /dev/null +++ b/bin/DiffBind_v2_cfChIP_QC.Rmd @@ -0,0 +1,204 @@ +--- +title: "DiffBind: cfChIP-seq QC" +output: + html_document: + toc: true + toc_float: + collapsed: false + number_sections: true + toc_depth: 3 + fig_width: 7 + fig_height: 6 +params: + csvfile: samplesheet.csv + contrasts: "group1_vs_group2" + peakcaller: "macs" +--- + + + +```{r, include=FALSE, warning=FALSE, message=FALSE} +## grab args +dateandtime<-format(Sys.time(), "%a %b %d %Y - %X") + +csvfile <- params$csvfile +contrasts <- params$contrasts +peakcaller <- params$peakcaller +``` + +**Peak sources:** + *`r peakcaller`* +**Report generated:** + *`r dateandtime`* + +```{r setup, echo=FALSE, warning=FALSE,message=FALSE} +knitr::opts_chunk$set(echo = FALSE, include=TRUE, message=FALSE, warning=FALSE, error=FALSE) +suppressMessages(library(DiffBind)) +suppressMessages(library(parallel)) +suppressMessages(library(dplyr)) +suppressMessages(library(tidyr)) +suppressMessages(library(umap)) +suppressMessages(library(ggplot2)) +suppressMessages(library(ggrepel)) +``` + +# Peak Data +Read in sample sheet information and peak information +```{r samples} +samples <- dba(sampleSheet=csvfile) + +# if samples have Condition values +if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { + consensus <- dba.peakset(samples,consensus=DBA_CONDITION, minOverlap = min(table(samples$samples$Condition))) +} +print(samples) +``` + +## Correlation heatmap: Only peaks +Pearson correlation of peak positions: all samples versus all samples +```{r heatmap1} +try(dba.plotHeatmap(samples,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Only peaks +Variance of peak positions +```{r PCA1, fig.height=5,fig.width=5} +try(dba.plotPCA(samples),silent=TRUE) +``` + +## Overlapping peak counts +Number of overlapping peaks. 
+If the number of samples is greater than 4, a "consensus" peak Venn diagram is created, where +the consensus peak set are the peaks identified in at least 2 samples for that condition. This is different +from the consensus peak set used for differential analyses. +```{r Venn, fig_height=4} +if (nrow(samples$samples) < 5) { + dba.plotVenn(samples,1:nrow(samples$samples)) +} else { + if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { + dba.plotVenn(consensus,consensus$masks$Consensus,main="Binding Site Overlaps: 'consensus', comparing between groups") + } else { + print("Consensus peaks were not called") + } +} +``` + +# Consensus peaks and counts +Consensus peaks are peaks found in at least two samples, independent of condition. +FRiP is of consensus peaks and will not match FRiP values calculated outside of this tool. +```{r peaksORsummits} +if ( grepl("narrow",samples$samples$Peaks[1]) ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if ( grepl("broad",samples$samples$Peaks[1]) ) { + summits <- FALSE + print ("Broad peak calling tool.") + print ("Differential peaks are consensus peaks.") +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} +``` + +```{r DBcount} + +if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { + minOv <- min(table(samples$samples$Condition)) +} else { + minOv <- floor(ncol(samples$class)/3) +} + +print(paste0("The minimum number of overlaps is: ", minOv)) + +if (summits == TRUE) { + DBdataCounts <- dba.count(samples, summits=250, minOverlap = minOv) +} else { + DBdataCounts <- dba.count(samples, minOverlap = minOv) +} +print(DBdataCounts) + +``` + +## Correlation heatmap: Peaks and reads +Pearson correlation of library-size normalized counts of consensus peaks: all samples versus all samples +```{r heatmap2} 
+try(dba.plotHeatmap(DBdataCounts,main="",cexRow=1,cexCol=1),silent=TRUE) +``` + +## Heatmap: Average signal across each peak +1000 most variable consensus peaks (library-size normalized counts) +```{r heatmap3} +try(dba.plotHeatmap(DBdataCounts,correlations=FALSE,cexRow=1,cexCol=1),silent=TRUE) +``` + +## PCA: Peaks and reads +Variation of library-size normalized counts of consensus peaks +```{r PCA2, fig.height=5,fig.width=5} +try(dba.plotPCA(DBdataCounts),silent=TRUE) +``` + +```{r TMM} +vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID) +consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized counts + as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% + dplyr::select(1:4, Peaks, samples$samples$SampleID) + +outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv") +write.csv(consensus2, outfile1, row.names = F) + +outfile2 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.bed") +write.table(consensus2[,c("seqnames","start","end","Peaks")], + outfile2, quote=F, sep="\t", row.names=F, col.names=F) + +counts_TMM_ALL <- consensus2 +rownames(counts_TMM_ALL) <- counts_TMM_ALL$Peaks +counts_TMM_ALL$Peaks <- NULL + +counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>% + t() %>% log10() %>% as.data.frame(.) 
+##UMAP coordinates +set.seed(123) +if (nrow(samples$samples) < 16) { + umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1) +} else { + umap_coord <- umap(counts_TMM_ALL) +} +umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2")) + +outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv") +write.csv(umap_coord, outfile, row.names = F) +``` + +## UMAP: peaks and reads +```{r UMAP_plot} +p <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2, label = samples$samples$SampleID))+ ##With labels + geom_point(aes(color=samples$samples$Condition), size = 3) + + theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) + + theme(plot.title = element_text(hjust = 0.5)) + + labs(color = "Phenotypes") + theme(text=element_text(size=15))+ + geom_text_repel(point.size = NA, size = 2.5) +q <- ggplot(umap_coord,aes(x = UMAP1, y = UMAP2)) + ##No labels + geom_point(aes(color=samples$samples$Condition), size = 3) + + theme_bw()+ ggtitle(paste0("log-transformed counts:", "n = ", nrow(umap_coord))) + + theme(plot.title = element_text(hjust = 0.5)) + + labs(color ="Phenotypes") + theme(text=element_text(size=15)) + ##geom_text_repel(point.size = NA, size = 2.5) +p + +if ( sum(samples$class["Condition",] != "") == ncol(samples$class) ) { +q +} +``` + +## R tool version information +```{r Info} +sessionInfo() +``` + +
diff --git a/bin/FRiP_plot.R b/bin/FRiP_plot.R new file mode 100644 index 0000000..8a81f49 --- /dev/null +++ b/bin/FRiP_plot.R @@ -0,0 +1,112 @@ +## FRIP_plot.R +## Created by Tovah Markowitz +## June 19, 2020 +## Updated: Jan 19, 2022 +## Updated: November 4, 2022 + +args <- commandArgs(trailingOnly = TRUE) +folder <- args[1] + +library(ggplot2) +library(rjson) + +merge_files <- function(folder) { + files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt", + full.names=T) + allList <- lapply(files,read.table,header=T) + allData <- do.call(rbind.data.frame, allList) + write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F, + row.names=F, sep="\t") + return(allData) +} + +plot_barplots <- function(inData, groupName, folder) { + p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample)) + p <- p + geom_bar(position="dodge",stat = "identity") + + facet_wrap(.~bedtool) + + theme_bw() + + theme(axis.text.x=element_text(angle = -15, hjust = 0)) + + labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)", + fill ="peak file") + pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf")) + print(p) + dev.off() +} + +plot_scatterplots <- function(inData, groupName, folder) { + p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool)) + p <- p + geom_point(size=2.5) + + facet_wrap(.~bamsample) + + theme_bw() + + scale_x_continuous(trans = "log10") + + labs(title=groupName, x="Number of Bases in Peaks (M)", + y="Fraction of Reads in Peaks (FRiP)", + shape="peak file", color="peak calling tool") + q <- p + annotation_logticks(sides="b") + pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf")) + tryCatch(print(q), error = function(e) {print(p)}) + dev.off() +} + +plot_barplots_self <- function(inData2, folder) { + p <- ggplot(inData2,aes(x=bamsample, y=FRiP, fill=groupInfo)) + p <- p + geom_bar(position="dodge",stat = "identity") + + facet_wrap(.~bedtool) + + theme_bw() + +
theme(axis.text.x=element_text(angle = -15, hjust = 0)) + + labs(title="All Samples",x="bam file", y ="Fraction of Reads in Peaks (FRiP)", + fill ="Group") + pdf(paste0(folder, "/PeakQC/FRiP_barplot.pdf")) + print(p) + dev.off() +} + +plot_scatterplots_self <- function(inData2, folder) { + p <- ggplot(inData2,aes(x=n_basesM, y=FRiP, shape=bedtool, color=groupInfo)) + p <- p + geom_point(size=2.5) + + theme_bw() + + scale_x_continuous(trans = "log10") + + annotation_logticks(sides="b") + + labs(title="All samples", x="Number of Bases in Peaks (M)", + y="Fraction of Reads in Peaks (FRiP)", + shape="peak file", color="peak calling tool") + pdf(paste0(folder, "/PeakQC/FRiP_scatterplot.pdf")) + print(p) + dev.off() +} + +process_json <- function(injson) { +# to get the identities of the groups and the list of samples (ChIP and input) +# associated with it + json <- fromJSON(file = injson) + groupsInfo <- json$project$groups + inputs <- as.data.frame(json$project$peaks$inputs) + for (i in 1:length(groupsInfo)) { + tmp <- unique(unlist(inputs[names(inputs) %in% groupsInfo[[i]]])) + if (length(tmp) > 1) { + groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp)) + } else if (tmp != "" ) { + groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp)) + } + } + return(groupsInfo) +} + +allData <- merge_files(folder) +groupList <- process_json(paste0(folder,"/config.json")) + +for (i in 1:length(groupList)) { + group <- groupList[[i]] + groupName <- names(groupList)[i] + inData <- allData[which((allData$bedsample %in% group) & + (allData$bamsample %in% group)),] + plot_barplots(inData, groupName, folder) + plot_scatterplots(inData, groupName, folder) +} + +selfData <- allData[which(allData$bedsample == allData$bamsample),] +groupInfo <- reshape2::melt(groupList) +names(groupInfo) <- c("bamsample","groupInfo") +selfData2 <- merge(selfData,groupInfo) +plot_barplots_self(selfData2, folder) +plot_scatterplots_self(selfData2, folder) diff --git a/bin/atac_nrf.py b/bin/atac_nrf.py new 
file mode 100644 index 0000000..edf21aa --- /dev/null +++ b/bin/atac_nrf.py @@ -0,0 +1,22 @@ +from __future__ import print_function +import sys + +preseq_log=sys.argv[1] + +with open(preseq_log, 'r') as fp: + for line in fp: + if line.startswith('TOTAL READS'): + tot_reads = float(line.strip().split("= ")[1]) + elif line.startswith('DISTINCT READS'): + distinct_reads = float(line.strip().split('= ')[1]) + elif line.startswith('1\t'): + one_pair = float(line.strip().split()[1]) + elif line.startswith('2\t'): + two_pair = float(line.strip().split()[1]) + +NRF = distinct_reads/tot_reads +PBC1 = one_pair/distinct_reads +PBC2 = one_pair/two_pair + +print("%.3f\t%.3f\t%.3f"%(NRF,PBC1,PBC2)) + diff --git a/bin/bam_filter_by_mapq.py b/bin/bam_filter_by_mapq.py new file mode 100644 index 0000000..12037cd --- /dev/null +++ b/bin/bam_filter_by_mapq.py @@ -0,0 +1,40 @@ +import pysam,sys +import argparse + +parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values') +parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File') +parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File') +parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... 
default 6',default=6) +args = parser.parse_args() + +samfile = pysam.AlignmentFile(args.inBam, "rb") +mapq=dict() +for read in samfile.fetch(): + if read.is_unmapped: + continue + if read.is_supplementary: + continue + if read.is_secondary: + continue + if read.is_duplicate: + continue + if read.is_proper_pair: + if read.mapping_quality < args.mapQ and read.query_name in mapq: + del mapq[read.query_name] + if read.mapping_quality >= args.mapQ and not read.query_name in mapq: + mapq[read.query_name]=1 +samfile.close() + +samfile = pysam.AlignmentFile(args.inBam, "rb") +pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile) +for read in samfile.fetch(): + if read.query_name in mapq: + if read.is_supplementary: + continue + if read.is_secondary: + continue + if read.is_duplicate: + continue + pairedreads.write(read) +samfile.close() +pairedreads.close() diff --git a/bin/cfChIP_signatures.R b/bin/cfChIP_signatures.R new file mode 100755 index 0000000..778b75a --- /dev/null +++ b/bin/cfChIP_signatures.R @@ -0,0 +1,97 @@ +#################### +# +# Name: cfChIP_signatures.R +# Created by: Tovah Markowitz, PhD +# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) +# Research Technologies Branch/DIR/NIAID +# +# Created: August 9, 2022 +# +#################### +# +# Purpose: To take the individual cfChIP signature tables, combine them, +# and create the preferred output plot +# +# Functions: mergeSignatures and plotSignatures +# +# Requires: ggplot2 and ggprism (for plotting) +# +# Details: mergeSignatures will take a folder of signatures and combine them +# into one long table. plotSignatures can directly take the output +# of mergeSignatures, but you can also load the data into R and filter +# to only include a subset of samples before running the function. Also, +# add a column called "Condition" either to the input txt file or the R +# object to group columns in the plot using that additional information.
+# +# Function1: mergeSignatures(folder, outFile) +# folder: [required] the path to the folder containing the individual signature +# files direct from the cfChIP tool +# outFile: [required] the name of the output txt file to save the data +# +# Function2: plotSignatures(inTXT, outPDF) +# inTXT: [required] either the name of the file from mergeSignatures or +# an R object containing the data. Column names must match those of the +# output of mergeSignatures, but column order doesn't matter +# outPDF: [required] the name of the output pdf file to create +# +# Example usage: +# source("cfChIP_signatures.R") +# mergeSignatures("cfChIPtool/Output/H3K4me3/Signatures/", out.txt) +# plotSignatures(out.txt, out.pdf) +# plotSignatures(signatureDataFrame, out.pdf) +# +#################### + +mergeSignatures <- function(folder, outFile) { + files <- list.files(folder,full.names = T) + sigList <- lapply(files, read.csv) + samples <- gsub(".csv","",grep("csv",unlist(strsplit(files,"/")),value=T)) + for (i in 1:length(samples)) { + sigList[[i]] <- data.frame(sigList[[i]],Sample=samples[i]) + } + sigData <- do.call("rbind",sigList) + write.table(sigData, outFile, quote=F, sep="\t", row.names=F) +} + +plotSignatures <- function(inTXT, outPDF) { + library(ggplot2) + library(ggprism) + + if (mode(inTXT) == "character") { # if using a file name + sigData <- read.delim(inTXT) + } else { # if starting with an object in R + sigData <- as.data.frame(inTXT) + } + sigData$NormalizedCounts[which(sigData$NormalizedCounts > 3)] <- 3 + sigData$NormalizedCounts[which(sigData$NormalizedCounts < 0.15)] <- 0.15 + sigData$qValue[which(sigData$qValue > 300)] <- 300 + sigData$qValue[which(sigData$qValue < 5)] <- NA + names(sigData)[1] <- "cellType" + + cellTypes <- data.frame(cellType=c("Neutrophils","Monocytes","Megakaryocyte", + "Erythroblast","T-Cells","B-Cells","NK", + "Vasculary","Adipose","Skin","Sk.
Muscle", + "Brain","Heart","Lung","Breast","Digestive", + "Pancreas"), + class=c(rep("Blood",7),rep("Global",4),rep("Other",6)) ) + + sigData2 <- merge(sigData,cellTypes) + sigData2$cellType <- factor(sigData2$cellType,levels=rev(cellTypes$cellType)) + + pdf(outPDF) + p <- ggplot(data=sigData2,aes(x=Sample,y=cellType,color=NormalizedCounts,size=qValue)) + p <- p + geom_point() + + scale_size(limits=c(5,300),breaks=c(5,50,100,150,200,250,300), + labels=paste0("e-",c(5,50,100,150,200,250,300))) + + scale_color_viridis_c(direction = -1, option="A") + + theme_prism() + + theme(axis.title.y = element_blank(), axis.title.x = element_blank(), + axis.text.x = element_text(angle = 45,vjust=0.9,hjust=1)) + if (sum(names(sigData) == "Condition") == 1) { + p <- p + facet_grid(rows=vars(class),cols=vars(Condition),scales="free",space="free") + } else { + p <- p + facet_grid(rows=vars(class),scales="free",space="free") + } + print(p) + dev.off() +} \ No newline at end of file diff --git a/bin/frip.py b/bin/frip.py new file mode 100644 index 0000000..113eb62 --- /dev/null +++ b/bin/frip.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 + +""" +Name: frip.py +Created by: Tovah Markowitz +Date: 06/18/20 + +Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs +Currently only works with python/3.5 +""" + +########################################## +# Modules +import argparse +from argparse import RawTextHelpFormatter +from pybedtools import BedTool +import pysam +import pandas as pd + +########################################## +# Functions + +def split_infiles(infiles): + """ + breaks the infile string with space-delimited file names and + creates a list + """ + infileList = infiles.strip("\'").strip('\"').split(" ") + if len(infileList) == 1: + infileList = infileList[0].split(";") + return(infileList) + +def count_reads_in_bed(bam, bedfile, genomefile): + """ + some of this comes directly from the pybedtools site; read in + bed (or bed-like) file, 
sort it, and then count the number of + reads within the regions + """ + bedinfo = BedTool(bedfile) + bedinfo.sort(g=genomefile) + return ( + BedTool(bam).intersect( bedinfo, bed=True, stream=True, ) + ).count() + +def count_reads_in_bam(bam): + """ count the number of reads in a given bam file """ + return( pysam.AlignmentFile(bam).mapped ) + +def calculate_frip(nreads, noverlaps): + """ calculate FRiP score from nreads and noverlaps """ + return( float(noverlaps) / nreads ) + +def measure_bedfile_coverage(bedfile, genomefile): + """ calculate the number of bases covered by a given bed file """ + bedinfo = BedTool(bedfile) + return( bedinfo.sort(g=genomefile).total_coverage() ) + +def clip_bamfile_name(bamfile): + """ + clip bam file name for table/plotting purposes; assumes file + naming system matches that of Pipeliner + """ + sample = bamfile.split("/")[-1].split(".")[0] + condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1]) + return( sample, condition ) + +def clip_bedfile_name(bedfile,filetype): + """ + clip bed file name for table/plotting purposes; assumes file + naming system matches that of Pipeliner + """ + if filetype == "": + toolused = bedfile.split("/")[-3] + sample = bedfile.split("/")[-2] + else: + toolused = filetype + sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") + return( toolused, sample ) + +def process_files(bamfile, bedfiles, genome, filetypes): + """ + this is the main function to take in list of input files and + put out an array containing key file name information, read + counts, and FRiP scores + """ + bedfileL = bedfiles + filetypesL = filetypes + out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", + "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]] + nreads = count_reads_in_bam(bamfile) + (bamsample, condition) = clip_bamfile_name(bamfile) + for i in range(len(bedfileL)): + bed = bedfileL[i] + if len(filetypesL) > 1: + filetype = filetypesL[i] + else: + filetype = filetypesL[0] 
+ (bedtool, bedsample) = clip_bedfile_name(bed,filetype) + noverlaps = count_reads_in_bed(bamfile, bed, genome) + frip = calculate_frip(nreads, noverlaps) + nbases = measure_bedfile_coverage(bed, genome) / 1000000 + out.append( [bedtool, bedsample, bamsample, condition, + nreads, noverlaps, frip, nbases] ) + out2 = pd.DataFrame(out[1:], columns=out[0]) + return(out2) + +def create_outfile_name(bamfile, outroot): + """ uses outroot to create the output file name """ + (bamsample, condition) = clip_bamfile_name(bamfile) + outtable = bamsample + "." + condition + "." + "FRiP_table.txt" + if outroot != "": + outtable = outroot + "." + outtable + return(outtable) + +def write_table(out2, outtable): + out2.to_csv(outtable,sep='\t',index=False) + + +############################################### +# Main + +def main(): + desc=""" +This function takes a space-delimited or semi-colon delimited list +of bed-like files (extensions must be recognizable by bedtools) +and a single bam file. It will then calculate the FRiP score for +all possible combinations of files and save the information in a +txt file. It will also calculate the number of bases covered by +each bed-like file. Note: this function assumes that the file +naming system of the input files matches that of Pipeliner. + """ + + parser = argparse.ArgumentParser(description=desc, formatter_class=RawTextHelpFormatter) + parser.add_argument('-p', nargs = '+', required=True, type=str, help='A space- or semicolon-delimited list of peakfiles \ +(or bed-like files).') + parser.add_argument('-b', required=True, type=str, help='The name of a bamfile to analyze.') + parser.add_argument('-g', required=True, type=str, help='The name of the .genome file so bedtools knows the \ +size of every chromosome.') + parser.add_argument('-o', required=True, type=str, help='The root name of the multiple output files. 
Default:""') + parser.add_argument('-t', required=False, default=[""], type=list, help='A space- \ +or semicolon-delimited list of input file sources/types. Only needed when \ +source of bed file is not built into the script. Default: ""') + + args = parser.parse_args() + bedfiles = args.p + bamfile = args.b + genomefile = args.g + outroot = args.o + filetypes = args.t + + out2 = process_files(bamfile, bedfiles, genomefile, filetypes) + outtable = create_outfile_name(bamfile, outroot) + write_table(out2, outtable) + +if __name__ == '__main__': + main() + +############################################### +# example cases + +#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak" +#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam" +#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes" +#out2 = pd.read_csv("FRIP_test.txt",sep="\t") diff --git a/bin/jaccard_score.py b/bin/jaccard_score.py new file mode 100644 index 0000000..378ee69 --- /dev/null +++ b/bin/jaccard_score.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 + +""" +Name: jaccard_score.py +Created by: Tovah Markowitz +Date: 1/23/19 +Updated: 8/5/19 to compare multiple tools and create plots + +Purpose: To do all pairwise comparisons of bed/peak files given. Uses bedtools +to calculate a jaccard score for every comparison. All data is saved in a +single tab-delimited file. +""" + +########################################## +# Modules +import optparse +from pybedtools import BedTool +import pandas as pd +from sklearn.decomposition import PCA as sklearnPCA +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt +import seaborn as sns + +########################################## +# Functions + +def split_infiles(infiles): + """ breaks the infile string with space-delimited file names and creates a list. 
+ also works for infile types + """ + infileList = infiles.strip("\'").strip('\"').split(" ") + if len(infileList) == 1: + infileList = infileList[0].split(";") + return(infileList) + +def loop_jaccard(infileList, genomefile, filetypeList): + """ Uses two loops to do all possible pairwise comparisons of files + in a list. Returns a writeable output and a pandas object + """ + nfiles = len(infileList) + (colnames, snames) = get_colnames(infileList, filetypeList) + out = [[1.000] * nfiles for i in range(nfiles)] + outTable = [] + for z in range(nfiles): + fileA = infileList[z] + print("fileA is: " + fileA) + for y in range(z+1,nfiles): + fileB = infileList[y] + (data, keylist) = run_jaccard(fileA, fileB, genomefile) + out[z][y] = data[3] + out[y][z] = data[3] + if filetypeList != [""]: + keylist.insert(1, "toolA") + keylist.insert(3, "toolB") + data.insert(1, filetypeList[z]) + data.insert(3, filetypeList[y]) + if len(outTable) == 0: + outTable.append( "\t".join(keylist) ) + outTable.append( "\t".join(data) ) + out2 = pd.DataFrame(out, columns=colnames, index=colnames,dtype="float") + return(outTable, out2, snames) + +def run_jaccard(fileA, fileB, genomefile): + """ Running bedtools. Reads in two bedtools approved file types, sorts the files, + and calculates a jaccard score. 
+ """ + a = BedTool(fileA) + a = a.sort(g=genomefile) + b = BedTool(fileB) + b = b.sort(g=genomefile) + j = a.jaccard(b,g=genomefile) + j["fileA"] = fileA.split("/")[-1] + j["fileB"] = fileB.split("/")[-1] + keylist = list(j.keys()) + keylist.sort() + data = [ str(j[key]) for key in keylist ] + return(data, keylist) + +def get_colnames(infileList, filetypeList): + snames = [ i.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") for i in infileList ] + if filetypeList == [""]: + colnames = snames + else: + colnames = [ snames[i] + "_" + filetypeList[i] for i in range(len(snames)) ] + return(colnames, snames) + +def create_outfile_names(outroot): + """ uses outroot to create the output file names """ + outTableFile = "jaccard.txt" + outPCAFile = "jaccard_PCA.pdf" + outHeatmapFile = "jaccard_heatmap.pdf" + if outroot != "": + if outroot[-1] == "/": + outTableFile= outroot + outTableFile + outPCAFile = outroot + outPCAFile + outHeatmapFile = outroot + outHeatmapFile + else: + outTableFile= outroot + "_" + outTableFile + outPCAFile = outroot + "." + outPCAFile + outHeatmapFile = outroot + "." 
+ outHeatmapFile + return(outTableFile, outPCAFile, outHeatmapFile) + +def pca_plot(out, filetypeList, snames, outPCAFile): + """ creates a 2D PCA plot comparing the files based upon jaccard scores + """ + sklearn_pca = sklearnPCA(n_components=2) + Y_sklearn = sklearn_pca.fit_transform(out) + PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"]) + PCAdata.insert(0,"sample name",snames) + fig, ax =plt.subplots() + snames_pal = sns.hls_palette(len(set(snames)),s=.8) + sns.set_palette(snames_pal) + if filetypeList != [""]: + PCAdata.insert(1,"tool",filetypeList) + ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100) + else: + ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",data=PCAdata,s=100) + ax.axhline(y=0, color='grey', linewidth=1,linestyle="--") + ax.axvline(x=0, color='grey', linewidth=1,linestyle="--") + ax.set(xlabel= "PC1 (" + str(round(100*sklearn_pca.explained_variance_[0],2)) + "%)", + ylabel= "PC2 (" + str(round(100*sklearn_pca.explained_variance_[1],2)) + "%)") + plt.legend(bbox_to_anchor=(1.05, 1), loc=2) + #plt.show() + plt.savefig(outPCAFile, bbox_inches='tight') + plt.close("all") + +def plot_heatmap(out, outHeatmapFile, snames, filetypeList): + snames_pal = sns.hls_palette(len(set(snames)),s=.8) + snames_lut = dict(zip(set(snames), snames_pal)) + snames_cols = pd.Series(snames,index=out.index).map(snames_lut) + if filetypeList != [""]: + tool_pal = sns.cubehelix_palette(len(set(filetypeList))) + tool_lut = dict(zip(set(filetypeList), tool_pal)) + tool_cols = pd.Series(filetypeList,index=out.index).map(tool_lut) + g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False, + row_colors=[snames_cols,tool_cols]) + for label in set(snames): + g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label], + label=label, linewidth=0) + for label in set(filetypeList): + g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label], + label=label, linewidth=0) + g.ax_col_dendrogram.legend(loc="center", ncol=3, + bbox_to_anchor=(0.4, 
0.8)) + else: + g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False, + row_colors=snames_cols) + for label in set(snames): + g.ax_col_dendrogram.bar(0, 0, color=snames_lut[label], + label=label, linewidth=0) + g.ax_col_dendrogram.legend(loc="center", ncol=3, + bbox_to_anchor=(0.5, 0.8)) + #plt.show() + plt.savefig(outHeatmapFile, bbox_inches='tight') + plt.close("all") + +def write_out(out, outFile): + f = open(outFile, 'w') + f.write( "\n".join(out) ) + f.close() + +########################################## +# Main + +def main(): + desc=""" + This function takes a space-delimited list of files (bed, bedgraph, gff, gtf, etc.) + and calculates all possible pairwise jaccard scores. From bedtools: 'Jaccard is the + length of the intersection over the union. Values range from 0 (no intersection) to + 1 (self intersection)'. The columns of the output file are: fileA, fileB, + intersection, jaccard, n_intersections, and union-intersection. + """ + + parser = optparse.OptionParser(description=desc) + + parser.add_option('-i', dest='infiles', default='', help='A space- or semicolon-delimited list of \ +input files for analysis.') + parser.add_option('-t', dest='filetypes', default='', help='A space- or semicolon-delimited list \ +of input file sources/types.') + parser.add_option('-o', dest='outroot', default='', help='The root name of the output files \ +where all the jaccard score information will be saved.') + parser.add_option('-g', dest='genomefile', default='', help='The name of the .genome file.') + + (options,args) = parser.parse_args() + infiles = options.infiles + filetypes = options.filetypes + outroot = options.outroot + genomefile = options.genomefile + + infileList = split_infiles(infiles) + filetypeList = split_infiles(filetypes) + (outTable, out, snames) = loop_jaccard(infileList, genomefile, filetypeList) + (outTableFile, outPCAFile, outHeatmapFile) = create_outfile_names(outroot) + write_out(outTable, outTableFile) + pca_plot(out, filetypeList, 
snames, outPCAFile) + plot_heatmap(out, outHeatmapFile, snames, filetypeList) + +if __name__ == '__main__': + main() + + diff --git a/bin/ppqt_process.py b/bin/ppqt_process.py new file mode 100644 index 0000000..9a2c9b1 --- /dev/null +++ b/bin/ppqt_process.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +#Purpose: To grab the estimated fragment length from the ppqt output and a small txt with that information. For input files, adding an extra value of 200bp as an alternative. +import argparse +parser = argparse.ArgumentParser(description='Script to extract the the estimated fragment length from the ppqt output.') +parser.add_argument('-i', required=True,help='Name of the ppqt txt file') +parser.add_argument('-o', required=True,help='Name of the output file') +args = parser.parse_args() + +output = args.o +inppqt = args.i + +o=open(output,'w') + +file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines())) + + +ppqt_values = file[0][2].split(",") +extenders = [] +for ppqt_value in ppqt_values: + if int(ppqt_value) > 150: + extenders.append(ppqt_value) +if len(extenders) > 0: + o.write(extenders[0]) +else: + o.write("200") +o.close() \ No newline at end of file diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py new file mode 100644 index 0000000..4ba7d64 --- /dev/null +++ b/bin/prep_diffbind.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import json +import argparse + +parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') +parser.add_argument('--g1',dest='group1',required=True,help='Name of the first group') +parser.add_argument('--g2',dest='group2',required=True,help='Name of the second group') +parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') +parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') 
+parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') +parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') +parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') +parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') + +args = parser.parse_args() + +with open("config.json","r") as read_file: + config=json.load(read_file) + +chip2input = config['project']['peaks']['inputs'] +groupdata = config['project']['groups'] +blocks = config['project']['blocks'] + +if None in list(blocks.values()): + samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"])] +else: + samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"])] + + +for condition in args.group1, args.group2: + for chip in groupdata[condition]: + replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" + controlID = chip2input[chip] + if controlID != "": + bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" + else: + bamControl = "" + peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension + if None in list(blocks.values()): + samplesheet.append(",".join([chip, condition, replicate, bamReads, + controlID, bamControl, peaks, args.peakcaller])) + else: + block = blocks[chip] + samplesheet.append(",".join([chip, condition, block, replicate, bamReads, + controlID, bamControl, peaks, args.peakcaller])) + + +f = open(args.csvfile, 'w') +f.write ("\n".join(samplesheet)) +f.close() diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py new file mode 100644 index 
0000000..550b5f9 --- /dev/null +++ b/bin/prep_diffbindQC.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +import json +import argparse + +parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') +parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') +parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') +parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') +parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') +parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') +parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') + +args = parser.parse_args() + +with open("config.json","r") as read_file: + config=json.load(read_file) + +chip2input = config['project']['peaks']['inputs'] +groupdata = config['project']['groups'] + +tmpIDs = [x for xs in groupdata.values() for x in xs] +Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)] + +samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"])] + +count = 1 +for chip in chip2input.keys(): + if set(Ncounts) == {1}: # if all samples only in one group + for key in groupdata.keys(): + if chip in groupdata[key]: + condition = key + replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + else: + condition = "" + replicate = str(count) + count = count +1 + bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" + controlID = chip2input[chip] + if controlID != "": + bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" + else: + bamControl = "" + peaks = args.workpath + "/" + 
args.peaktool + "/" + chip + "/" + chip + args.peakextension + samplesheet.append(",".join([chip, condition, replicate, bamReads, + controlID, bamControl, peaks, args.peakcaller])) + +f = open(args.csvfile, 'w') +f.write ("\n".join(samplesheet)) +f.close() diff --git a/bin/promoterAnnotation_by_Gene.R b/bin/promoterAnnotation_by_Gene.R new file mode 100755 index 0000000..846cfc0 --- /dev/null +++ b/bin/promoterAnnotation_by_Gene.R @@ -0,0 +1,179 @@ +#################### +# +# Name: promoterAnnotationByGene.R +# Created by: Tovah Markowitz, PhD +# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) +# Research Technologies Branch/DIR/NIAID +# +# Created: August 8, 2022 +# Updated: October 26, 2022 to work with uropa 4.0.2 +# Updated: November 3, 2022 to fit with pipeline +# +#################### +# +# Purpose: To take UROPA allhits output files using "TSSprot" conditions and +# create a table of which genes have annotations overlapping their +# promoters and how many times. Output format: dataframe +# +# Details: Promoters will be defined as 3kb upstream to 1 kb downstream of the +# TSS. Allhits files were chosen to capture information from "peaks" +# overlapping multiple promoters. Finalhits files can also be processed +# with this pipeline. This script can handle multiple allhits files as +# long as there are equal numbers of sampleNames to go with them. Also, +# giving a matching DiffBind txt file will allow the allhits file to be +# filtered to only include the significant differential peaks or to +# split the data by the direction of log fold-change.
+# +# Requires: GenomicRanges, tidyr +# +# Function: promoterAnnotationByGene(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) +# +# Variables: +# allhitsFiles: [required] a vector of allhits files to process +# sampleNames: [required] a vector of short names for each allhits file +# to use as column headers +# diffbindFiles: [optional] a vector of diffbind files to use to filter each +# allhits file +# direction: [optional] when filtering using diffbindFiles, define how to +# filter using log fold change. "Both" is default +# when not defined by user. +# Options: "both", "pos", "neg", "separate" +# +# Example usage: +# source("promoterAnnotation_by_Gene.R") +# out1 <- promoterAnnotationByGene(allhitsA.txt, "A") +# out2 <- promoterAnnotationByGene(allhitsA.txt, "A", diffbindA.txt, "both") +# out3 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsB.txt), +# sampleNames=c("A","B"), +# diffbindFiles=c(diffbindA.txt,diffbindB.txt), +# direction="pos") +# out4 <- promoterAnnotationByGene(allhitsFiles= c(allhitsA.txt, allhitsA.txt), +# sampleNames=c("Deseq2","EdgeR"), +# diffbindFiles=c(Deseq2.txt,EdgeR.txt), +# direction="separate") +# +#################### + + +allhits2promoter <- function(allhitsFile) { + # cleaning up the allhits file to only keep information about peaks + # overlapping promoters + inData <- read.delim(allhitsFile) + tmp <- which(inData$name == "query_1") + if (length(tmp) == 0) { + print (paste0("Supplied file ", allhitsFile, " has no peaks overlapping promoters.")) + } else { + promoterData <- inData[tmp,] + promoterData <- promoterData[,c("peak_chr", "peak_start", "peak_end", "gene_id", "gene_name")] + return(promoterData) + } +} + +filterPromoter <- function(Diffbind, promoterData, sampleName) { + # used by DiffbindFilterPromoter + promoterData2 <- GenomicRanges::makeGRangesFromDataFrame(promoterData, seqnames.field="peak_chr", + start.field="peak_start", end.field="peak_end", + starts.in.df.are.0based=F) + Diffbind2 <- 
GenomicRanges::makeGRangesFromDataFrame(Diffbind) + ov <- GenomicRanges::countOverlaps(promoterData2,Diffbind2,type = "equal",maxgap=1) + promoterData3 <- promoterData[which(ov != 0),] + promoterData3$sample_id <- sampleName + return(promoterData3) +} + +DiffbindFilterPromoter <- function(DiffbindFile, promoterData, sampleName, direction) { + # filters the promoter data based upon whether it matches a different peak and what direction the fold-change is + # direction can be: "both", "pos", "neg", "separate". If direction is NA, use "both". + Diffbind <- read.delim(DiffbindFile) + Diffbind <- Diffbind[which(Diffbind$FDR < 0.05),] + if ((direction == "both") | is.na(direction)) { + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + } else if (direction == "pos") { + sampleName <- paste0(sampleName, "_pos") + Diffbind <- Diffbind[which(Diffbind$Fold > 0),] + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + } else if (direction == "neg") { + sampleName <- paste0(sampleName, "_neg") + Diffbind <- Diffbind[which(Diffbind$Fold < 0),] + promoterData2 <- filterPromoter(Diffbind, promoterData, sampleName) + } else { + sampleNameP <- paste0(sampleName, "_pos") + DiffbindP <- Diffbind[which(Diffbind$Fold > 0),] + promoterDataP <- filterPromoter(DiffbindP, promoterData, sampleNameP) + sampleNameN <- paste0(sampleName, "_neg") + DiffbindN <- Diffbind[which(Diffbind$Fold < 0),] + promoterDataN <- filterPromoter(DiffbindN, promoterData, sampleNameN) + promoterData2 <- rbind(promoterDataP, promoterDataN) + } +return(promoterData2) +} + +createPromoterTable <- function(promoterData) { + # making final output table + PromoterTable <- data.frame( table(promoterData[,c("gene_id", "sample_id")] ) ) + PromoterTable2 <- merge( unique(promoterData[,c("gene_id", "gene_name")] ), PromoterTable) + PromoterTable3 <- tidyr::pivot_wider(PromoterTable2, names_from="sample_id", values_from="Freq") + return(PromoterTable3) +} + +promoterAnnotationByGene <- 
function(allhitsFiles, sampleNames, diffbindFiles=NA, direction=NA) { + # the main function + if ( length(allhitsFiles) != length(sampleNames) ) { + print("Number of allhits files and sample names don't match.") + } else { + if ( (length(allhitsFiles) != length(diffbindFiles)) & (sum(is.na(diffbindFiles)) != 1) ) { + print("Number of allhits files and diffbind files don't match.") + } else { + if ( length(allhitsFiles) == 1 ) { + promoterData <- allhits2promoter(allhitsFiles) + if (is.na(diffbindFiles)) { + promoterData$sample_id <- sampleNames + } else { + promoterData <- DiffbindFilterPromoter(diffbindFiles, promoterData, sampleNames, direction) + } + } else { + for ( a in 1:length(allhitsFiles) ) { + print(a) + tmpA <- allhits2promoter(allhitsFiles[a]) + if (sum(is.na(diffbindFiles)) ==1) { + tmpA$sample_id <- sampleNames[a] + } else { + tmpA <- DiffbindFilterPromoter(diffbindFiles[a], tmpA, sampleNames[a], direction) + } + if (a == 1) { + promoterData <- tmpA + } else { + promoterData <- rbind(promoterData, tmpA) + } + } + } + } + promoterTable <- createPromoterTable(promoterData) + return(promoterTable) + } +} + +peakcallVersion <- function(inFolder,outFile) { +# currently only works for macs outputs +# inFolder here is the folder where the uropa output files are located + filesA <- list.files(path=inFolder,pattern="allhits.txt") + samples <- matrix(unlist(strsplit(filesA,"_macs")),ncol=2,byrow=T)[,1] + filesA <- list.files(path=inFolder,pattern="allhits.txt",full.names = T) + promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesA, sampleNames=samples) + write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) +} + +diffbindVersion <- function(inFolder,outFile) { +# currently designed for macs peaks, analyzed by deseq2 +# analyzing both positive and negative together for now +# inFolder here is the root working directory for the project + uropaFolder <- paste0(inFolder, "/UROPA_annotations/DiffBind") + diffbindFolder <- paste0(inFolder, 
"/DiffBind") + filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt") + samples <- matrix(unlist(strsplit(filesU,"-macs")),ncol=2,byrow=T)[,1] + filesU <- list.files(path=uropaFolder, pattern="DiffbindDeseq2_uropa_protTSS_allhits.txt",full.names=T) + filesD <- list.files(path=diffbindFolder, pattern="Deseq2.txt",full.names=T,recursive=T) + promoterInfo <- promoterAnnotationByGene(allhitsFiles=filesU, + sampleNames=samples, diffbindFiles=filesD, direction="both") + write.table(promoterInfo, outFile, quote=F,sep="\t",row.names=F) +} diff --git a/bin/significantPathways.R b/bin/significantPathways.R new file mode 100755 index 0000000..dd9af36 --- /dev/null +++ b/bin/significantPathways.R @@ -0,0 +1,127 @@ +#################### +# +# Name: significantPathways.R +# Created by: Tovah Markowitz, PhD +# Bioinformatics (NCBR)/ Integrated Data Sciences Section (IDSS) +# Research Technologies Branch/DIR/NIAID +# +# Created: August 9, 2022 +# Updated: October 28, 2022 to make reactomePA optional +# Updated: November 4, 2022 to accept a txt file or a gtf for the background genes +# also to accept a promoter annotation table and analyze every column +# +#################### +# +# Purpose: To take a list of genes and find the significant KEGG or Reactome +# pathways using overenrichment analysis. See details for specialized functionality. +# +# Requires: clusterProfiler, ReactomePA, enrichplot, org.Hs.eg.db, rtracklayer, ggplot2, and ggprism +# +# Details: Takes input gene lists as Ensembl gene IDs or gene symbols, converts to +# Entrez gene IDs, and runs ORA against KEGG or Reactome database. Requires +# a background gene list as cfChIP currently ignores chrs X, Y, and M. +# Outputs a dataframe of significant pathways, a pdf of the top most +# significant pathways, or a pdf of just the pathways of interest (if significant). 
+# +# Function: significantPathways(Genes, bkgGeneTXT, database="KEGG", PDFfile=NA, pathwayVector=NA) +# +# Variables: +# Genes: [Required] a vector of the genes to be analyzed through ORA +# bkgGeneFILE: [Required] a txt file containing a column of Ensembl IDs listing +# the appropriate background gene set or the gtf file used for the uropa +# annotations +# For example: hg19.ensembl.prot_coding.with_annotations.txt +# database: [Optional] whether to compare to the KEGG or Reactome database +# default: KEGG +# PDFfile: [Optional] name of the PDF file to create, if empty no PDF will be made +# pathwayVector: [Optional] a vector of pathways (descriptions or IDs) to plot in the pdf. +# If PDFfile is empty, it is ignored. If this is empty and PDFfile is not, +# pdf plot will be of the top 30 most significant pathways instead. +# +# Example usage: +# source("significantPathways.R") +# out <- significantPathways(Genes= c("GeneA","GeneB"), +# bkgGeneFILE= "hg19.ensembl.prot_coding.with_annotations.txt", +# database="KEGG", PDFfile="a.pdf", +# pathwayVector=c("pathwayA", "pathwayB")) +# +#################### + +library(clusterProfiler) +library(enrichplot) +library(ggplot2) +library(ggprism) + +makeBarplotTop <- function(inData, titleName, PDFfile) { + inDataCount <- sum(inData@result$p.adjust < 0.1) + if (inDataCount > 30) { inDataCount = 30 } + if (inDataCount > 0) { + pdf(PDFfile) + print(barplot(inData, showCategory = inDataCount, + label_format=70, title=titleName, x="GeneRatio") + + theme_prism(base_size =8) + theme(legend.title = element_text()) ) + } + dev.off() +} + +makeBarPlotSelect <- function(inData, titleName, PDFfile, categories) { + pdf(PDFfile) + print(barplot(inData, showCategory = categories, + label_format=70, title=titleName, x="GeneRatio") + + theme_prism(base_size = 8) + theme(legend.title = element_text()) ) + dev.off() + } + +processGenes <- function(geneIDs) { + if (grepl("^ENSG", geneIDs[1])) { + ensIDs <- gsub("\\.[0-9]+", "", geneIDs, perl=T) 
+ entrezIDs <- bitr(ensIDs, from= "ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db") + } else { + entrezIDs <- bitr(ensIDs, from= "SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db") + } + entrezIDs <- entrezIDs$ENTREZID + return(entrezIDs) +} + +significantPathways <- function(Genes, bkgGeneFILE, database="KEGG", PDFfile=NA, pathwayVector=NA) { + sigGenes <- processGenes(Genes) + if (grepl("gtf",bkgGeneFILE)) { + bkgGenesData <- rtracklayer::import(bkgGeneFILE) + bkgGenes <- unique(bkgGenesData$gene_id) + } else { + bkgGenesData <- read.delim(bkgGeneFILE) + bkgGenes <- bkgGenesData[,grep("^ENSG", bkgGenesData[1,])] + } + backgroundGenes <- processGenes(bkgGenes) + if (database == "KEGG") { + pathwaySig <- enrichKEGG(sigGenes, organism= "hsa", keyType="kegg", universe=backgroundGenes, use_internal_data=TRUE) + pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),] + } else { + library(ReactomePA) + pathwaySig <- enrichPathway(sigGenes, readable=T, universe=backgroundGenes) + pathwayData <- pathwaySig@result[which(pathwaySig@result$p.adjust < 0.1),] + } + if (!is.na(PDFfile)) { + if(length(pathwayVector) != 1) { + if (length(grep("HSA", pathwayVector, ignore.case=T)) != 0) { + pathwayVector <- pathwayData$Description[which(pathwayData$ID %in% pathwayVector)] + } + makeBarPlotSelect(inData=pathwaySig, titleName=database, PDFfile=PDFfile, categories=pathwayVector) + } else { + makeBarplotTop(inData=pathwaySig, titleName=database, PDFfile=PDFfile) + } + } + return(pathwayData) +} + +promoterAnnotationWrapper <- function(promoterFile, bkgGeneFILE, database="KEGG") { + promoterData <- read.delim(promoterFile) + outFolder <- dirname(promoterFile) + for (i in 3:ncol(promoterData)) { + colName <- names(promoterData)[i] + Genes <- promoterData$gene_id[which(promoterData[,i] > 0)] + outData <- significantPathways(Genes, bkgGeneFILE, database) + outFileName <- paste0(outFolder,"/",colName,"_",database,".txt") + write.table(outData, outFileName, quote=F, 
row.names=F, sep="\t") + } +} diff --git a/workflow/Snakefile b/workflow/Snakefile index 6d1bce7..240f016 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -14,6 +14,7 @@ configfile: "config.json" # Global workflow variables today = str(datetime.datetime.today()).split()[0].replace('-', '') # YYYYMMDD samples = config['samples'] +bin_path = config['project']['binpath'] workpath = config['project']['workpath'] assay = config['options']['assay'] paired_end = False if config['project']['nends'] == 1 else True diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 672a05b..4730054 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -5,117 +5,128 @@ # ~~ workflow configuration workpath = config['project']['workpath'] +bin_path = config['project']['binpath'] genome = config['options']['genome'] blocks = config['project']['blocks'] groupdata = config['project']['groups'] - -# ~~ directories +# Directory end points +bam_dir = join(workpath, "bam") +cfTool_dir = join(workpath, "cfChIPtool") +cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") +qc_dir = join(workpath, "QC") rule cfChIPtool: input: - out5=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), + out5 = join(bam_dir, "{name}.Q5DD.bam.idxstat"), output: - out1=join(workpath,cfTool_subdir2,"{name}.Q5DD.tagAlign.gz"), - out2=join(workpath,cfTool_dir,"Output","H3K4me3","Signatures","{name}.Q5DD.csv"), - out3=join(workpath,cfTool_dir,"Samples","H3K4me3","{name}.Q5DD.rdata"), + out1 = join(cfTool_subdir2, "{name}.Q5DD.tagAlign.gz"), + out2 = join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), + out3 = join(cfTool_dir, "Samples", "H3K4me3", "{name}.Q5DD.rdata"), params: - rname='cfChiP', - rver="R/4.1.0", - toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'], - tmpfile = lambda w: join(workpath,cfTool_subdir2, w.name + ".Q5DD.tagAlign"), - tag=lambda w: temp(join(workpath,bam_dir, w.name+".Q5DD_tagAlign")) + rname = 'cfChiP', + rver = "R/4.1.0", + 
toolkit = config['references'][genome]['cfChIP_TOOLS_SRC'], + tmpfile = lambda w: join(cfTool_subdir2, w.name + ".Q5DD.tagAlign"), + tag = lambda w: temp(join(bam_dir, w.name + ".Q5DD_tagAlign")) container: config['images']['cfchip'] - shell: """ - cp {params.tag} {params.tmpfile} - gzip {params.tmpfile} + shell: + """ + cp {params.tag} {params.tmpfile} + gzip {params.tmpfile} - Rscript {params.toolkit}/ProcessBEDFiles.R \\ - -a {params.toolkit}/SetupFiles/H3K4me3 \\ - -r {cfTool_dir} \\ - -p {cfTool_dir} \\ - -m H3K4me3 \\ - -S {output.out1} - """ + Rscript {params.toolkit}/ProcessBEDFiles.R \\ + -a {params.toolkit}/SetupFiles/H3K4me3 \\ + -r {cfTool_dir} \\ + -p {cfTool_dir} \\ + -m H3K4me3 \\ + -S {output.out1} + """ rule cfChIPcompile: input: expand(join(cfTool_dir, "Output", "H3K4me3", "Signatures", "{name}.Q5DD.csv"), name=chips) output: - txt=join(workpath,"QC","H3K4me3_cfChIP_signature.txt"), - pdf=join(workpath,"QC","H3K4me3_cfChIP_signature.pdf") + txt = join(qc_dir, "H3K4me3_cfChIP_signature.txt"), + pdf = join(qc_dir, "H3K4me3_cfChIP_signature.pdf") params: - rname="cfChIP2", - script=join(workpath,"workflow","scripts","cfChIP_signatures.R"), - infolder=join(workpath,cfTool_dir,"Output","H3K4me3","Signatures"), + rname = "cfChIP2", + script = join(bin_path, "cfChIP_signatures.R"), + infolder = join(cfTool_dir, "Output", "H3K4me3", "Signatures"), container: config['images']['cfchip'] - shell: """ - Rscript -e "source('{params.script}'); mergeSignatures( '{params.infolder}', '{output.txt}' )"; - Rscript -e "source('{params.script}'); plotSignatures( '{output.txt}', '{output.pdf}' )"; - """ + shell: + """ + Rscript -e "source('{params.script}'); mergeSignatures( '{params.infolder}', '{output.txt}' )"; + Rscript -e "source('{params.script}'); plotSignatures( '{output.txt}', '{output.pdf}' )"; + """ + rule promoterTable1: input: - expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_protTSS_allhits.txt'),PeakTool=PeakTools,name=chips), + 
expand(join(workpath,uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_protTSS_allhits.txt'), PeakTool=PeakTools, name=chips), output: - txt=join(workpath,uropa_dir,"promoterTable1",'{PeakTool}_promoter_overlap_summaryTable.txt') + txt = join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt") params: - rname="promoter1", - script=join(workpath,"workflow","scripts","promoterAnnotation_by_Gene.R"), - infolder= join(workpath,uropa_dir, '{PeakTool}') + rname = "promoter1", + script = join(bin_path, "promoterAnnotation_by_Gene.R"), + infolder = join(uropa_dir, '{PeakTool}') container: config['images']['cfchip'] - shell: """ - Rscript -e "source('{params.script}'); peakcallVersion('{params.infolder}','{output.txt}')"; - """ + shell: + """ + Rscript -e "source('{params.script}'); peakcallVersion('{params.infolder}','{output.txt}')"; + """ + rule promoterTable2: input: - expand(join(workpath,uropa_dir,diffbind_dir,'{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), + expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), output: - txt=join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), + txt = join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), params: - rname="promoter2", - script1=join(workpath,"workflow","scripts","promoterAnnotation_by_Gene.R"), - script2=join(workpath,"workflow","scripts","significantPathways.R"), - infolder= workpath, - gtf = config['references'][genome]['GTFFILE'], + rname = "promoter2", + script1 = join(bin_path, "promoterAnnotation_by_Gene.R"), + script2 = join(bin_path, "significantPathways.R"), + infolder = workpath, + gtf = config['references'][genome]['GTFFILE'], container: config['images']['cfchip'] - shell: """ - Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}','{output.txt}')"; - Rscript -e "source('{params.script2}'); 
promoterAnnotationWrapper('{output.txt}','{params.gtf}','KEGG')"; - Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')"; - """ + shell: + """ + Rscript -e "source('{params.script1}'); diffbindVersion('{params.infolder}','{output.txt}')"; + Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','KEGG')"; + Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')"; + """ rule diffbindQC: input: - lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] + lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: - html = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"), - bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), + html = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC.html"), + bed = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), params: - rname="diffbindQC", - rscript=join(workpath,"workflow","scripts","DiffBind_v2_cfChIP_QC.Rmd"), - outdir = join(workpath, "QC", "AllSamples-{PeakTool}"), - contrast = "AllSamples", - csvfile = join(workpath, "QC", "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"), - pythonscript = join(workpath,"workflow","scripts","prep_diffbindQC.py"), - PeakExtension= lambda w: PeakExtensions[w.PeakTool], - PeakTool="{PeakTool}", - peakcaller= lambda w: FileTypesDiffBind[w.PeakTool], + rname = "diffbindQC", + contrast = "AllSamples", + PeakTool = "{PeakTool}", + rscript = join(bin_path, "DiffBind_v2_cfChIP_QC.Rmd"), + outdir = join(qc_dir, "AllSamples-{PeakTool}"), + csvfile = join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBind_prep.csv"), + pythonscript = join(bin_path, "prep_diffbindQC.py"), + 
PeakExtension = lambda w: PeakExtensions[w.PeakTool], + peakcaller = lambda w: FileTypesDiffBind[w.PeakTool], container: config['images']['cfchip'] - shell: """ - python {params.pythonscript} --wp {workpath} \ - --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \ - --pc {params.peakcaller} --csv {params.csvfile} - cp {params.rscript} {params.outdir} - cd {params.outdir} - Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))' - """ \ No newline at end of file + shell: + """ + python {params.pythonscript} --wp {workpath} \ + --pt {params.PeakTool} --pe {params.PeakExtension} --bd {bam_dir} \ + --pc {params.peakcaller} --csv {params.csvfile} + cp {params.rscript} {params.outdir} + cd {params.outdir} + Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", + params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))' + """ \ No newline at end of file diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 894c0f4..b39042f 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -8,6 +8,7 @@ from scripts.common import get_bam_ext, get_fqscreen_outputs # ~~ workflow configuration workpath = config['project']['workpath'] +bin_path = config['project']['binpath'] genome = config['options']['genome'] paired_end = False if config['project']['nends'] == 1 else True samples = config['samples'] @@ -17,6 +18,7 @@ ends = [1] if not paired_end else [1, 2] qc_dir = join(workpath, "QC") kraken_dir = join(workpath, 'kraken') deeptools_dir = join(workpath, 'deeptools') +peakqc_dir = join(workpath, "PeakQC") extra_fingerprint_dir = join(deeptools_dir, 'sorted_fingerprint') @@ -387,26 +389,28 @@ rule FRiP: output: join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), params: - rname="frip", - outroot = lambda w: 
join(workpath,"PeakQC",w.PeakTool), - script=join(workpath,"workflow","scripts","frip.py"), - genome = config['references'][genome]['REFLEN'], - tmpdir = tmpdir, - container: config['images']['python'] - shell: """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + rname = "frip", + outroot = lambda w: join(peakqc_dir, w.PeakTool), + script = join(bin_path, "frip.py"), + genome = config['references'][genome]['REFLEN'], + tmpdir = tmpdir, + container: + config['images']['python'] + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - python {params.script} \\ - -p {input.bed} \\ - -b {input.bam} \\ - -g {params.genome} \\ - -o {params.outroot} - """ + python {params.script} \\ + -p {input.bed} \\ + -b {input.bam} \\ + -g {params.genome} \\ + -o {params.outroot} + """ rule jaccard: input: @@ -414,15 +418,17 @@ rule jaccard: output: join(qc_dir, '{PeakTool}_jaccard.txt'), params: - rname="jaccard", - outroot = lambda w: join(qc_dir, w.PeakTool), - script=join(workpath,"workflow","scripts","jaccard_score.py"), - genome = config['references'][genome]['REFLEN'] + rname = "frip", + rname = "jaccard", + outroot = lambda w: join(qc_dir, w.PeakTool), + script = join(bin_path, "jaccard_score.py"), + genome = config['references'][genome]['REFLEN'] envmodules: config['tools']['BEDTOOLSVER'] - shell: """ - python {params.script} \\ - -i "{input}" \\ - -o "{params.outroot}" \\ - -g {params.genome} - """ + shell: + """ + python {params.script} \\ + -i "{input}" \\ + -o "{params.outroot}" \\ + -g {params.genome} + """ diff --git a/workflow/scripts/common.py 
b/workflow/scripts/common.py index 26f44b6..6feba1c 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -258,14 +258,4 @@ def get_fqscreen_outputs(paired_end, samples, qc_dir): outs.extend(expand(join(qc_dir, "FQscreen", "{name}.R1.trim_screen.png"), name=samples)), outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.txt"), name=samples)), outs.extend(expand(join(qc_dir, "FQscreen2", "{name}.R1.trim_screen.png"), name=samples)), - return outs - - -def test_combine(one, two): - try: - three = one + two - except: - print(one) - print(two) - exit() - return three \ No newline at end of file + return outs \ No newline at end of file From 07eb85fde045073c3b6563a08d60b306563d71cd Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 10 Jul 2024 11:39:00 -0400 Subject: [PATCH 05/28] fix: remove old imports, duplicate parameters --- workflow/rules/dba.smk | 2 +- workflow/rules/qc.smk | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index d767f57..e20c48a 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -3,7 +3,7 @@ import os import json from os.path import join -from scripts.common import allocated, mk_dir_if_not_exist, test_combine +from scripts.common import allocated, mk_dir_if_not_exist from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction from scripts.blocking import test_for_block diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index b39042f..26861ef 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -366,21 +366,23 @@ rule deeptools_QC: input: [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs output: - heatmap=join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), - pca=join(deeptools_dir, "pca.Q5DD.pdf"), - npz=temp(join(deeptools_dir, "Q5DD.npz")), - png=join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") 
+ javaram = '16g', + heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), + pca = join(deeptools_dir, "pca.Q5DD.pdf"), + npz = temp(join(deeptools_dir, "Q5DD.npz")), + png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") params: - rname="deeptools_QC", - deeptoolsver=config['tools']['DEEPTOOLSVER'], + rname = "deeptools_QC", + deeptoolsver = config['tools']['DEEPTOOLSVER'], labels=samples # this should be the sample names to match the bigwigs in the same order - shell: """ - module load {params.deeptoolsver} - multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz} - plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers - plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers - plotPCA -in {output.npz} -o {output.pca} - """ + shell: + """ + module load {params.deeptoolsver} + multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz} + plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers + plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers + plotPCA -in {output.npz} -o {output.pca} + """ rule FRiP: input: @@ -418,7 +420,6 @@ rule jaccard: output: join(qc_dir, '{PeakTool}_jaccard.txt'), params: - rname = "frip", rname = "jaccard", outroot = lambda w: join(qc_dir, w.PeakTool), script = join(bin_path, "jaccard_score.py"), From 83b0e5c68127bd7b42d530d0cf3bd1c2448f2641 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 10 Jul 2024 12:55:41 -0400 Subject: [PATCH 06/28] fix: correct typo type to _type --- workflow/Snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 240f016..dc011b8 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -94,7 +94,7 @@ if assay == "cfchip": group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) 
rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], name=contrasts, _type=["protTSS"] )) @@ -121,7 +121,7 @@ if assay == "cfchip": )) if reps: rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=PeakTools, name=chips, _type=peak_types )) rule_all_ins.extend(expand( @@ -129,7 +129,7 @@ if assay == "cfchip": group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{type}_allhits.txt"), + join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, _type=peak_types From 543121be62664525af9a0d58ea8f1ec20add6717 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 10 Jul 2024 17:35:01 -0400 Subject: [PATCH 07/28] fix: correct pathing on diffbind outputs --- workflow/Snakefile | 8 +++++--- workflow/rules/dba.smk | 16 +++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index dc011b8..e025c45 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -49,6 +49,7 @@ macsB_dir = join(workpath, "macsBroad") sicer_dir = join(workpath, "sicer") peakqc_dir = join(workpath, "PeakQC") uropa_dir = join(workpath, "UROPA_annotations") +uropa_diffbind_dir = join(uropa_dir, "DiffBind") diffbind_dir = join(workpath, "DiffBind") cfTool_dir = join(workpath, "cfChIPtool") genrich_dir = join(workpath, "Genrich") @@ -94,10 +95,11 @@ if assay == "cfchip": group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], - 
name=contrasts, _type=["protTSS"] + join(uropa_diffbind_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + name=contrasts, + _type=["protTSS"] )) - elif assay in ["atac", "chip"]: peak_types.extend(["prot", "protSEC", "genes"]) rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index e20c48a..eb8d741 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -23,6 +23,8 @@ bin_path = join(workpath, "workflow", "bin") diffbind_dir_block = join(workpath, "DiffBindBlock") diffbind_dir2 = join(workpath, "DiffBind_block") diffbind_dir = join(workpath, "DiffBind") +uropa_dir = join(workpath, "UROPA_annotations") +uropa_diffbind_dir = join(uropa_dir, "DiffBind") bam_dir = join(workpath, "bam") qc_dir = join(workpath, "PeakQC") idr_dir = join(workpath, "IDR") @@ -93,11 +95,11 @@ rule diffbind: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), - EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), - EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), - Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), - EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), + Deseq2 = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), + EdgeR = 
join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), + EdgeR_txt = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), + Deseq2_txt = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), + EdgeR_ftxt = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) params: @@ -141,11 +143,11 @@ rule diffbind: cp {params.rscript} {params.outdir} cd {params.outdir} Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.this_peaktool}"))' + params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}"))' if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi if [ ! 
-f {output.EdgeR} ]; then touch {output.EdgeR}; fi - if [ '{params.blocking}' == True ]; then + if [ '"""+str(blocking)+"""' == True ]; then echo "DiffBind with Blocking" Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))' From e5027d8b4de61df2fb05f5b4cceefeecb521c35b Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 10:53:28 -0400 Subject: [PATCH 08/28] fix: missing imports in peakcall rules --- workflow/rules/peakcall.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index 466d639..1fce9b2 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -4,7 +4,8 @@ # Common quality-control rules: preseq, NRF, rawfastqc, # fastqc, fastq_screen, multiQC from os.path import join -from scripts.peakcall import get_control_input, getMacTXT, getMacChip +from scripts.peakcall import get_control_input, getMacTXT, getMacChip, + getSicerChips, getSicerFragLen, get_control_input # ~~ workflow configuration From 0e7ba2ec1a12e166fb4a738d42ba1598c2305bfe Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 10:58:26 -0400 Subject: [PATCH 09/28] fix: indent --- workflow/rules/peakcall.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index 1fce9b2..cdc1762 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -4,7 +4,7 @@ # Common quality-control rules: preseq, NRF, rawfastqc, # fastqc, fastq_screen, multiQC from os.path import join -from scripts.peakcall import get_control_input, getMacTXT, getMacChip, +from scripts.peakcall import get_control_input, getMacTXT, getMacChip, \ getSicerChips, getSicerFragLen, get_control_input From 
8f6cedd3c7800a064715dc1adfe70683667eddc5 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 11:11:49 -0400 Subject: [PATCH 10/28] fix: fix single end functionality in bwa rule --- workflow/rules/trim_align_dedup.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 0f1b458..8eae5cd 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -42,7 +42,7 @@ rule trim: """ input: file1 = join(workpath, "{name}.R1.fastq.gz"), - file2 = provided(join(workpath,"{name}.R2.fastq.gz"), paired_end) + file2 = provided(join(workpath, "{name}.R2.fastq.gz"), paired_end) output: outfq1 = temp(join(trim_dir, "{name}.R1.trim.fastq.gz")), outfq2 = provided(temp(join(trim_dir, "{name}.R2.trim.fastq.gz")), paired_end) @@ -171,7 +171,7 @@ rule BWA: """ input: infq1 = join(trim_dir, "{name}.R1.trim.fastq.gz"), - infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz"), + infq2 = join(trim_dir, "{name}.R2.trim.fastq.gz") if paired_end else [], params: d = join(bam_dir), rname = 'bwa', From c4e32cdd5e2e1cd57e59f9e659f468e34b287d99 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 13:59:52 -0400 Subject: [PATCH 11/28] chore: spacing, refactor manorm rule --- workflow/rules/cfChIP.smk | 2 + workflow/rules/dba.smk | 79 ++--- workflow/rules/peakcall.smk | 470 ++++++++++++++-------------- workflow/rules/qc.smk | 88 +++--- workflow/rules/trim_align_dedup.smk | 13 +- workflow/scripts/peakcall.py | 9 +- 6 files changed, 350 insertions(+), 311 deletions(-) diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 4730054..665ef2f 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -10,6 +10,7 @@ genome = config['options']['genome'] blocks = config['project']['blocks'] groupdata = config['project']['groups'] + # Directory end points bam_dir = join(workpath, "bam") cfTool_dir = join(workpath, 
"cfChIPtool") @@ -102,6 +103,7 @@ rule promoterTable2: Rscript -e "source('{params.script2}'); promoterAnnotationWrapper('{output.txt}','{params.gtf}','Reactome')"; """ + rule diffbindQC: input: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index eb8d741..80265a4 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -4,7 +4,7 @@ import os import json from os.path import join from scripts.common import allocated, mk_dir_if_not_exist -from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction +from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction, get_manorm_sizes from scripts.blocking import test_for_block @@ -214,7 +214,7 @@ rule UROPA: this_q['distance'] = _d json_construct['queries'].append(this_q) elif '{type}' == 'protSEC': - # distance, feature.anchor + # distance, feature.anchor query_values = ( ([3000, 1000], "start"), (3000, "end"), @@ -239,39 +239,46 @@ rule UROPA: rule manorm: - input: - bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), - bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), - ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"), - peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), - peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), + input: + bam1 = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), + bam2 = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), + ppqt = join(bam_dir, "Q5DD.ppqt.txt"), + peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), + peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), output: - xls = 
join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - bed = temp(join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")), - wigA = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"), - wigM = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"), - wigP = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"), + xls = join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + bed = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")), + wigA = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"), + wigM = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"), + wigP = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"), params: - rname='manorm', - fldr = join(workpath,manorm_dir,"{group1}_vs_{group2}-{tool}"), - bedtoolsver=config['tools']['BEDTOOLSVER'], - sample1= lambda w: groupdata[w.group1][0], - sample2= lambda w: groupdata[w.group2][0], - manormver="manorm/1.1.4" - run: - commoncmd1 = "if [ ! 
-e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi " - commoncmd2 = "cd /lscratch/$SLURM_JOBID; " - commoncmd3 = "module load {params.manormver}; module load {params.bedtoolsver}; " - cmd1 = "bamToBed -i {input.bam1} > bam1.bed; " - cmd2 = "bamToBed -i {input.bam2} > bam2.bed; " - cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; " - cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; " - file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0] - extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0] - cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1 + " --s2 " + extsize2 + " -o {params.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; " - cmd6 = "gzip {params.fldr}/output_tracks/*wig; " - cmd7 = "mv {params.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; " - cmd8 = "tail -n +2 {output.xls} | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,$9$1,$6}}' > {output.bed}" - shell(commoncmd1) - shell( commoncmd2 + commoncmd3 + cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6 + cmd7 + cmd8 ) \ No newline at end of file + rname = 'manorm', + fldr = join(manorm_dir, "{group1}_vs_{group2}-{tool}"), + bedtoolsver = config['tools']['BEDTOOLSVER'], + manormver = "manorm/1.1.4" + extsizes = lambda w, _in: get_manorm_sizes(w.group1, w.group2, groupdata, _in.ppqt) + shell: + """ + if [ ! 
-e /lscratch/$SLURM_JOBID ]; then + mkdir /lscratch/$SLURM_JOBID + fi + cd /lscratch/$SLURM_JOBID + module load {params.manormver} + module load {params.bedtoolsver} + bamToBed -i {input.bam1} > bam1.bed + bamToBed -i {input.bam2} > bam2.bed + cut -f 1,2,3 {input.peak1} > peak1.bed + cut -f 1,2,3 {input.peak2} > peak2.bed + manorm \ + --p1 peak1.bed \ + --p2 peak2.bed \ + --r1 bam1.bed \ + --r2 bam2.bed \ + {params.extsizes} \ + -o {params.fldr} \ + --name1 {wildcards.group1} \ + --name2 {wildcards.group2} + gzip {params.fldr}/output_tracks/*wig + mv {params.fldr}/{wildcards.group1}_vs_{wildcards.group2}_all_MAvalues.xls {output.xls} + tail -n +2 {output.xls} | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,$9$1,$6}}' > {output.bed} + """ \ No newline at end of file diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index cdc1762..f7953ee 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -13,6 +13,7 @@ workpath = config['project']['workpath'] genome = config['options']['genome'] paired_end = False if config['project']['nends'] == 1 else True chip2input = config['project']['peaks']['inputs'] +tmpdir = config['options']['tmp_dir'] # Directory end points bam_dir = join(workpath, "bam") @@ -37,16 +38,19 @@ rule sortByRead: output: temp(join(bam_dir, "{name}.sortedByRead.bam")) params: - rname="sortByRead", - samtools=config['tools']['SAMTOOLSVER'], - mem=allocated("mem", "sortByRead", cluster) - threads: int(allocated("threads", "sortByRead", cluster)) - shell: """ - module load {params.samtools} - samtools sort {input} -n \\ - -@ {threads} \\ - -o {output} - """ + rname = "sortByRead", + samtools = config['tools']['SAMTOOLSVER'], + mem = allocated("mem", "sortByRead", cluster) + threads: + int(allocated("threads", "sortByRead", cluster)) + shell: + """ + module load {params.samtools} + samtools sort {input} -n \\ + -@ {threads} \\ + -o {output} + """ + rule genrich: """ @@ -65,253 +69,257 @@ rule genrich: output: 
join(genrich_dir, "{name}", "{name}.narrowPeak") params: - rname="genrich", - genrich_ver=config['tools']['GENRICHVER'] - shell: """ - module load {params.genrich_ver} - Genrich \\ - -t {input} \\ - -o {output} \\ - -j \\ - -y \\ - -r \\ - -v \\ - -d 150 \\ - -m 5 \\ - -e chrM,chrY - """ + rname = "genrich", + genrich_ver = config['tools']['GENRICHVER'] + shell: + """ + module load {params.genrich_ver} + Genrich \\ + -t {input} \\ + -o {output} \\ + -j \\ + -y \\ + -r \\ + -v \\ + -d 150 \\ + -m 5 \\ + -e chrM,chrY + """ + -# INDIVIDUAL RULES rule MACS2_narrow: input: - chip = lambda w: getMacChip(bam_dir, w.name, paired_end), - txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + chip = lambda w: getMacChip(bam_dir, w.name, paired_end), + txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: join(macsN_dir, "{name}", "{name}_peaks.narrowPeak"), params: - rname='MACS2_narrow', - gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver=config['tools']['MACSVER'], - paired_end = paired_end, - flag= lambda w: "-c" if chip2input[w.name] else "" - shell: """ - module load {params.macsver}; - if [ '{params.paired_end}' == True ]; then - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - else - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsN_dir}/{wildcards.name} \\ - -q 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - fi - """ + rname = 'MACS2_narrow', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: 
"-c" if chip2input[w.name] else "" + shell: + """ + module load {params.macsver}; + if [ '{params.paired_end}' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsN_dir}/{wildcards.name} \\ + -q 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi + """ + rule MACS2_broad: input: - chip = lambda w: getMacChip(bam_dir, w.name, paired_end), - txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + chip = lambda w: getMacChip(bam_dir, w.name, paired_end), + txt = lambda w: getMacTXT(ppqt_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), params: - rname='MACS2_broad', - gsize=config['references'][genome]['EFFECTIVEGENOMESIZE'], - macsver=config['tools']['MACSVER'], - paired_end = paired_end, - flag= lambda w: "-c" if chip2input[w.name] else "" - shell: """ - module load {params.macsver}; - if [ '{params.paired_end}' == True ]; then - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - -f "BAMPE" - else - ppqt_len=$(awk '{{print $1}}' {input.txt}) - macs2 callpeak \\ - -t {input.chip} {params.flag} {input.c_option} \\ - -g {params.gsize} \\ - -n {wildcards.name} \\ - --outdir {macsB_dir}/{wildcards.name} \\ - --broad \\ - --broad-cutoff 0.01 \\ - --keep-dup="all" \\ - --nomodel \\ - --extsize $ppqt_len - fi - """ + rname = 
'MACS2_broad', + gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], + macsver = config['tools']['MACSVER'], + flag = lambda w: "-c" if chip2input[w.name] else "" + shell: + """ + module load {params.macsver}; + if [ '{params.paired_end}' == True ]; then + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + -f "BAMPE" + else + ppqt_len=$(awk '{{print $1}}' {input.txt}) + macs2 callpeak \\ + -t {input.chip} {params.flag} {input.c_option} \\ + -g {params.gsize} \\ + -n {wildcards.name} \\ + --outdir {macsB_dir}/{wildcards.name} \\ + --broad \\ + --broad-cutoff 0.01 \\ + --keep-dup="all" \\ + --nomodel \\ + --extsize $ppqt_len + fi + """ + rule SICER: - input: - chip = lambda w: getSicerChips(bam_dir, w.name, paired_end), - fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), - c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), + input: + chip = lambda w: getSicerChips(bam_dir, w.name, paired_end), + fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), + c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: - bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), params: - rname='SICER', - sicerver=config['tools']['SICERVER'], - bedtoolsver=config['tools']['BEDTOOLSVER'], - genomever = config['options']['genome'], - name="{name}", - sicer_dir=join(sicer_dir,"{name}"), - tmpdir=tmpdir, - paired_end = paired_end, - frac=config['references'][genome]['FRAC'], - flag= lambda w: "-c" if chip2input[w.name] else "", - shell: """ - module load {params.sicerver} - module load {params.bedtoolsver} - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + rname = 'SICER', + name = "{name}", + sicerver = config['tools']['SICERVER'], + bedtoolsver = config['tools']['BEDTOOLSVER'], + genomever = config['options']['genome'], + this_sicer_dir = join(sicer_dir,"{name}"), + frac = config['references'][genome]['FRAC'], + flag = lambda w: "-c" if chip2input[w.name] else "", + shell: + """ + module load {params.sicerver} + module load {params.bedtoolsver} + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - if [ '{params.paired_end}' == True ]; then - MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) - mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) - else - mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) - fi - echo "printing out value of mean-insert-size ${{mean_insert_size}}" - a={input.c_option} - echo "Printing input.c_option ${{a}}" - if [ '{params.paired_end}' == True ]; then - if [ -f "{input.c_option}" ]; then - # Copying input to tmpdir due to SICER2 - # bam2bed file conversion, if more than - # one sample shares the same IP sample - # than a race condition can occur where - # two jobs can concurrent try to write - # to the same BED file (during bedtools - # bam2bed that sicer calls). - input_bam="$(basename "{input.c_option}")" - cp {input.c_option} ${{tmp}} - echo "paired-end with input... 
${{tmp}}/${{input_bam}}" - sicer \\ - -t {input.chip} \\ - -c "${{tmp}}/${{input_bam}}" \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.sicer_dir} + if [ '{params.paired_end}' == True ]; then + MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) + mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) else - echo "paired-end without input" - sicer \\ - -t {input.chip} \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - - mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.sicer_dir} + mean_insert_size=$(awk '{{print $1}}' {input.fragLen}) fi - else - if [ -f "{input.c_option}" ]; then - echo "single-end with input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + echo "printing out value of mean-insert-size ${{mean_insert_size}}" + a={input.c_option} + echo "Printing input.c_option ${{a}}" + if [ '{params.paired_end}' == True ]; then + if [ -f "{input.c_option}" ]; then + # Copying input to tmpdir due to SICER2 + # bam2bed file conversion, if more than + # one sample shares the same IP sample + # than a race condition can occur where + # two jobs can concurrent try to write + # to the same BED file (during bedtools + # bam2bed that sicer calls). + input_bam="$(basename "{input.c_option}")" + cp {input.c_option} ${{tmp}} + echo "paired-end with input... 
${{tmp}}/${{input_bam}}" + sicer \\ + -t {input.chip} \\ + -c "${{tmp}}/${{input_bam}}" \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}.Q5DD-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "paired-end without input" + sicer \\ + -t {input.chip} \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} - cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; - - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -c ${{tmp}}/inputV2.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -fdr 1E-2 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; - mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.sicer_dir} + mv ${{tmp}}/{params.name}.Q5DD-W300-G600.scoreisland {params.this_sicer_dir} + fi else - echo "single-end without input" - cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; - awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; - sicer \\ - -t ${{tmp}}/{params.name}.bed \\ - -s {params.genomever} \\ - -rt 100 \\ - -w 300 \\ - -f ${{mean_insert_size}} \\ - -egf {params.frac} \\ - -g 600 \\ - -e 100 \\ - -cpu 30 \\ - -o ${{tmp}} - mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} + if [ -f "{input.c_option}" ]; then + echo "single-end with input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + 
+ cp {input.c_option} ${{tmp}}/input.bed.gz; gzip -d ${{tmp}}/input.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/input.bed > ${{tmp}}/inputV2.bed; + + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -c ${{tmp}}/inputV2.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -fdr 1E-2 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600-FDR0.01-island.bed {output.bed}; + mv ${{tmp}}/{params.name}-W300-G600-islands-summary {params.this_sicer_dir} + else + echo "single-end without input" + cp {input.chip} ${{tmp}}/chip.bed.gz; gzip -d ${{tmp}}/chip.bed.gz; + awk 'BEGIN{{FS=OFS="\\t"}} {{gsub(/\./, 0, $5)}} 1' ${{tmp}}/chip.bed > ${{tmp}}/{params.name}.bed; + sicer \\ + -t ${{tmp}}/{params.name}.bed \\ + -s {params.genomever} \\ + -rt 100 \\ + -w 300 \\ + -f ${{mean_insert_size}} \\ + -egf {params.frac} \\ + -g 600 \\ + -e 100 \\ + -cpu 30 \\ + -o ${{tmp}} + mv ${{tmp}}/{params.name}-W300-G600.scoreisland {output.bed} + fi fi - fi - """ + """ + rule MEME: input: - bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) + bed = lambda w: join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) output: - meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), - ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html") + meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), + ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html") params: - rname='MEME', - ref_fa=config['references'][genome]['GENOME'], - meme_vertebrates_db=config['references'][genome]['MEME_VERTEBRATES_DB'], - meme_euk_db=config['references'][genome]['MEME_EUKARYOTE_DB'], - meme_genome_db=config['references'][genome]['MEME_GENOME_DB'], - oc=join(MEME_dir, "{PeakTool}", "{name}"), - tmpdir=tmpdir, - outfa="{name}.fa", - ntasks=int(28) - shell: """ - module load meme - module load bedtools - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + rname = 'SICER', + rname = 'MEME', + ref_fa = config['references'][genome]['GENOME'], + meme_vertebrates_db = config['references'][genome]['MEME_VERTEBRATES_DB'], + meme_euk_db = config['references'][genome]['MEME_EUKARYOTE_DB'], + meme_genome_db = config['references'][genome]['MEME_GENOME_DB'], + oc = join(MEME_dir, "{PeakTool}", "{name}"), + outfa = "{name}.fa", + ntasks = int(28) + shell: + """ + module load meme + module load bedtools + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa} - meme-chip \\ - --oc {params.oc}_meme \\ - -db {params.meme_vertebrates_db} \\ - -meme-searchsize 34000000 \\ - -meme-p {params.ntasks} \\ - ${{tmp}}/{params.outfa} + bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa} + meme-chip \\ + --oc {params.oc}_meme \\ + -db {params.meme_vertebrates_db} \\ + -meme-searchsize 34000000 \\ + -meme-p {params.ntasks} \\ + ${{tmp}}/{params.outfa} - ame \\ - --oc {params.oc}_ame ${{tmp}}/{params.outfa} \\ - {params.meme_euk_db} {params.meme_vertebrates_db} {params.meme_genome_db} - """ \ No newline at end of file + ame \\ + --oc {params.oc}_ame ${{tmp}}/{params.outfa} \\ + {params.meme_euk_db} {params.meme_vertebrates_db} {params.meme_genome_db} + """ \ No newline at end of file diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 26861ef..48343f9 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -14,6 +14,7 @@ paired_end = False if config['project']['nends'] == 1 else samples = config['samples'] ends = [1] if not paired_end else [1, 2] + # ~~ directories qc_dir = join(workpath, "QC") kraken_dir = join(workpath, 'kraken') @@ -157,31 +158,32 @@ rule fastqc: config['tools']['FASTQCVER'] 
threads: int(allocated("threads", "fastqc", cluster)) - shell: """ - # Setups temporary directory for - # intermediate files with built-in - # mechanism for deletion on exit - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") - trap 'rm -rf "${{tmp}}"' EXIT + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT - # Running fastqc with local - # disk or a tmpdir, fastqc - # has been observed to lock - # up gpfs filesystems, adding - # this on request by HPC staff - fastqc \\ - {input} \\ - -t {threads} \\ - -o "${{tmp}}" - - # Copy output files from tmpdir - # to output directory - find "${{tmp}}" \\ - -type f \\ - \\( -name '*.html' -o -name '*.zip' \\) \\ - -exec cp {{}} {params.outdir} \\; - """ + # Running fastqc with local + # disk or a tmpdir, fastqc + # has been observed to lock + # up gpfs filesystems, adding + # this on request by HPC staff + fastqc \\ + {input} \\ + -t {threads} \\ + -o "${{tmp}}" + + # Copy output files from tmpdir + # to output directory + find "${{tmp}}" \\ + -type f \\ + \\( -name '*.html' -o -name '*.zip' \\) \\ + -exec cp {{}} {params.outdir} \\; + """ rule fastq_screen: """ @@ -232,6 +234,7 @@ rule fastq_screen: {input} """ + rule kraken: """ Quality-control step to assess for potential sources of microbial contamination. 
@@ -293,6 +296,7 @@ rule kraken: ktImportTaxonomy - -o {output.kronahtml} """ + rule multiqc: """ Reporting step to aggregate sample statistics and quality-control information @@ -321,16 +325,17 @@ rule multiqc: multiqc = config['tools']['MULTIQCVER'], qcconfig = join(workpath, config['shared_resources']['MULTIQC_CONFIG']), excludedir = join(workpath, extra_fingerprint_dir), - shell: """ - module load {params.multiqc} - multiqc \\ - -f \\ - -c {params.qcconfig} \\ - --interactive \\ - -e cutadapt \\ - --ignore {params.excludedir} \\ - -d """ + workpath + """ - """ + shell: + """ + module load {params.multiqc} + multiqc \\ + -f \\ + -c {params.qcconfig} \\ + --interactive \\ + -e cutadapt \\ + --ignore {params.excludedir} \\ + -d """ + workpath + """ + """ rule insert_size: @@ -362,9 +367,11 @@ rule insert_size: -H {output.pdf} """ + rule deeptools_QC: input: - [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] # this should be all bigwigs + # this should be all bigwigs + [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] output: javaram = '16g', heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), @@ -374,7 +381,8 @@ rule deeptools_QC: params: rname = "deeptools_QC", deeptoolsver = config['tools']['DEEPTOOLSVER'], - labels=samples # this should be the sample names to match the bigwigs in the same order + # this should be the sample names to match the bigwigs in the same order + labels = samples shell: """ module load {params.deeptoolsver} @@ -384,10 +392,11 @@ rule deeptools_QC: plotPCA -in {output.npz} -o {output.pca} """ + rule FRiP: input: - bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], - bam = join(bam_dir, "{name}.Q5DD.bam"), + bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], + bam = join(bam_dir, "{name}.Q5DD.bam"), output: join(workpath,"PeakQC","{PeakTool}.{name}.Q5DD.FRiP_table.txt"), params: 
@@ -414,6 +423,7 @@ rule FRiP: -o {params.outroot} """ + rule jaccard: input: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], @@ -432,4 +442,4 @@ rule jaccard: -i "{input}" \\ -o "{params.outroot}" \\ -g {params.genome} - """ + """ \ No newline at end of file diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 8eae5cd..1fe15a8 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -187,7 +187,8 @@ rule BWA: idxstat1 = join(bam_dir, "{name}.sorted.bam.idxstat"), flagstat2 = join(bam_dir, "{name}.Q5.bam.flagstat"), idxstat2 = join(bam_dir, "{name}.Q5.bam.idxstat"), - threads: 32 + threads: + 32 shell: """ module load {params.bwaver}; @@ -219,6 +220,7 @@ rule BWA: fi """ + rule dedup: """ Picard MarkDuplicates removes duplicates from bam file. @@ -311,6 +313,7 @@ rule dedup: fi """ + rule ppqt: input: bam = lambda w : join(bam_dir, w.name + "." + w.ext + "." + get_bam_ext(w.ext, paired_end)) @@ -325,7 +328,8 @@ rule ppqt: scriptPy = join(workpath, "bin", "ppqt_process.py"), tmpdir = tmpdir, file_name = "{name}" - container: config['images']['ppqt'] + container: + config['images']['ppqt'] shell: """ if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi @@ -355,6 +359,7 @@ rule ppqt: python {params.scriptPy} -i {output.ppqt} -o {output.txt} """ + rule bam2bw: """ bamCoverage converts bams to bigwig files for read visialization @@ -419,7 +424,7 @@ rule inputnorm: bigWig file of treatmment sample normalizes with its input control """ input: - bws = lambda w: ctrl_test(chip2input, w.name, bw_dir) + bws = lambda w: ctrl_test(chip2input, w.name, bw_dir) output: join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") params: @@ -441,4 +446,4 @@ rule inputnorm: --operation 'subtract' \\ --skipNonCoveredRegions \\ -p {threads} - """ + """ \ No newline at end of file diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py index e3b07e1..fb03a3c 100644 --- a/workflow/scripts/peakcall.py +++ b/workflow/scripts/peakcall.py @@ -102,4 +102,11 @@ def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end): fragLen = join(qc_dir, name + ".Q5DD.insert_size_metrics.txt") else: fragLen = join(ppqt_dir, name + ".Q5DD_tagAlign.ppqt.txt") - return fragLen \ No newline at end of file + return fragLen + + +def get_manorm_sizes(g1, g2, group_data, ppqt_in): + file = lambda w, _in: list(map(lambda z: z.strip().split(), open(ppqt_in, 'r').readlines())) + extsize1 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g1]][0] + extsize2 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g2]][0] + return f"--s1 {extsize1} --s2 {extsize2}" \ No newline at end of file From 65e3e3c2abd5f9ac069affb5dd1022f6b169d218 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 14:05:17 -0400 Subject: [PATCH 12/28] fix: reference global tmpdir and paired_end flag --- workflow/rules/dba.smk | 2 +- workflow/rules/peakcall.smk | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 80265a4..7878039 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -255,7 +255,7 @@ rule manorm: rname 
= 'manorm', fldr = join(manorm_dir, "{group1}_vs_{group2}-{tool}"), bedtoolsver = config['tools']['BEDTOOLSVER'], - manormver = "manorm/1.1.4" + manormver = "manorm/1.1.4", extsizes = lambda w, _in: get_manorm_sizes(w.group1, w.group2, groupdata, _in.ppqt) shell: """ diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index f7953ee..e81f5a4 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -102,7 +102,7 @@ rule MACS2_narrow: shell: """ module load {params.macsver}; - if [ '{params.paired_end}' == True ]; then + if [ '""" + str(paired_end) + """' == True ]; then macs2 callpeak \\ -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ @@ -141,7 +141,7 @@ rule MACS2_broad: shell: """ module load {params.macsver}; - if [ '{params.paired_end}' == True ]; then + if [ '""" + str(paired_end) + """' == True ]; then macs2 callpeak \\ -t {input.chip} {params.flag} {input.c_option} \\ -g {params.gsize} \\ @@ -187,11 +187,11 @@ rule SICER: """ module load {params.sicerver} module load {params.bedtoolsver} - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! 
-d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi + tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") trap 'rm -rf "${{tmp}}"' EXIT - if [ '{params.paired_end}' == True ]; then + if [ '""" + str(paired_end) + """' == True ]; then MEAN_INSERT_SIZE=$(cat {input.fragLen} | awk '/MEDIAN_INSERT_SIZE/{{f=1;next}} /## HISTOGRAM/{{f=0}} f' | cut -f 6) mean_insert_size=$(printf "%.0f" $MEAN_INSERT_SIZE) else @@ -200,7 +200,7 @@ rule SICER: echo "printing out value of mean-insert-size ${{mean_insert_size}}" a={input.c_option} echo "Printing input.c_option ${{a}}" - if [ '{params.paired_end}' == True ]; then + if [ '""" + str(paired_end) + """' == True ]; then if [ -f "{input.c_option}" ]; then # Copying input to tmpdir due to SICER2 # bam2bed file conversion, if more than @@ -294,7 +294,6 @@ rule MEME: meme_out = join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), ame_out = join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html") params: - rname = 'SICER', rname = 'MEME', ref_fa = config['references'][genome]['GENOME'], meme_vertebrates_db = config['references'][genome]['MEME_VERTEBRATES_DB'], @@ -307,8 +306,8 @@ rule MEME: """ module load meme module load bedtools - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! 
-d \"""" + str(tmpdir) + """\" ]; then mkdir -p \"""" + str(tmpdir) + """\"; fi + tmp=$(mktemp -d -p \"""" + str(tmpdir) + """\") trap 'rm -rf "${{tmp}}"' EXIT bedtools getfasta -fi {params.ref_fa} -bed {input.bed} -fo ${{tmp}}/{params.outfa} From f94688691ea060500b63139d8a87d2d86443f1b7 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 14:40:49 -0400 Subject: [PATCH 13/28] fix: fix rep switch --- workflow/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index e025c45..6f0e4e7 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -30,7 +30,7 @@ zipGroup1, zipGroup2, zipToolC, contrasts \ = zip_contrasts(contrast, PeakTools) file_stems, extRPGC, extaln = get_file_components(paired_end) groups = list(groupdatawinput.keys()) -reps = False if len(groupswreps) > 0 else True +reps = True if len(groupswreps) > 0 else False uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) sampleswinput = [ chip_value for input_id, chip_value in chip2input.items() \ From af5e79149a8ce71fe2d961b80455b64506e1a751 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jul 2024 15:42:17 -0400 Subject: [PATCH 14/28] chore: more spacing, fix reps flag reversal --- workflow/Snakefile | 106 ++++++++++++++++----------- workflow/rules/dba.smk | 12 ++-- workflow/rules/trim_align_dedup.smk | 108 +++++++++++++--------------- 3 files changed, 123 insertions(+), 103 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 6f0e4e7..4b9c2c7 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -38,6 +38,7 @@ sampleswinput = [ ] inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"] deepgroups, deepexts = group_output_files(extRPGC, groups, inputnorm) +UropaCats = ["protTSS", "prot", "protSEC", "genes"] # Directory end points bam_dir = join(workpath, "bam") @@ -54,6 +55,7 @@ diffbind_dir = join(workpath, "DiffBind") cfTool_dir = join(workpath, "cfChIPtool") 
genrich_dir = join(workpath, "Genrich") MEME_dir = join(workpath, "MEME") +manorm_dir = join(workpath, "MANorm") # Read in resource information with open(join('config', 'cluster.json')) as fh: @@ -87,8 +89,6 @@ if assay == "cfchip": join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools )) - - if reps: rule_all_ins.extend(expand( join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), @@ -100,47 +100,73 @@ if assay == "cfchip": name=contrasts, _type=["protTSS"] )) - elif assay in ["atac", "chip"]: - peak_types.extend(["prot", "protSEC", "genes"]) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) + # else: + # rule_all_ins.extend(expand( + # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + # PeakTool="MANorm", + # name=contrasts, + # _type=UropaCats + # )) + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) +elif assay in ["atac", "chip"]: + peak_types.extend(["prot", "protSEC", "genes"]) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) + rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) + if paired_end: + rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) + if assay == "chip": + rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) + rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) if paired_end: - 
rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) - if assay == "chip": - rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) - rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) - if paired_end: - short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)) - rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) - elif assay == "atac": - rule_all_ins.extend(expand( - join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips - )) - if reps: - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types - )) - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) + short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, 
"{name}.{ext}.pdf"), name=samples, ext=tag_ext)) + rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) + elif assay == "atac": + rule_all_ins.extend(expand( + join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips + )) + if reps: + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types + )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) + rule_all_ins.extend(expand( + join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], + name=contrasts, + _type=peak_types + )) + if contrast: rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], - name=contrasts, - _type=peak_types + join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools )) - if contrast: - rule_all_ins.extend(expand( - join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools - )) + # else: + # rule_all_ins.extend(expand( + # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + # PeakTool="MANorm", + # name=contrasts, + # _type=UropaCats + # )) + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) rule_all_ins.append(join(workpath,"multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 7878039..42f811e 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk 
@@ -242,15 +242,15 @@ rule manorm: input: bam1 = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), bam2 = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), - ppqt = join(bam_dir, "Q5DD.ppqt.txt"), + ppqt = join(ppqt_dir, "Q5DD.ppqt.txt"), peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), output: - xls = join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - bed = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}","{group1}_vs_{group2}-{tool}_all_MA.bed")), - wigA = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_A_values.wig.gz"), - wigM = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_M_values.wig.gz"), - wigP = join(manorm_dir, "{group1}_vs_{group2}-{tool}","output_tracks","{group1}_vs_{group2}_P_values.wig.gz"), + xls = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + bed = temp(join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MA.bed")), + wigA = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_A_values.wig.gz"), + wigM = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_M_values.wig.gz"), + wigP = join(manorm_dir, "{group1}_vs_{group2}-{tool}", "output_tracks", "{group1}_vs_{group2}_P_values.wig.gz"), params: rname = 'manorm', fldr = join(manorm_dir, "{group1}_vs_{group2}-{tool}"), diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 1fe15a8..5665956 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -60,8 +60,6 @@ rule trim: trailingquality = 10, javaram = "64g", sample = "{name}", - tmpdir = tmpdir, - 
paired_end = paired_end threads: 16 shell: @@ -70,11 +68,11 @@ rule trim: module load {params.bwaver}; module load {params.samtoolsver}; module load {params.picardver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT - if [ '{params.paired_end}' == True ]; then + if [ \"""" + str(paired_end) + """\" == True ]; then cutadapt \\ --pair-filter=any \\ --nextseq-trim=2 \\ @@ -241,11 +239,11 @@ rule dedup: input: bam2 = join(bam_dir,"{name}.Q5.bam") output: - out5 = join(workpath,bam_dir,"{name}.Q5DD.bam"), - out5f = join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), - out5i = join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), - out6 = provided(join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), paired_end), - out7 = dedup_out7(join(workpath,bam_dir,"{name}"), assay, paired_end) + out5 = join(bam_dir, "{name}.Q5DD.bam"), + out5f = join(bam_dir, "{name}.Q5DD.bam.flagstat"), + out5i = join(bam_dir, "{name}.Q5DD.bam.idxstat"), + out6 = provided(join(bam_dir, "{name}.bwa.Q5.duplic"), paired_end), + out7 = dedup_out7(join(bam_dir, "{name}"), assay, paired_end) params: rname = 'dedup', picardver = config['tools']['PICARDVER'], @@ -253,7 +251,6 @@ rule dedup: bedtoolsver = config['tools']['BEDTOOLSVER'], macsver = config['tools']['MACSVER'], gsize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - folder = join(workpath,bam_dir), genomefile = config['references'][genome]['REFLEN'], rver = config['tools']['RVER'], javaram = '16g', @@ -267,49 +264,49 @@ rule dedup: module load {params.bedtoolsver}; module load {params.macsver}; module load {params.rver}; - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! 
-d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT if [ "{assay}" == "cfchip" ];then - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ - -O {params.tmpBam} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; - samtools index {params.tmpBam}; - samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; - Rscript {params.rscript} {params.tmpBam} {output.out7}; - rm {params.tmpBam} {params.tmpBam}.bai; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input.bam2} \\ + -O {params.tmpBam} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true \\ + -METRICS_FILE {output.out6}; + samtools index {params.tmpBam}; + samtools view -b {params.tmpBam} chr{{1..22}} > {output.out5}; + Rscript {params.rscript} {params.tmpBam} {output.out7}; + rm {params.tmpBam} {params.tmpBam}.bai; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; elif [ '""" + str(paired_end) + """' == False ];then - macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; - awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; - awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; - bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; - bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5}; - gzip ${{tmp}}/TmpTagAlign3; - mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; - 
samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f} - samtools idxstats {output.out5} > {output.out5i} + macs2 filterdup -i {input} -g {params.gsize} --keep-dup="auto" -o ${{tmp}}/TmpTagAlign; + awk -F"\\t" -v OFS="\\t" '{{if ($2>0 && $3>0) {{print}}}}' ${{tmp}}/TmpTagAlign > ${{tmp}}/TmpTagAlign2; + awk -F"\\t" -v OFS="\\t" '{{print $1,1,$2}}' {params.genomefile} | sort -k1,1 -k2,2n > ${{tmp}}/GenomeFileBed; + bedtools intersect -wa -f 1.0 -a ${{tmp}}/TmpTagAlign2 -b ${{tmp}}/GenomeFileBed > ${{tmp}}/TmpTagAlign3; + bedtools bedtobam -i ${{tmp}}/TmpTagAlign3 -g {params.genomefile} | samtools sort -@4 -o {output.out5}; + gzip ${{tmp}}/TmpTagAlign3; + mv ${{tmp}}/TmpTagAlign3.gz {output.out7}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f} + samtools idxstats {output.out5} > {output.out5i} else - java -Xmx{params.javaram} \\ - -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ - -I {input.bam2} \\ - -O {output.out5} \\ - -TMP_DIR ${{tmp}} \\ - -VALIDATION_STRINGENCY SILENT \\ - -REMOVE_DUPLICATES true \\ - -METRICS_FILE {output.out6}; - samtools index {output.out5}; - samtools flagstat {output.out5} > {output.out5f}; - samtools idxstats {output.out5} > {output.out5i}; + java -Xmx{params.javaram} \\ + -jar $PICARDJARPATH/picard.jar MarkDuplicates \\ + -I {input.bam2} \\ + -O {output.out5} \\ + -TMP_DIR ${{tmp}} \\ + -VALIDATION_STRINGENCY SILENT \\ + -REMOVE_DUPLICATES true \\ + -METRICS_FILE {output.out6}; + samtools index {output.out5}; + samtools flagstat {output.out5} > {output.out5f}; + samtools idxstats {output.out5} > {output.out5i}; fi """ @@ -326,14 +323,13 @@ rule ppqt: samtoolsver = config['tools']['SAMTOOLSVER'], rver = config['tools']['RVER'], scriptPy = join(workpath, "bin", "ppqt_process.py"), - tmpdir = tmpdir, file_name = "{name}" container: config['images']['ppqt'] shell: """ - if [ ! 
-d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT if [ '""" + str(paired_end) + """' == True ]; then @@ -380,20 +376,18 @@ rule bam2bw: rname = "bam2bw", name = "{name}", effectivegenomesize = config['references'][genome]['EFFECTIVEGENOMESIZE'], - paired_end = paired_end, - tmpdir = tmpdir, threads: int(allocated("threads", "bam2bw", cluster)), envmodules: config['tools']['DEEPTOOLSVER'], shell: """ - if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi - tmp=$(mktemp -d -p "{params.tmpdir}") + if [ ! -d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi + tmp=$(mktemp -d -p """ + tmpdir + """) trap 'rm -rf "${{tmp}}"' EXIT bam_cov_option={input.ppqt} - if [ '{params.paired_end}' == False ]; then + if [ \"""" + str(paired_end) + """\" == False ]; then ppqt_len=$(awk '{{print $1}}' {input.ppqt}) bam_cov_option="-e ${{ppqt_len}}" else From 52cd4153fe916a2bc3eecf9a44320dc1986ac9e5 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 15 Jul 2024 14:10:52 -0400 Subject: [PATCH 15/28] chore: make bin files executable, fix execution issues from test data runs --- bin/DiffBind_v2_ChIPseq.Rmd | 0 bin/DiffBind_v2_ChIPseq_block.Rmd | 0 bin/DiffBind_v2_cfChIP_QC.Rmd | 0 bin/FRiP_plot.R | 0 bin/atac_nrf.py | 0 bin/bam_filter_by_mapq.py | 0 bin/frip.py | 0 bin/jaccard_score.py | 0 bin/ppqt_process.py | 0 bin/prep_diffbind.py | 0 bin/prep_diffbindQC.py | 0 config/containers.json | 4 ++-- src/run.py | 2 +- workflow/Snakefile | 3 ++- workflow/rules/dba.smk | 9 +++++++-- workflow/rules/qc.smk | 2 +- 16 files changed, 13 insertions(+), 7 deletions(-) mode change 100644 => 100755 bin/DiffBind_v2_ChIPseq.Rmd mode change 100644 => 100755 bin/DiffBind_v2_ChIPseq_block.Rmd mode change 100644 => 100755 bin/DiffBind_v2_cfChIP_QC.Rmd mode change 100644 => 
100755 bin/FRiP_plot.R mode change 100644 => 100755 bin/atac_nrf.py mode change 100644 => 100755 bin/bam_filter_by_mapq.py mode change 100644 => 100755 bin/frip.py mode change 100644 => 100755 bin/jaccard_score.py mode change 100644 => 100755 bin/ppqt_process.py mode change 100644 => 100755 bin/prep_diffbind.py mode change 100644 => 100755 bin/prep_diffbindQC.py diff --git a/bin/DiffBind_v2_ChIPseq.Rmd b/bin/DiffBind_v2_ChIPseq.Rmd old mode 100644 new mode 100755 diff --git a/bin/DiffBind_v2_ChIPseq_block.Rmd b/bin/DiffBind_v2_ChIPseq_block.Rmd old mode 100644 new mode 100755 diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd old mode 100644 new mode 100755 diff --git a/bin/FRiP_plot.R b/bin/FRiP_plot.R old mode 100644 new mode 100755 diff --git a/bin/atac_nrf.py b/bin/atac_nrf.py old mode 100644 new mode 100755 diff --git a/bin/bam_filter_by_mapq.py b/bin/bam_filter_by_mapq.py old mode 100644 new mode 100755 diff --git a/bin/frip.py b/bin/frip.py old mode 100644 new mode 100755 diff --git a/bin/jaccard_score.py b/bin/jaccard_score.py old mode 100644 new mode 100755 diff --git a/bin/ppqt_process.py b/bin/ppqt_process.py old mode 100644 new mode 100755 diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py old mode 100644 new mode 100755 diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py old mode 100644 new mode 100755 diff --git a/config/containers.json b/config/containers.json index 5a296de..5329fa7 100644 --- a/config/containers.json +++ b/config/containers.json @@ -1,7 +1,7 @@ { "images": { - "cfchip": "docker://skchronicles/cfchip_toolkit_v0.5.0", - "python": "docker://asyakhleborodova/chrom_seek_python_v0.1.0", + "cfchip": "docker://skchronicles/cfchip_toolkit:v0.5.0", + "python": "docker://asyakhleborodova/chrom_seek_python:v0.1.0", "ppqt": "docker://asyakhleborodova/ppqt:v0.2.0" } } diff --git a/src/run.py b/src/run.py index 16be94e..019d39c 100644 --- a/src/run.py +++ b/src/run.py @@ -207,7 +207,7 @@ def setup(sub_args, 
ifiles, repo_path, output_path): # Add other runtime info for debugging config['project']['version'] = __version__ config['project']['workpath'] = os.path.abspath(sub_args.output) - config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], '..', 'bin')) + config['project']['binpath'] = os.path.abspath(os.path.join(config['project']['workpath'], 'bin')) git_hash = git_commit_hash(repo_path) config['project']['git_commit_hash'] = git_hash # Add latest git commit hash config['project']['pipeline_path'] = repo_path # Add path to installation diff --git a/workflow/Snakefile b/workflow/Snakefile index 4b9c2c7..23c5ae5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -122,11 +122,12 @@ elif assay in ["atac", "chip"]: if assay == "chip": rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) + short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] if paired_end: - short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=short_ext)) rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=short_ext)) + else: rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=tag_ext)) rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.pdf"), name=samples, ext=tag_ext)) rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt.txt"), name=samples, ext=tag_ext)) diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 42f811e..3c18590 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -164,10 +164,10 @@ rule UROPA: txt = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), bed1 = temp(join(uropa_dir, 
'{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), bed2 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_finalhits.bed')), + json = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'), params: rname = "uropa", fldr = join(uropa_dir, '{PeakTool1}'), - json = join(uropa_dir, '{PeakTool1}', '{name}.{PeakTool2}.{type}.json'), outroot = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}'), threads: 4, run: @@ -233,8 +233,13 @@ rule UROPA: this_q = base_query.copy() this_q['distance'] = _d json_construct['queries'].append(this_q) - with open('{params.json}', 'w') as jo: + + with open('{output.json}', 'w') as jo: json.dump(json_construct, jo, indent=4) + jo.close() + + if not os.path.exists('{output.json}'): + raise FileNotFoundError('{output.json} does not exist!') shell("uropa -i {params.json} -p {params.outroot} -t {threads} -s") diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 48343f9..d07ff71 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -74,7 +74,7 @@ rule NRF: samtoolsver = config['tools']['SAMTOOLSVER'], rver = config['tools']['RVER'], preseqver = config['tools']['PRESEQVER'], - nrfscript = join(workpath, "workflow", "scripts", "atac_nrf.py"), + nrfscript = join(bin_path, "atac_nrf.py"), threads: 16 shell: """ From b54ac886c185f5fea2e2a4cb0c78620f6d07293a Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 22 Jul 2024 16:11:35 -0400 Subject: [PATCH 16/28] fix: working out bugs discovered on AV --- bin/prep_diffbind.py | 8 ++++---- bin/prep_diffbindQC.py | 4 ++-- src/run.sh | 4 ++-- workflow/Snakefile | 2 +- workflow/rules/dba.smk | 28 ++++++++++++++-------------- workflow/rules/hooks.smk | 4 ++-- workflow/rules/qc.smk | 10 ++++++---- workflow/rules/trim_align_dedup.smk | 7 +++---- workflow/scripts/blocking.py | 16 +++++++++++----- workflow/scripts/grouping.py | 3 ++- 10 files changed, 47 insertions(+), 39 deletions(-) diff --git a/bin/prep_diffbind.py 
b/bin/prep_diffbind.py index 4ba7d64..fc96cd7 100755 --- a/bin/prep_diffbind.py +++ b/bin/prep_diffbind.py @@ -23,20 +23,20 @@ blocks = config['project']['blocks'] if None in list(blocks.values()): - samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", + samplesheet = [",".join(["SampleID", "Condition", "Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"])] else: - samplesheet = [",".join(["SampleID","Condition","Treatment","Replicate", "bamReads", + samplesheet = [",".join(["SampleID", "Condition", "Treatment", "Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"])] for condition in args.group1, args.group2: for chip in groupdata[condition]: replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" + bamReads = args.bamdir + "/" + chip + ".Q5DD.bam" controlID = chip2input[chip] if controlID != "": - bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" + bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam" else: bamControl = "" peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py index 550b5f9..59c19d3 100755 --- a/bin/prep_diffbindQC.py +++ b/bin/prep_diffbindQC.py @@ -36,10 +36,10 @@ condition = "" replicate = str(count) count = count +1 - bamReads = args.workpath + "/" + args.bamdir + "/" + chip + ".Q5DD.bam" + bamReads = args.bamdir + "/" + chip + ".Q5DD.bam" controlID = chip2input[chip] if controlID != "": - bamControl = args.workpath + "/" + args.bamdir + "/" + controlID + ".Q5DD.bam" + bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam" else: bamControl = "" peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension diff --git a/src/run.sh b/src/run.sh index 9cade43..83315ea 100755 --- a/src/run.sh +++ 
b/src/run.sh @@ -209,7 +209,6 @@ function submit(){ if [[ ${6#\'} != /lscratch* ]]; then CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out {cluster.ntasks} {cluster.ntasks_per_core} {cluster.exclusive}" fi - # Create sbacth script to build index cat << EOF > kickoff.sh #!/usr/bin/env bash #SBATCH --cpus-per-task=16 @@ -226,7 +225,8 @@ snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\ --use-singularity --singularity-args "'-B $4'" \\ --use-envmodules --configfile="$3/config.json" \\ --printshellcmds --cluster-config "$3/config/cluster.json" \\ - --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\ + --cluster "${CLUSTER_OPTS}" --keep-going -j 500 \\ + --keep-incomplete --restart-times 1 \\ --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\ --keep-remote --local-cores 14 2>&1 # Create summary report diff --git a/workflow/Snakefile b/workflow/Snakefile index 23c5ae5..91a4240 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -33,7 +33,7 @@ groups = list(groupdatawinput.keys()) reps = True if len(groupswreps) > 0 else False uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) sampleswinput = [ - chip_value for input_id, chip_value in chip2input.items() \ + chip_value for chip_value, input_id in chip2input.items() \ if chip_value != 'NA' and chip_value != '' ] inputnorm = [""] if len(sampleswinput) == 0 else ["", ".inputnorm"] diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 3c18590..2fbc15b 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -172,16 +172,15 @@ rule UROPA: threads: 4, run: # Dynamically creates UROPA config file - shell(f"module load {uropaver}") - if not os.path.exists("{params.fldr}"): - os.mkdir("{params.fldr}", mode=0o775) + if not os.path.exists(params.fldr): + 
os.mkdir(params.fldr, mode=0o775) json_construct = dict() json_construct['queries'] = [] json_construct['show_attributes'] = ["gene_id", "gene_name", "gene_type"] json_construct["priority"] = "Yes" json_construct['gtf'] = gtf - json_construct['bed'] = "{input}" + json_construct['bed'] = input[0] base_query = { "feature": "gene", @@ -191,19 +190,19 @@ rule UROPA: } if assay == 'cfchip': - if '{type}' == 'protTSS': + if wildcards.type == 'protTSS': for _d in (3000, 10000, 100000): this_q = base_query.copy() this_q['distance'] = _d json_construct['queries'].append(this_q) else: - if '{type}' == 'prot': + if wildcards.type == 'prot': for _d in (5000, 100000): this_q = base_query.copy() del this_q["feature.anchor"] this_q['distance'] = _d json_construct['queries'].append(this_q) - elif '{type}' == 'genes': + elif wildcards.type == 'genes': this_query = {} this_query['feature'] = 'gene' for _d in (5000, 100000): @@ -213,7 +212,7 @@ rule UROPA: del this_q["attribute.value"] this_q['distance'] = _d json_construct['queries'].append(this_q) - elif '{type}' == 'protSEC': + elif wildcards.type == 'protSEC': # distance, feature.anchor query_values = ( ([3000, 1000], "start"), @@ -226,21 +225,22 @@ rule UROPA: del this_q["feature.anchor"] if feature_anchor: this_q["feature.anchor"] = feature_anchor - this_q['distance'] = _d + this_q['distance'] = _distance json_construct['queries'].append(this_q) - elif '{type}' == 'protTSS': + elif wildcards.type == 'protTSS': for _d in ([3000, 1000], 10000, 100000): this_q = base_query.copy() this_q['distance'] = _d json_construct['queries'].append(this_q) - with open('{output.json}', 'w') as jo: + with open(output.json, 'w') as jo: json.dump(json_construct, jo, indent=4) jo.close() - if not os.path.exists('{output.json}'): - raise FileNotFoundError('{output.json} does not exist!') - shell("uropa -i {params.json} -p {params.outroot} -t {threads} -s") + if not os.path.exists(output.json): + raise FileNotFoundError(output.json + " does not 
exist!") + shell.prefix(f"module load {uropaver};") + shell("uropa -i " + output.json + " -p " + params.outroot + " -t " + str(threads) + " -s") rule manorm: diff --git a/workflow/rules/hooks.smk b/workflow/rules/hooks.smk index dfa4f47..1c8666f 100644 --- a/workflow/rules/hooks.smk +++ b/workflow/rules/hooks.smk @@ -31,7 +31,7 @@ if config['options']['mode'] == 'slurm': # previously submitted jobs sleep 15; rm -f COMPLETED FAILED RUNNING; timestamp=$(date +"%Y-%m-%d_%H-%M-%S"); - ./workflow/scripts/jobby \\ + ./bin/jobby \\ $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\ | awk '{{print $NF}}' \\ | sed "s/['.]//g" \\ @@ -68,7 +68,7 @@ if config['options']['mode'] == 'slurm': # previously submitted jobs sleep 15; rm -f COMPLETED FAILED RUNNING; timestamp=$(date +"%Y-%m-%d_%H-%M-%S"); - ./workflow/scripts/jobby \\ + ./bin/jobby \\ $(grep --color=never "^Submitted .* external jobid" logfiles/snakemake.log \\ | awk '{{print $NF}}' \\ | sed "s/['.]//g" \\ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index d07ff71..d97f6ea 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -370,8 +370,7 @@ rule insert_size: rule deeptools_QC: input: - # this should be all bigwigs - [ join(workpath, bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] + [ join(bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] output: javaram = '16g', heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), @@ -380,13 +379,16 @@ rule deeptools_QC: png = join(deeptools_dir, "spearman_heatmap.Q5DD_mqc.png") params: rname = "deeptools_QC", + parent_dir = deeptools_dir, deeptoolsver = config['tools']['DEEPTOOLSVER'], # this should be the sample names to match the bigwigs in the same order - labels = samples + labels = samples + threads: 24 shell: """ module load {params.deeptoolsver} - multiBigwigSummary bins -b {input} -l {params.labels} -out {output.npz} + if [ ! 
-d "{params.parent_dir}" ]; then mkdir "{params.parent_dir}"; fi + multiBigwigSummary bins -b {input} -p {threads} -l {params.labels} -out {output.npz} plotCorrelation -in {output.npz} -o {output.heatmap} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers plotCorrelation -in {output.npz} -o {output.png} -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers plotPCA -in {output.npz} -o {output.pca} diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 5665956..fa6cf00 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -418,20 +418,19 @@ rule inputnorm: bigWig file of treatmment sample normalizes with its input control """ input: - bws = lambda w: ctrl_test(chip2input, w.name, bw_dir) + chip = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'chip'), + ctrl = lambda wc: ctrl_test(chip2input, wc.name, bw_dir, 'ctrl') output: join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw") params: rname = "inputnorm", - bigwig_declare = lambda w, input: f"--bigwig1 {input.bws[0]}" if len(input.bws) == 1 \ - else f"--bigwig1 {input.bws[0]} --bigwig2 {input.bws[1]}" + bigwig_declare = lambda wc, input: f"--bigwig1 {input.chip} --bigwig2 {input.ctrl}", threads: int(allocated("threads", "inputnorm", cluster)), envmodules: config['tools']['DEEPTOOLSVER'], shell: """ - echo {input} bigwigCompare \\ --binSize 25 \\ --outFileName {output} \\ diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py index 9f3febb..270376b 100644 --- a/workflow/scripts/blocking.py +++ b/workflow/scripts/blocking.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 +import os from os.path import join +from collections import defaultdict # ~~~ Common helper functions for blocking or controls - - def test_for_block(groupdata, contrast, blocks): """ only want to run blocking on contrasts where all individuals are on both sides of the contrast """ @@ -20,9 +20,15 @@ def test_for_block(groupdata, contrast, blocks): return 
contrastBlock -def ctrl_test(ctrl_dict, input_name, in_dir): +def ctrl_test(ctrl_dict, input_name, in_dir, mode=None): sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") + assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.' + # assert os.path.exists(sample), f'{sample} sample does not exist!' + if input_name in ctrl_dict: norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") - return [sample, norm] - return [sample] \ No newline at end of file + # assert os.path.exists(norm), f'{norm} control does not exist!' + else: + raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}') + outs = {'chip': sample, 'ctrl': norm} + return outs[mode] \ No newline at end of file diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index b8214cb..09c9fd4 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -106,4 +106,5 @@ def get_bam_input(bam_dir, wildcards, paired_end): bams.append(join(bam_dir, "{0}.Q5DD.bam".format(wildcards.name))) elif wildcards.ext == "sorted": bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name))) - return bams \ No newline at end of file + return bams + From 2f6aff81654684a00a78523db22b19f396e4ba19 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 29 Jul 2024 12:42:36 -0400 Subject: [PATCH 17/28] fix: testing corrections --- workflow/Snakefile | 111 +++++++++++++++------------- workflow/chrom-seek.code-workspace | 13 ++++ workflow/rules/peakcall.smk | 2 +- workflow/rules/qc.smk | 1 - workflow/rules/trim_align_dedup.smk | 1 + 5 files changed, 74 insertions(+), 54 deletions(-) create mode 100644 workflow/chrom-seek.code-workspace diff --git a/workflow/Snakefile b/workflow/Snakefile index 91a4240..b2c7c42 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,6 +21,7 @@ paired_end = False if config['project']['nends'] == 1 else chips = config['project']['peaks']['chips'] contrast = config['project']['contrast'] chip2input = 
config['project']['peaks']['inputs'] +has_inputs = False if set(chip2input.values()) in ({''}, {None}) else True groupdata = config['project']['groups'] peak_types = config['options']['peak_type_base'] rule_all_ins = [] @@ -100,28 +101,32 @@ if assay == "cfchip": name=contrasts, _type=["protTSS"] )) - # else: - # rule_all_ins.extend(expand( - # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), - # PeakTool="MANorm", - # name=contrasts, - # _type=UropaCats - # )) - # rule_all_ins.extend(expand( - # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - # group1=zipGroup1, - # group2=zipGroup2, - # tool=zipToolC - # )) + else: + rule_all_ins.extend(expand( + join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + PeakTool="MANorm", + name=contrasts, + _type=UropaCats + )) + rule_all_ins.extend(expand( + join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + group1=zipGroup1, + group2=zipGroup2, + tool=zipToolC + )) elif assay in ["atac", "chip"]: peak_types.extend(["prot", "protSEC", "genes"]) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) - rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) + # meme outputs turned off for now + # if has_inputs: + # rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) + # rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_ame", "ame.html"), PeakTool=PeakTools, name=chips)) if paired_end: rule_all_ins.extend(expand(join(qc_dir, "{name}.{stem}.insert_size_metrics.txt"), name=samples, stem=file_stems)) if assay == "chip": rule_all_ins.extend(expand(join(macsB_dir, "{name}", "{name}_peaks.broadPeak"), name=chips)) - rule_all_ins.extend(expand(join(sicer_dir, "{name}", 
"{name}_broadpeaks.bed"), name=chips)) + # sicer outputs turned off for now + # if has_inputs: + # rule_all_ins.extend(expand(join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), name=chips)) short_ext, tag_ext = ["sorted", "Q5DD"], ["sorted", "Q5DD_tagAlign"] if paired_end: rule_all_ins.extend(expand(join(ppqt_dir, "{name}.{ext}.ppqt"), name=samples, ext=short_ext)) @@ -136,51 +141,53 @@ elif assay in ["atac", "chip"]: join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips )) if reps: - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types - )) - rule_all_ins.extend(expand( - join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], - name=contrasts, - _type=peak_types - )) + if has_inputs: + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types + )) + rule_all_ins.extend(expand( + join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], + name=contrasts, + _type=peak_types + )) if contrast: rule_all_ins.extend(expand( join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools )) - # else: - # rule_all_ins.extend(expand( - # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), - # PeakTool="MANorm", - # name=contrasts, - # _type=UropaCats - # )) - # rule_all_ins.extend(expand( - 
# join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - # group1=zipGroup1, - # group2=zipGroup2, - # tool=zipToolC - # )) - -rule_all_ins.append(join(workpath,"multiqc_report.html")) + else: + rule_all_ins.extend(expand( + join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + PeakTool="MANorm", + name=contrasts, + _type=UropaCats + )) + rule_all_ins.extend(expand( + join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + group1=zipGroup1, + group2=zipGroup2, + tool=zipToolC + )) +rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) -rule_all_ins.extend( - expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) -) +if has_inputs: + rule_all_ins.extend( + expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) + ) + rule_all_ins.extend( + expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) + ) rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln)) rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips)) -rule_all_ins.extend( - expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) -) rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"])) -rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput)) +if has_inputs: + rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput)) rule all: input: diff --git a/workflow/chrom-seek.code-workspace b/workflow/chrom-seek.code-workspace new file mode 100644 index 0000000..ba2accd --- /dev/null +++ b/workflow/chrom-seek.code-workspace @@ -0,0 
+1,13 @@ +{ + "folders": [ + { + "path": ".." + }, + { + "path": "../../../../../../data/OpenOmics/dev/datasets" + } + ], + "settings": { + "r.lsp.promptToInstall": false + } +} \ No newline at end of file diff --git a/workflow/rules/peakcall.smk b/workflow/rules/peakcall.smk index e81f5a4..fd8a520 100644 --- a/workflow/rules/peakcall.smk +++ b/workflow/rules/peakcall.smk @@ -173,7 +173,7 @@ rule SICER: fragLen = lambda w: getSicerFragLen(ppqt_dir, qc_dir, w.name, paired_end), c_option = lambda w: get_control_input(chip2input[w.name], paired_end, bam_dir), output: - bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed"), + bed = join(sicer_dir, "{name}", "{name}_broadpeaks.bed") if has_inputs else [], params: rname = 'SICER', name = "{name}", diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index d97f6ea..61b02ea 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -372,7 +372,6 @@ rule deeptools_QC: input: [ join(bw_dir, name + ".Q5DD.RPGC.bw") for name in samples ] output: - javaram = '16g', heatmap = join(deeptools_dir, "spearman_heatmap.Q5DD.pdf"), pca = join(deeptools_dir, "pca.Q5DD.pdf"), npz = temp(join(deeptools_dir, "Q5DD.npz")), diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index fa6cf00..0198ea5 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -13,6 +13,7 @@ genome = config['options']['genome'] paired_end = False if config['project']['nends'] == 1 else True ends = [1] if not paired_end else [1, 2] chip2input = config['project']['peaks']['inputs'] +has_inputs = False if set(chip2input.values()) == {''} else True # ~~ directories trim_dir = join(workpath, 'trim') From 710cae0ea2a78ddd631dbeb87d2feb8086be13a0 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 31 Jul 2024 14:33:23 -0400 Subject: [PATCH 18/28] fix: comment out manorm rules for now --- workflow/Snakefile | 26 ++++++++++++++------------ workflow/rules/dba.smk | 5 +++-- 
workflow/rules/trim_align_dedup.smk | 3 ++- workflow/scripts/peakcall.py | 2 ++ 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index b2c7c42..eaf2aef 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -108,12 +108,13 @@ if assay == "cfchip": name=contrasts, _type=UropaCats )) - rule_all_ins.extend(expand( - join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - group1=zipGroup1, - group2=zipGroup2, - tool=zipToolC - )) + # manorm commented turned off now + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) elif assay in ["atac", "chip"]: peak_types.extend(["prot", "protSEC", "genes"]) # meme outputs turned off for now @@ -168,12 +169,13 @@ elif assay in ["atac", "chip"]: name=contrasts, _type=UropaCats )) - rule_all_ins.extend(expand( - join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - group1=zipGroup1, - group2=zipGroup2, - tool=zipToolC - )) + # manorm commented turned off now + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) if has_inputs: diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index 2fbc15b..e85fc6e 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -26,6 +26,7 @@ diffbind_dir = join(workpath, "DiffBind") uropa_dir = join(workpath, "UROPA_annotations") uropa_diffbind_dir = join(uropa_dir, "DiffBind") bam_dir = join(workpath, "bam") +ppqt_dir = join(bam_dir, "ppqt") qc_dir = join(workpath, "PeakQC") idr_dir = join(workpath, "IDR") 
memechip_dir = join(workpath, "MEME") @@ -247,7 +248,7 @@ rule manorm: input: bam1 = lambda w: join(bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), bam2 = lambda w: join(bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), - ppqt = join(ppqt_dir, "Q5DD.ppqt.txt"), + # ppqt = join(ppqt_dir, "Q5DD.ppqt.txt"), # ppqt input into manorm TODO peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), output: @@ -261,7 +262,7 @@ rule manorm: fldr = join(manorm_dir, "{group1}_vs_{group2}-{tool}"), bedtoolsver = config['tools']['BEDTOOLSVER'], manormver = "manorm/1.1.4", - extsizes = lambda w, _in: get_manorm_sizes(w.group1, w.group2, groupdata, _in.ppqt) + extsizes = lambda w, input: get_manorm_sizes(w.group1, w.group2, groupdata, "") shell: """ if [ ! -e /lscratch/$SLURM_JOBID ]; then diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 0198ea5..0b8cc18 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -9,6 +9,7 @@ from scripts.blocking import ctrl_test # ~~ workflow configuration workpath = config['project']['workpath'] +bin_path = config['project']['binpath'] genome = config['options']['genome'] paired_end = False if config['project']['nends'] == 1 else True ends = [1] if not paired_end else [1, 2] @@ -323,7 +324,7 @@ rule ppqt: rname = "ppqt", samtoolsver = config['tools']['SAMTOOLSVER'], rver = config['tools']['RVER'], - scriptPy = join(workpath, "bin", "ppqt_process.py"), + scriptPy = join(bin_path, "ppqt_process.py"), file_name = "{name}" container: config['images']['ppqt'] diff --git a/workflow/scripts/peakcall.py b/workflow/scripts/peakcall.py index fb03a3c..0d86cff 100644 --- a/workflow/scripts/peakcall.py +++ b/workflow/scripts/peakcall.py @@ -106,6 +106,8 @@ def getSicerFragLen(ppqt_dir, qc_dir, name, paired_end): 
def get_manorm_sizes(g1, g2, group_data, ppqt_in): + if not ppqt_in: + return "" file = lambda w, _in: list(map(lambda z: z.strip().split(), open(ppqt_in, 'r').readlines())) extsize1 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g1]][0] extsize2 = [ppqt[1] for ppqt in file if ppqt[0] == group_data[g2]][0] From 53a7949eef583a3c1552df2718cb35bcde6aafeb Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 31 Jul 2024 14:37:50 -0400 Subject: [PATCH 19/28] fix: turn off sicer involved inputs in cfchip pipeline --- workflow/scripts/grouping.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index 09c9fd4..8d78899 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -70,7 +70,9 @@ def get_peaktools(assay_type): if assay_type == "atac": tools.append("Genrich") elif assay_type == "chip": - tools.extend(["macsBroad", "sicer"]) + tools.extend(["macsBroad"]) + # turn sicer off for now + # tools.extend(["macsBroad", "sicer"]) return tools From 1779a9175dc30197285a039844a57b2deda005c1 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 31 Jul 2024 15:47:27 -0400 Subject: [PATCH 20/28] fix: realign uropa, promotertable2, diffbind outputs inputs --- workflow/Snakefile | 44 +++++++++++++++++++-------------------- workflow/rules/cfChIP.smk | 3 ++- workflow/rules/dba.smk | 15 +++++++------ 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index eaf2aef..6f16496 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -101,6 +101,11 @@ if assay == "cfchip": name=contrasts, _type=["protTSS"] )) + if contrast: + rule_all_ins.extend(expand( + join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), + PeakTool=PeakTools + )) else: rule_all_ins.extend(expand( join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), @@ -108,13 +113,13 @@ if assay == 
"cfchip": name=contrasts, _type=UropaCats )) - # manorm commented turned off now - # rule_all_ins.extend(expand( - # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - # group1=zipGroup1, - # group2=zipGroup2, - # tool=zipToolC - # )) + rule_all_ins.extend(expand( + join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + group1=zipGroup1, + group2=zipGroup2, + tool=zipToolC + )) + elif assay in ["atac", "chip"]: peak_types.extend(["prot", "protSEC", "genes"]) # meme outputs turned off for now @@ -151,16 +156,12 @@ elif assay in ["atac", "chip"]: join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=PeakTools, name=chips, _type=peak_types )) - rule_all_ins.extend(expand( - join(uropa_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], - name=contrasts, - _type=peak_types - )) if contrast: rule_all_ins.extend(expand( - join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), - PeakTool=PeakTools + join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], + name=contrasts, + _type=["protTSS", "prot", "protSEC", "genes"], )) else: rule_all_ins.extend(expand( @@ -169,13 +170,12 @@ elif assay in ["atac", "chip"]: name=contrasts, _type=UropaCats )) - # manorm commented turned off now - # rule_all_ins.extend(expand( - # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - # group1=zipGroup1, - # group2=zipGroup2, - # tool=zipToolC - # )) + rule_all_ins.extend(expand( + join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + group1=zipGroup1, + group2=zipGroup2, + tool=zipToolC + )) rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), 
name=samples)) if has_inputs: diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 665ef2f..334508a 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -16,6 +16,7 @@ bam_dir = join(workpath, "bam") cfTool_dir = join(workpath, "cfChIPtool") cfTool_subdir2 = join(cfTool_dir, "BED", "H3K4me3") qc_dir = join(workpath, "QC") +diffbind_dir = join(workpath, "DiffBind") rule cfChIPtool: @@ -87,7 +88,7 @@ rule promoterTable2: input: expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), output: - txt = join(workpath,uropa_dir,"promoterTable2",'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), + txt = join(uropa_dir, "promoterTable2", 'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), params: rname = "promoter2", script1 = join(bin_path, "promoterAnnotation_by_Gene.R"), diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index e85fc6e..e1865ac 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -31,7 +31,6 @@ qc_dir = join(workpath, "PeakQC") idr_dir = join(workpath, "IDR") memechip_dir = join(workpath, "MEME") homer_dir = join(workpath, "HOMER_motifs") -uropa_dir = join(workpath, "UROPA_annotations") manorm_dir = join(workpath, "MANorm") downstream_dir = join(workpath, "Downstream") otherDirs = [qc_dir, homer_dir, uropa_dir] @@ -96,13 +95,13 @@ rule diffbind: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: html = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - Deseq2 = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), - EdgeR = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), - EdgeR_txt = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", 
"{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), - Deseq2_txt = join(diffbind_dir, "DiffbindDeseq2", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), - EdgeR_ftxt = join(diffbind_dir, "DiffbindEdgeR", "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), + Deseq2 = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.bed"), + EdgeR = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.bed"), + EdgeR_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR.txt"), + Deseq2_txt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2.txt"), + EdgeR_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_EdgeR_fullList.txt"), Deseq2_ftxt = join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_Deseq2_fullList.txt"), - html_block = provided(join(diffbind_dir_block, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) + html_block = provided(join(uropa_diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind_blocking.html"), blocking) params: # variables and wildcards used in the shell directive rname = "diffbind", @@ -160,7 +159,7 @@ rule diffbind: rule UROPA: input: - lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ], + lambda w: join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]), output: txt = join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.txt'), bed1 = temp(join(uropa_dir, '{PeakTool1}', '{name}_{PeakTool2}_uropa_{type}_allhits.bed')), From 06fcb3c64a33c70a9018d3391a441332cbcdc7f9 Mon Sep 17 00:00:00 2001 From: Ryan Routsong 
Date: Wed, 31 Jul 2024 18:04:13 -0400 Subject: [PATCH 21/28] fix: realign promotertable cfchip inputs --- workflow/Snakefile | 4 ++-- workflow/rules/cfChIP.smk | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 6f16496..3492aaf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -96,8 +96,8 @@ if assay == "cfchip": group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) rule_all_ins.extend(expand( - join(uropa_diffbind_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=['DiffbindEdgeR','DiffbindDeseq2'], + join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], name=contrasts, _type=["protTSS"] )) diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 334508a..7465170 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -86,7 +86,7 @@ rule promoterTable1: rule promoterTable2: input: - expand(join(diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), + expand(join(uropa_diffbind_dir, '{name}_DiffbindDeseq2_uropa_protTSS_allhits.txt'), name=contrasts), output: txt = join(uropa_dir, "promoterTable2", 'DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt'), params: From 5b9ff2c7e762d9fb4fd8797af350117b470954a4 Mon Sep 17 00:00:00 2001 From: Skyler Kuhn Date: Fri, 2 Aug 2024 10:52:36 -0400 Subject: [PATCH 22/28] Delete workflow/chrom-seek.code-workspace --- workflow/chrom-seek.code-workspace | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 workflow/chrom-seek.code-workspace diff --git a/workflow/chrom-seek.code-workspace b/workflow/chrom-seek.code-workspace deleted file mode 100644 index ba2accd..0000000 --- a/workflow/chrom-seek.code-workspace +++ /dev/null @@ -1,13 +0,0 @@ -{ - "folders": [ - { - "path": ".." 
- }, - { - "path": "../../../../../../data/OpenOmics/dev/datasets" - } - ], - "settings": { - "r.lsp.promptToInstall": false - } -} \ No newline at end of file From 991c620756e465c3dd35d1089d710e3c672c431a Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 2 Aug 2024 12:22:38 -0400 Subject: [PATCH 23/28] fix: minor review fixes, reverting some dev settings --- src/run.sh | 3 +-- workflow/rules/trim_align_dedup.smk | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/run.sh b/src/run.sh index 83315ea..3c91146 100755 --- a/src/run.sh +++ b/src/run.sh @@ -225,8 +225,7 @@ snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\ --use-singularity --singularity-args "'-B $4'" \\ --use-envmodules --configfile="$3/config.json" \\ --printshellcmds --cluster-config "$3/config/cluster.json" \\ - --cluster "${CLUSTER_OPTS}" --keep-going -j 500 \\ - --keep-incomplete --restart-times 1 \\ + --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \\ --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\ --keep-remote --local-cores 14 2>&1 # Create summary report diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 0b8cc18..52ab127 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -385,7 +385,7 @@ rule bam2bw: shell: """ if [ ! 
-d \"""" + tmpdir + """\" ]; then mkdir -p \"""" + tmpdir + """\"; fi - tmp=$(mktemp -d -p """ + tmpdir + """) + tmp=$(mktemp -d -p \"""" + tmpdir + """\") trap 'rm -rf "${{tmp}}"' EXIT bam_cov_option={input.ppqt} From a453a6b917b4ddbe132f4cc21ed30425fda8313b Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 2 Aug 2024 12:59:34 -0400 Subject: [PATCH 24/28] fix: comment out debug-dag from dryrun snakemake execution --- src/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/run.py b/src/run.py index 019d39c..9b66aab 100644 --- a/src/run.py +++ b/src/run.py @@ -611,7 +611,7 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna 'snakemake', '-npr', '-s', str(snakefile), '--verbose', - '--debug-dag', + # '--debug-dag', '--use-singularity', '--rerun-incomplete', '--cores', str(256), From 7cae04e25765559684105cb3855786e1ab2c06ff Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 6 Aug 2024 13:52:32 -0400 Subject: [PATCH 25/28] fix: reconfigure a few outputs based on test runs --- workflow/Snakefile | 101 ++++++++++++++++++++------------------ workflow/rules/cfChIP.smk | 4 +- workflow/rules/dba.smk | 11 ++--- workflow/rules/qc.smk | 1 + 4 files changed, 61 insertions(+), 56 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 3492aaf..0025ec5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -86,6 +86,10 @@ if assay == "cfchip": PeakTool="DiffBindQC", _type=peak_types )) + if has_inputs: + rule_all_ins.extend( + expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) + ) rule_all_ins.extend(expand( join(uropa_dir, "promoterTable1", "{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools @@ -95,33 +99,35 @@ if assay == "cfchip": join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC )) - 
rule_all_ins.extend(expand( - join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], - name=contrasts, - _type=["protTSS"] - )) + if contrast: + rule_all_ins.extend(expand( + join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=['DiffbindEdgeR', 'DiffbindDeseq2'], + name=contrasts, + _type=["protTSS"] + )) rule_all_ins.extend(expand( join(uropa_dir, "promoterTable2", "DiffbindDeseq2_{PeakTool}_promoter_overlap_summaryTable.txt"), PeakTool=PeakTools )) else: - rule_all_ins.extend(expand( - join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), - PeakTool="MANorm", - name=contrasts, - _type=UropaCats - )) - rule_all_ins.extend(expand( - join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - group1=zipGroup1, - group2=zipGroup2, - tool=zipToolC - )) + pass + # remove manorm for now + # rule_all_ins.extend(expand( + # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + # PeakTool="MANorm", + # name=contrasts, + # _type=UropaCats + # )) + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) elif assay in ["atac", "chip"]: - peak_types.extend(["prot", "protSEC", "genes"]) # meme outputs turned off for now # if has_inputs: # rule_all_ins.extend(expand(join(MEME_dir, "{PeakTool}", "{name}_meme", "meme-chip.html"), PeakTool=PeakTools, name=chips)) @@ -146,50 +152,49 @@ elif assay in ["atac", "chip"]: rule_all_ins.extend(expand( join(genrich_dir, "{name}", "{name}.narrowPeak"), name=chips )) + rule_all_ins.extend(expand( + join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), + PeakTool=PeakTools, name=chips, _type=peak_types + )) if reps: - if has_inputs: - rule_all_ins.extend(expand( - join(diffbind_dir, 
"{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC - )) - rule_all_ins.extend(expand( - join(uropa_dir, "{PeakTool}", "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), - PeakTool=PeakTools, name=chips, _type=peak_types - )) + rule_all_ins.extend(expand( + join(diffbind_dir, "{group1}_vs_{group2}-{PeakTool}", "{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), + group1=zipGroup1, group2=zipGroup2, PeakTool=zipToolC + )) if contrast: rule_all_ins.extend(expand( join(uropa_diffbind_dir, "{name}_{PeakTool}_uropa_{_type}_allhits.txt"), PeakTool=["DiffbindEdgeR", "DiffbindDeseq2"], name=contrasts, - _type=["protTSS", "prot", "protSEC", "genes"], + _type=["protTSS"], )) else: - rule_all_ins.extend(expand( - join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), - PeakTool="MANorm", - name=contrasts, - _type=UropaCats - )) - rule_all_ins.extend(expand( - join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), - group1=zipGroup1, - group2=zipGroup2, - tool=zipToolC - )) + pass + # manorm turned off now + # rule_all_ins.extend(expand( + # join(uropa_dir, '{PeakTool}', '{name}_{PeakTool}_uropa_{_type}_allhits.txt'), + # PeakTool="MANorm", + # name=contrasts, + # _type=UropaCats + # )) + # rule_all_ins.extend(expand( + # join(manorm_dir, "{group1}_vs_{group2}-{tool}", "{group1}_vs_{group2}-{tool}_all_MAvalues.xls"), + # group1=zipGroup1, + # group2=zipGroup2, + # tool=zipToolC + # )) rule_all_ins.append(join(workpath, "multiqc_report.html")) rule_all_ins.extend(expand(join(qc_dir, "{name}.preseq.dat"), name=samples)) -if has_inputs: - rule_all_ins.extend( - expand(join(qc_dir, "AllSamples-{PeakTool}", "AllSamples-{PeakTool}_DiffBindQC_TMMcounts.bed"), PeakTool=PeakTools) - ) - rule_all_ins.extend( - expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) - ) +rule_all_ins.extend( + 
expand(join(peakqc_dir, "{PeakTool}.{name}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, name=samples) +) rule_all_ins.extend(expand(join(bam_dir, "{name}.{ext}"), name=samples, ext=extaln)) rule_all_ins.extend(expand(join(macsN_dir, "{name}","{name}_peaks.narrowPeak"), name=chips)) rule_all_ins.extend(expand(join(bw_dir, "{name}.{ext}.RPGC.bw"), name=samples, ext=["sorted", "Q5DD"])) + if has_inputs: rule_all_ins.extend(expand(join(bw_dir, "{name}.Q5DD.RPGC.inputnorm.bw"), name=sampleswinput)) + rule all: input: diff --git a/workflow/rules/cfChIP.smk b/workflow/rules/cfChIP.smk index 7465170..7020fba 100644 --- a/workflow/rules/cfChIP.smk +++ b/workflow/rules/cfChIP.smk @@ -130,6 +130,6 @@ rule diffbindQC: --pc {params.peakcaller} --csv {params.csvfile} cp {params.rscript} {params.outdir} cd {params.outdir} - Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.contrast}", peakcaller= "{params.PeakTool}"))' + Rscript -e 'rmarkdown::render("DiffBind_v2_cfChIP_QC.Rmd", output_file= "{output.html}", \ + params=list(csvfile="{params.csvfile}", contrasts="{params.contrast}", peakcaller="{params.PeakTool}"))' """ \ No newline at end of file diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index e1865ac..b9aa984 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -18,8 +18,7 @@ contrast = config['project']['contrast'] uropaver = config['tools']['UROPAVER'] gtf = config['references'][genome]['GTFFILE'] -# ~~ directories -bin_path = join(workpath, "workflow", "bin") +# ~~ directoriesxw diffbind_dir_block = join(workpath, "DiffBindBlock") diffbind_dir2 = join(workpath, "DiffBind_block") diffbind_dir = join(workpath, "DiffBind") @@ -142,15 +141,15 @@ rule diffbind: --pc {params.peakcaller} --csv {params.csvfile} cp {params.rscript} {params.outdir} cd {params.outdir} - Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", 
- params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}"))' + Rscript -e 'rmarkdown::render("DiffBind_v2_ChIPseq.Rmd", output_file= "{output.html}", \ + params=list(csvfile="{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}"))' if [ ! -f {output.Deseq2} ]; then touch {output.Deseq2}; fi if [ ! -f {output.EdgeR} ]; then touch {output.EdgeR}; fi if [ '"""+str(blocking)+"""' == True ]; then echo "DiffBind with Blocking" - Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", - params=list(csvfile= "{params.csvfile}", contrasts= "{params.this_contrast}", peakcaller= "{params.this_peaktool}", dir= "{params.outdir_block}"))' + Rscript -e 'rmarkdown::render("{params.blocking_rscript}", output_file= "{output.html_block}", \ + params=list(csvfile= "{params.csvfile}", contrasts="{params.this_contrast}", peakcaller="{params.this_peaktool}", dir="{params.outdir_block}"))' if [ ! -f {params.Deseq2_block} ]; then touch {params.Deseq2_block}; fi if [ ! -f {params.EdgeR_block} ]; then touch {params.EdgeR_block}; fi fi diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 61b02ea..82b4376 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -185,6 +185,7 @@ rule fastqc: -exec cp {{}} {params.outdir} \\; """ + rule fastq_screen: """ Quality-control step to screen for different sources of contamination. 
From fd490e94bac91cab65f7ce1ac0aee6cc27e65116 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 6 Aug 2024 13:53:03 -0400 Subject: [PATCH 26/28] fix: refactor diffbind prep script, add error case exception for umap in diffbindqc rmd --- bin/DiffBind_v2_cfChIP_QC.Rmd | 16 +++++- bin/prep_diffbindQC.py | 105 +++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 50 deletions(-) diff --git a/bin/DiffBind_v2_cfChIP_QC.Rmd b/bin/DiffBind_v2_cfChIP_QC.Rmd index d058cec..6f6731b 100755 --- a/bin/DiffBind_v2_cfChIP_QC.Rmd +++ b/bin/DiffBind_v2_cfChIP_QC.Rmd @@ -146,7 +146,10 @@ try(dba.plotPCA(DBdataCounts),silent=TRUE) ```{r TMM} vec <- c("seqnames", "start", "end", "width", "strand", samples$samples$SampleID) consensus2 <- dba.peakset(DBdataCounts, bRetrieve=TRUE) %>% ##extracts TMM-normalized counts - as.data.frame() %>% setNames(vec) %>% arrange(start, end) %>% mutate(Peaks = paste0("Peak",1:nrow(.))) %>% + as.data.frame() %>% + setNames(vec) %>% + arrange(start, end) %>% + mutate(Peaks = paste0("Peak",1:nrow(.))) %>% dplyr::select(1:4, Peaks, samples$samples$SampleID) outfile1 <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_TMMcounts.csv") @@ -164,12 +167,19 @@ counts_TMM_ALL <- counts_TMM_ALL %>% dplyr::select(5:ncol(.)) %>% t() %>% log10() %>% as.data.frame(.) 
##UMAP coordinates set.seed(123) + if (nrow(samples$samples) < 16) { - umap_coord <- umap(counts_TMM_ALL, n_neighbors= nrow(samples$samples)-1) + neighbors=nrow(samples$samples)-1 + if (neighbors > 1) { + umap_coord <- umap(counts_TMM_ALL, n_neighbors=neighbors) + } else { + umap_coord <- umap(counts_TMM_ALL, n_neighbors=2) + } } else { umap_coord <- umap(counts_TMM_ALL) } -umap_coord <-as.data.frame(umap_coord$layout) %>% setNames(c("UMAP1", "UMAP2")) +umap_coord <- as.data.frame(umap_coord$layout) %>% + setNames(c("UMAP1", "UMAP2")) outfile <- paste0(contrasts, "-", peakcaller, "_DiffBindQC_UMAP.csv") write.csv(umap_coord, outfile, row.names = F) diff --git a/bin/prep_diffbindQC.py b/bin/prep_diffbindQC.py index 59c19d3..b01b7a5 100755 --- a/bin/prep_diffbindQC.py +++ b/bin/prep_diffbindQC.py @@ -2,50 +2,61 @@ import json import argparse - -parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') -parser.add_argument('--wp',dest='workpath',required=True,help='Full path of the working directory') -parser.add_argument('--pt',dest='peaktool',required=True,help='Name of the the peak calling tool, also the directory where the peak file will be located') -parser.add_argument('--pe',dest='peakextension',required=True,help='The file extension of the peakcall output') -parser.add_argument('--pc',dest='peakcaller',required=True,help='Value for the PeakCaller column of the DiffBind csv') -parser.add_argument('--bd',dest='bamdir',required=True,help='Name of the directory where the bam files are located') -parser.add_argument('--csv',dest='csvfile',required=True,help='Name of the output csv file') - -args = parser.parse_args() - -with open("config.json","r") as read_file: - config=json.load(read_file) - -chip2input = config['project']['peaks']['inputs'] -groupdata = config['project']['groups'] - -tmpIDs = [x for xs in groupdata.values() for x in xs] -Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)] - -samplesheet = 
[",".join(["SampleID","Condition", "Replicate", "bamReads", - "ControlID", "bamControl", "Peaks", "PeakCaller"])] - -count = 1 -for chip in chip2input.keys(): - if set(Ncounts) == {1}: # if all samples only in one group - for key in groupdata.keys(): - if chip in groupdata[key]: - condition = key - replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - else: - condition = "" - replicate = str(count) - count = count +1 - bamReads = args.bamdir + "/" + chip + ".Q5DD.bam" - controlID = chip2input[chip] - if controlID != "": - bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam" - else: - bamControl = "" - peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension - samplesheet.append(",".join([chip, condition, replicate, bamReads, - controlID, bamControl, peaks, args.peakcaller])) - -f = open(args.csvfile, 'w') -f.write ("\n".join(samplesheet)) -f.close() +import csv +from os.path import join + + +def main(args): + with open(join(args.workpath, "config.json"), "r") as read_file: + config=json.load(read_file) + + chip2input = config['project']['peaks']['inputs'] + groupdata = config['project']['groups'] + + tmpIDs = [x for xs in groupdata.values() for x in xs] + Ncounts = [tmpIDs.count(tmp) for tmp in set(tmpIDs)] + + with open(args.csvfile, 'w') as csvfile: + columns = ["SampleID","Condition", "Replicate", "bamReads", + "ControlID", "bamControl", "Peaks", "PeakCaller"] + writer = csv.DictWriter(csvfile, fieldnames=columns) + writer.writeheader() + + count = 1 + for chip in chip2input.keys(): + if set(Ncounts) == {1}: # if all samples only in one group + for key in groupdata.keys(): + if chip in groupdata[key]: + condition = key + replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) + else: + condition = "" + replicate = str(count) + count = count +1 + bamReads = args.bamdir + "/" + chip + ".Q5DD.bam" + controlID = 
chip2input[chip] + if controlID != "": + bamControl = args.bamdir + "/" + controlID + ".Q5DD.bam" + else: + bamControl = "" + peaks = args.workpath + "/" + args.peaktool + "/" + chip + "/" + chip + args.peakextension + row_values = [chip, condition, replicate, bamReads, controlID, bamControl, peaks, args.peakcaller] + writer.writerow(dict(zip(columns, row_values))) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Script to prepare the DiffBind input csv') + parser.add_argument('--wp', dest='workpath', required=True, + help='Full path of the working directory') + parser.add_argument('--pt', dest='peaktool', required=True, + help='Name of the the peak calling tool, also the directory where the peak file will be located') + parser.add_argument('--pe', dest='peakextension', required=True, + help='The file extension of the peakcall output') + parser.add_argument('--pc', dest='peakcaller', required=True, + help='Value for the PeakCaller column of the DiffBind csv') + parser.add_argument('--bd', dest='bamdir', required=True, + help='Name of the directory where the bam files are located') + parser.add_argument('--csv', dest='csvfile', required=True, + help='Name of the output csv file') + + main(parser.parse_args()) From 73076f82db10a2a131ded571a6ac3e200945cfd0 Mon Sep 17 00:00:00 2001 From: Tovah Markowitz Date: Wed, 7 Aug 2024 10:24:53 -0400 Subject: [PATCH 27/28] Update prep_diffbind.py Adding ending carriage return to output files --- bin/prep_diffbind.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/prep_diffbind.py b/bin/prep_diffbind.py index fc96cd7..3711c68 100755 --- a/bin/prep_diffbind.py +++ b/bin/prep_diffbind.py @@ -51,4 +51,5 @@ f = open(args.csvfile, 'w') f.write ("\n".join(samplesheet)) +f.write ("\n") f.close() From 3aef261863ff90d4635fce6d265270314dce0a6f Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 7 Aug 2024 12:00:31 -0400 Subject: [PATCH 28/28] fix: add blocking/control functions to grouping header --- 
workflow/rules/dba.smk | 2 +- workflow/rules/trim_align_dedup.smk | 4 ++-- workflow/scripts/blocking.py | 34 ----------------------------- workflow/scripts/grouping.py | 26 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 37 deletions(-) delete mode 100644 workflow/scripts/blocking.py diff --git a/workflow/rules/dba.smk b/workflow/rules/dba.smk index b9aa984..c6f0b5e 100644 --- a/workflow/rules/dba.smk +++ b/workflow/rules/dba.smk @@ -5,7 +5,7 @@ import json from os.path import join from scripts.common import allocated, mk_dir_if_not_exist from scripts.peakcall import outputIDR, zip_peak_files, calc_effective_genome_fraction, get_manorm_sizes -from scripts.blocking import test_for_block +from scripts.grouping import test_for_block # ~~ workflow configuration diff --git a/workflow/rules/trim_align_dedup.smk b/workflow/rules/trim_align_dedup.smk index 52ab127..0642ce0 100644 --- a/workflow/rules/trim_align_dedup.smk +++ b/workflow/rules/trim_align_dedup.smk @@ -4,8 +4,8 @@ import snakemake from os.path import join from scripts.common import allocated, get_bam_ext -from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input -from scripts.blocking import ctrl_test +from scripts.grouping import dedup_out7, get_bam_input, get_ppqt_input, \ + ctrl_test # ~~ workflow configuration workpath = config['project']['workpath'] diff --git a/workflow/scripts/blocking.py b/workflow/scripts/blocking.py deleted file mode 100644 index 270376b..0000000 --- a/workflow/scripts/blocking.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -import os -from os.path import join -from collections import defaultdict - - -# ~~~ Common helper functions for blocking or controls -def test_for_block(groupdata, contrast, blocks): - """ only want to run blocking on contrasts where all - individuals are on both sides of the contrast """ - contrastBlock = [ ] - for con in contrast: - group1 = con[0] - group2 = con[1] - block1 = [ blocks[sample] for sample in groupdata[group1] ] 
- block2 = [ blocks[sample] for sample in groupdata[group2] ] - if len(block1) == len(block2): - if len(set(block1).intersection(block2)) == len(block1): - contrastBlock.append(con) - return contrastBlock - - -def ctrl_test(ctrl_dict, input_name, in_dir, mode=None): - sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") - assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.' - # assert os.path.exists(sample), f'{sample} sample does not exist!' - - if input_name in ctrl_dict: - norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") - # assert os.path.exists(norm), f'{norm} control does not exist!' - else: - raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}') - outs = {'chip': sample, 'ctrl': norm} - return outs[mode] \ No newline at end of file diff --git a/workflow/scripts/grouping.py b/workflow/scripts/grouping.py index 8d78899..ee02b3e 100644 --- a/workflow/scripts/grouping.py +++ b/workflow/scripts/grouping.py @@ -110,3 +110,29 @@ def get_bam_input(bam_dir, wildcards, paired_end): bams.append(join(bam_dir, "{0}.sorted.bam".format(wildcards.name))) return bams + +def test_for_block(groupdata, contrast, blocks): + """ only want to run blocking on contrasts where all + individuals are on both sides of the contrast """ + contrastBlock = [ ] + for con in contrast: + group1 = con[0] + group2 = con[1] + block1 = [ blocks[sample] for sample in groupdata[group1] ] + block2 = [ blocks[sample] for sample in groupdata[group2] ] + if len(block1) == len(block2): + if len(set(block1).intersection(block2)) == len(block1): + contrastBlock.append(con) + return contrastBlock + + +def ctrl_test(ctrl_dict, input_name, in_dir, mode=None): + sample = join(in_dir, f"{input_name}.Q5DD.RPGC.bw") + assert mode in ('chip', 'ctrl'), 'Unrecognized input file mode.' 
+ + if input_name in ctrl_dict: + norm = join(in_dir, ctrl_dict[input_name] + ".Q5DD.RPGC.bw") + else: + raise ValueError(f'ChIP sample {input_name} missing from input lookup: \n{str(ctrl_dict)}') + outs = {'chip': sample, 'ctrl': norm} + return outs[mode] \ No newline at end of file